In [14]:
import os
import time

import gradio as gr
import pandas as pd
import requests
from langchain_google_genai import ChatGoogleGenerativeAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# MyAnimeList API configuration.
# The client ID is read from the MAL_CLIENT_ID environment variable so the real
# credential never has to be saved in the notebook; the placeholder is only a
# fallback and must be replaced (or the variable set) before any request works.
CLIENT_ID = os.environ.get("MAL_CLIENT_ID", "MY_API_KEY")
API_URL = "https://api.myanimelist.net/v2/anime"
# Comma-separated list of fields requested for every anime in the ranking.
Fields = "id,title,alternative_titles,synopsis,genres,mean,rank,popularity,media_type,status"

In [5]:
def fetch_anime_list(offset=0, timeout=30):
    """Fetch one page (up to 100 entries) of the MAL "all" ranking.

    Args:
        offset: Zero-based position in the ranking at which the page starts.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The list stored under the response's "data" key, or an empty list when
        the request fails, the status is not 200, or the body is not valid JSON.
        Failures are printed rather than raised so a long crawl keeps going.
    """
    headers = {"X-MAL-Client-ID": CLIENT_ID}
    params = {"ranking_type": "all", "limit": 100, "offset": offset, "fields": Fields}
    try:
        response = requests.get(
            f"{API_URL}/ranking", headers=headers, params=params, timeout=timeout
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "report and skip" path
        # as a non-200 status below.
        print(f"Request failed at offset {offset}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return []

    try:
        payload = response.json()
    except ValueError as exc:
        print(f"Invalid JSON at offset {offset}: {exc}")
        return []
    return payload.get("data", [])

In [6]:
def _flatten_alternative_titles(raw):
    """Collapse an ``alternative_titles`` object into one comma-separated string.

    String values are taken as-is and list values are expanded; empty entries
    and duplicates are dropped. Anything that is not a dict yields "".
    """
    if not isinstance(raw, dict):
        return ""
    names = []
    for value in raw.values():
        candidates = value if isinstance(value, (list, tuple)) else [value]
        for name in candidates:
            if isinstance(name, str) and name and name not in names:
                names.append(name)
    return ", ".join(names)


def preprocess_data(anime_list, existing_df=None):
    """Convert one page of MAL ranking results into rows of a DataFrame.

    Args:
        anime_list: Iterable of ranking entries, each a dict with a "node" dict
            holding the anime's fields.
        existing_df: Previously accumulated rows, or None for the first page.

    Returns:
        A DataFrame with columns id, title, alternative_titles, synopsis,
        genres, mean, media_type and status. When ``existing_df`` is given the
        new rows are appended to it (index reset); when the page is empty,
        ``existing_df`` is returned unchanged.
    """
    columns = [
        "id", "title", "alternative_titles", "synopsis",
        "genres", "mean", "media_type", "status",
    ]
    records = []
    for anime in anime_list:
        node = anime["node"]
        records.append({
            "id": node["id"],  # Unique MAL ID
            "title": node["title"],  # Main title
            # Previously this copied the main title; use the real alternative titles.
            "alternative_titles": _flatten_alternative_titles(node.get("alternative_titles")),
            "synopsis": node.get("synopsis", ""),  # Empty string when absent
            "genres": ", ".join(g["name"] for g in (node.get("genres") or [])),
            "mean": node.get("mean", 0),  # Average score, 0 when absent
            "media_type": node.get("media_type", "unknown"),
            "status": node.get("status", "unknown"),
        })

    # Passing the column list keeps the schema stable even for an empty page.
    new_data = pd.DataFrame(records, columns=columns)

    if existing_df is None:
        return new_data
    if not records:
        # Nothing to append; avoid concatenating an empty frame.
        return existing_df
    return pd.concat([existing_df, new_data], ignore_index=True)

In [7]:
# Crawl the MAL ranking page by page and accumulate the rows in `df`.
# Offsets advance in steps of 100, matching the page size requested by
# fetch_anime_list. The upper bound 21525 is presumably the number of ranked
# entries at the time of writing — TODO confirm before re-running.
# `df` is reassigned after every page, so interrupting the loop keeps the rows
# collected so far.
df = None 
for i in range(0, 21525, 100):
    anime_list = fetch_anime_list(i)
    df = preprocess_data(anime_list,df)

KeyboardInterrupt: 

In [136]:
# Save the current contents of `df` to animelist2.csv without the index column.
df.to_csv('animelist2.csv', index=False)  

In [None]:
# Configure Gemini API
# The key is read from the GEMINI_API_KEY environment variable so the real
# credential is never stored in the notebook; the placeholder is only a fallback.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "MY_API_KEY")
GEMINI_MODEL = "gemini-1.5-flash"
# Shared chat client used by extract_base_title_with_gemini.
llm = ChatGoogleGenerativeAI(google_api_key=GEMINI_API_KEY, model=GEMINI_MODEL, temperature=0.3)

def extract_base_title_with_gemini(title):
    """Ask Gemini for the base title of an anime entry.

    The prompt asks the model to drop season numbers and markers such as
    'Movie', 'Special' or 'OVA'.

    Args:
        title: The title to simplify.

    Returns:
        The model's answer with surrounding whitespace removed. The original
        ``title`` is returned unchanged when the call fails or when the answer
        is empty or not plain text, so a bad response never blanks a title.
    """
    prompt = (f"Simplify the anime title: '{title}'. Remove season numbers, 'Movie', 'Special', 'OVA', and any similar identifiers. Return only the base title.")
    try:
        response = llm.invoke(prompt)
        content = response.content
    except Exception as e:
        # Best effort: report the problem and keep the title as it was.
        print(f"Error with Gemini API: {e}")
        return title

    if not isinstance(content, str):
        return title
    # Trailing newlines or spaces would otherwise end up in the stored titles
    # and break exact-title grouping and lookups later on.
    cleaned = content.strip()
    return cleaned if cleaned else title

In [None]:
# Replace every title in `df` with its Gemini-simplified base title, one API
# call per row.
# There are two ways to run this: use a premium Gemini key, or brute-force it
# by processing the rows in chunks of 97 (presumably by editing the bounds of
# the df.iloc[...] slice for each chunk — as written, 0: covers every row).
for index, row in df.iloc[0:].iterrows():
    # Optional throttle between requests (currently disabled):
    #time.sleep(2)
    df.at[index, "title"] = extract_base_title_with_gemini(row["title"])


In [16]:
# Save the current contents of `df` to animelist1.csv without the index column.
df.to_csv('animelist1.csv', index=False)  

In [17]:
# Load the anime table from disk.
# NOTE(review): this reads "animelist.csv", while the cells in this notebook
# write "animelist1.csv" and "animelist2.csv" — confirm which file is intended.
# NOTE(review): pandas writes CSV as UTF-8 by default; if this file was produced
# by to_csv, reading it as latin1 would garble non-ASCII titles — verify encoding.
df1 = pd.read_csv("animelist.csv", encoding="latin1")
# Rebuild the frame row by row, keeping only the eight columns listed below.
# row.get(...) defaults apply only when the column is absent from the CSV;
# cells that are present but empty (NaN) are copied through unchanged.
df = pd.DataFrame([
    {
        "id": row["id"],  # Required column (KeyError if the CSV lacks it)
        "title": row["title"],  # Required column
        "alternative_titles": row["alternative_titles"],  # Required column; no default is applied
        "synopsis": row.get("synopsis", ""),  # Empty string if the column is absent
        "genres": row["genres"],  # Required column; copied as-is (no separator conversion here)
        "mean": row.get("mean", 0),  # 0 if the column is absent
        "media_type": row.get("media_type", "unknown"),  # "unknown" if the column is absent
        "status": row.get("status", "unknown")  # "unknown" if the column is absent
    }
    for _, row in df1.iterrows()
])


In [127]:
def combine_copies(new_data):
    """Merge rows that share a title into a single row per title.

    Args:
        new_data: DataFrame with columns title, id, alternative_titles,
            synopsis, genres, mean, media_type and status. It is not modified.

    Returns:
        A new DataFrame with one row per distinct title where:
        - id: unique ids joined with ", " (as strings),
        - alternative_titles / synopsis: unique non-empty texts joined with " ",
        - genres: unique comma-separated genres, sorted and joined with ", ",
        - mean: arithmetic mean of the numeric scores (non-numeric become NaN),
        - media_type / status: unique non-null values joined with ", ".
    """
    # Work on a derived frame so the caller's 'mean' column is left untouched.
    prepared = new_data.assign(mean=pd.to_numeric(new_data["mean"], errors="coerce"))

    def combine_text(series):
        """Join the unique, non-empty, stripped texts of a group with spaces."""
        entries = series.dropna().astype(str).str.strip()
        entries = entries[entries != ""].unique()
        return " ".join(entries)

    def combine_genres(series):
        """Split comma-separated genre strings, deduplicate, sort and rejoin."""
        genres = set()
        for entry in series.dropna():
            for genre in str(entry).split(","):
                genre = genre.strip()
                if genre:
                    genres.add(genre)
        return ", ".join(sorted(genres))

    def join_unique(series):
        """Join the unique non-null values of a group with ", ".

        Values are cast to str first so numeric or mixed columns do not make
        ``str.join`` raise a TypeError.
        """
        return ", ".join(series.dropna().astype(str).unique())

    # Group by title and combine the rows using the per-column aggregations.
    combined = prepared.groupby("title", as_index=False).agg({
        "id": lambda ids: ", ".join(ids.astype(str).unique()),
        "alternative_titles": combine_text,
        "synopsis": combine_text,
        "genres": combine_genres,
        "mean": "mean",
        "media_type": join_unique,
        "status": join_unique,
    })

    return combined

In [128]:
# Collapse rows with the same title into one row per title; `df` is rebound to
# the grouped result.
df = combine_copies(df)

In [143]:
def compute_similarity(df):
    """Build a pairwise cosine-similarity matrix from each row's text fields.

    For every row the title, alternative titles, synopsis, genres, mean score,
    media type and status are joined into one string, stored in a new
    "combined" column on ``df`` (the frame is modified in place), vectorized
    with TF-IDF (English stop words removed) and compared with cosine
    similarity.

    Args:
        df: DataFrame holding the columns listed above; absent columns fall
            back to "", 0 or "unknown".

    Returns:
        The matrix returned by ``cosine_similarity`` for the TF-IDF vectors,
        with rows and columns in the same order as the rows of ``df``.
    """
    def as_text(row, column, default):
        """Return the cell as text; missing cells (None/NaN/NA) become ""."""
        value = row.get(column, default)
        # str(nan) would inject the literal word "nan" into the text and make
        # every row with a missing field look artificially similar.
        if value is None or value is pd.NA or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def combine_fields(row):
        """Join all descriptive fields of one row into a single string."""
        parts = [
            as_text(row, "title", ""),
            as_text(row, "alternative_titles", ""),
            as_text(row, "synopsis", ""),
            # Accept '|' separated genres as well as comma-separated ones.
            as_text(row, "genres", "").replace("|", ", "),
            as_text(row, "mean", 0),
            as_text(row, "media_type", "unknown"),
            as_text(row, "status", "unknown"),
        ]
        return " ".join(parts)

    # Keep the merged text on the frame, as before, so it can be inspected.
    df["combined"] = df.apply(combine_fields, axis=1)

    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(df["combined"])
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    return similarity_matrix


In [144]:
# Compute the title-by-title similarity matrix for the current `df`; this also
# adds the "combined" text column to `df`.
similarity_matrix = compute_similarity(df)

In [157]:
def recommend_anime(user_favorites, df, similarity_matrix, top_n=5):
    """Recommend titles similar to the user's favorites.

    Args:
        user_favorites: Iterable of titles the user likes (None is treated as
            empty). Titles not present in ``df["title"]`` are ignored.
        df: DataFrame whose "title" column is in the same row order as
            ``similarity_matrix``.
        similarity_matrix: Square array where entry [i, j] is the similarity
            between rows i and j of ``df``.
        top_n: Number of nearest neighbours considered per favorite.

    Returns:
        A list of distinct recommended titles, most similar first per favorite,
        in the order the favorites were given. A favorite is never recommended
        back to the user.
    """
    favorites = list(user_favorites or [])
    favorite_set = set(favorites)

    titles = df["title"].tolist()
    # Map each title to the position of its first row. Exact matching avoids
    # the regex/substring pitfalls of str.contains (wrong row picked, or an
    # error on titles containing characters such as "(" or "["), and positions
    # stay valid even if the frame's index labels are not 0..n-1.
    position_of = {}
    for position, title in enumerate(titles):
        position_of.setdefault(title, position)

    recommended = []
    seen = set()
    for anime_title in favorites:
        position = position_of.get(anime_title)
        if position is None:
            continue

        # Highest similarity first; a stable sort keeps ties in row order.
        ranked = (-similarity_matrix[position]).argsort(kind="stable")
        taken = 0
        for neighbor in ranked:
            if taken >= top_n:
                break
            neighbor = int(neighbor)
            # Skip the item itself explicitly instead of assuming it sorts last.
            if neighbor == position:
                continue
            candidate = titles[neighbor]
            if candidate in favorite_set:
                continue
            taken += 1
            if candidate not in seen:
                seen.add(candidate)
                recommended.append(candidate)
    return recommended

In [108]:
def get_anime_titles():
    """Return the values of the global ``df``'s "title" column as a list."""
    title_column = df["title"]
    return title_column.tolist()

In [109]:
def recommend(user_favorites):
    """Return recommendations for ``user_favorites`` as newline-separated text.

    Delegates to ``recommend_anime`` using the global ``df`` and
    ``similarity_matrix``.
    """
    picks = recommend_anime(user_favorites, df, similarity_matrix)
    return str.join("\n", picks)


In [110]:
def add_to_list(selected_anime, user_list):
    """Append ``selected_anime`` to ``user_list`` unless it is empty or already there.

    The list is modified in place and the same list object is returned.
    """
    # Nothing selected: leave the list untouched.
    if not selected_anime:
        return user_list
    # Already present: avoid duplicates.
    if selected_anime in user_list:
        return user_list
    user_list.append(selected_anime)
    return user_list

In [111]:
def clear_list():
    """Return a new empty list, used to reset the user's selection."""
    emptied = list()
    return emptied

In [147]:
def _add_and_render(selected_anime, current_list):
    """Add the selection to the user's list and return (new state, display text)."""
    updated = add_to_list(selected_anime, current_list)
    return updated, "\n".join(updated)


def _clear_and_render():
    """Reset the user's list and return (empty state, empty display text)."""
    return clear_list(), ""


# Build the Gradio UI: pick titles from a dropdown, collect them in a per-session
# list, and request recommendations for that list.
with gr.Blocks() as demo:
    gr.Markdown("# Anime Recommendation System")

    anime_dropdown = gr.Dropdown(get_anime_titles(), label="Select an anime to add to your list")
    user_list = gr.State([])  # Holds the list of selected anime
    anime_list_display = gr.Textbox(label="Your Anime List", interactive=False)

    add_button = gr.Button("Add to List")
    clear_button = gr.Button("Clear List")

    recommend_button = gr.Button("Get Recommendations")
    recommendations_output = gr.Textbox(label="Recommended Anime")

    # One handler per button updates the state and the textbox together.
    # Registering two separate listeners on the same click gives no guarantee
    # that the display refresh runs after the state update.
    add_button.click(
        _add_and_render,
        inputs=[anime_dropdown, user_list],
        outputs=[user_list, anime_list_display],
    )
    clear_button.click(_clear_and_render, outputs=[user_list, anime_list_display])

    recommend_button.click(recommend, inputs=user_list, outputs=recommendations_output)

In [159]:
# Start the Gradio server for `demo`. The __main__ guard keeps the app from
# launching if this code is imported as a module instead of being run directly.
if __name__ == "__main__":
    demo.launch()

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----

To create a public link, set `share=True` in `launch()`.
