In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack

def load_data(csv_file_path):
  """Read the anime CSV and clean the columns the recommender relies on.

  Args:
    csv_file_path: Path (or anything ``pd.read_csv`` accepts) of the dataset.

  Returns:
    A DataFrame in which missing 'Synopsis' and 'Genres' entries are empty
    strings and 'Score' is numeric, with unparseable values turned into NaN.
  """
  frame = pd.read_csv(csv_file_path)

  # text columns must not contain NaN, so blank them out instead
  for text_column in ('Synopsis', 'Genres'):
    frame[text_column] = frame[text_column].fillna('')

  # anything that is not a number in 'Score' becomes NaN rather than raising
  frame['Score'] = pd.to_numeric(frame['Score'], errors='coerce')

  return frame

def build_similarity_matrix(df, weight_synopsis=1.0, weight_genres=2.0):
  """Build the pairwise cosine-similarity matrix between all rows of df.

  'Synopsis' and 'Genres' are vectorised separately with TF-IDF, each block
  is scaled by its weight, and the blocks are stacked side by side before
  the cosine similarity is computed.

  Args:
    df: DataFrame with string columns 'Synopsis' and 'Genres' (no NaN).
    weight_synopsis: Multiplier applied to the synopsis TF-IDF features.
    weight_genres: Multiplier applied to the genre TF-IDF features.

  Returns:
    The similarity matrix; entry [i, j] compares row i with row j of df
    (by position). It has one row and one column per row of df, so memory
    grows quadratically with the dataset size.
  """
  # separate vectorizers so each column gets its own vocabulary and IDF
  # NOTE(review): the default tokenizer splits multi-word genres such as
  # "Slice of Life" into separate words -- confirm that this is intended.
  tfidf_synopsis = TfidfVectorizer(stop_words='english')
  tfidf_genres = TfidfVectorizer(stop_words='english')

  tfidf_matrix_synopsis = tfidf_synopsis.fit_transform(df['Synopsis'])
  tfidf_matrix_genres = tfidf_genres.fit_transform(df['Genres'])

  # scale each feature block, then place them next to each other; asking for
  # CSR up front avoids a format conversion inside cosine_similarity
  combined_tfidf = hstack(
    [weight_synopsis * tfidf_matrix_synopsis, weight_genres * tfidf_matrix_genres],
    format='csr',
  )

  # with a single argument the matrix is compared against itself
  return cosine_similarity(combined_tfidf)

def recommend_anime(anime_name, df, cosine_sim, n):
  """Return the n anime most similar to anime_name, weighted by rating.

  Every candidate is ranked by its similarity to the query title multiplied
  by its score scaled to the 0-1 range (Score / 10). Missing or non-numeric
  scores count as 0, which pushes those titles to the bottom.

  Args:
    anime_name: Title to look up in df['Name'] (exact match). If the title
      occurs more than once, the first occurrence is used.
    df: DataFrame with 'anime_id', 'Name', 'Score' and 'Genres' columns whose
      row order matches the rows of cosine_sim.
    cosine_sim: Square similarity matrix addressed by row position in df.
    n: Number of recommendations to return.

  Returns:
    A DataFrame with the columns anime_id, Name, Score and Genres for the
    recommended titles, best first, or the string "'<anime_name>' not found"
    when the title is not present.
  """
  # Map titles to row *positions* rather than index labels: cosine_sim and
  # df.iloc are both positional, so this stays correct when df carries a
  # non-default index (for example after filtering rows).
  positions = pd.Series(range(len(df)), index=df['Name'])
  # Series.drop_duplicates() compares the values, which are always unique
  # here; repeated titles have to be removed on the index instead so that the
  # lookup below always yields a single position.
  positions = positions[~positions.index.duplicated(keep='first')]
  if anime_name not in positions:
    return f"'{anime_name}' not found"

  index = positions.loc[anime_name]

  # scores scaled to 0-1; missing/unparseable scores are treated as 0
  norm_scores = (
    pd.to_numeric(df['Score'], errors='coerce').fillna(0).to_numpy(dtype=float) / 10
  )
  combined = norm_scores * cosine_sim[index]

  # highest combined score first; the stable sort keeps equal scores in their
  # original row order
  order = (-combined).argsort(kind='stable')

  # exclude the input anime and take the next n entries
  top_positions = order[order != index][:n]

  # return selected columns for the recommended anime
  return df.iloc[top_positions][['anime_id', 'Name', 'Score', 'Genres']]

if __name__ == "__main__":
  # Demo run: load the dataset, build the similarity matrix once, then print
  # the top five recommendations for a single title.
  anime_to_recommend = "Lycoris Recoil"
  csv_file_path = './dataset/anime-dataset-2023.csv'

  df = load_data(csv_file_path)
  cosine_sim = build_similarity_matrix(df)

  recommendations = recommend_anime(
    anime_name=anime_to_recommend,
    df=df,
    cosine_sim=cosine_sim,
    n=5,
  )

  print("Recommendations for", anime_to_recommend)
  print(recommendations)


Recommendations for Lycoris Recoil
       anime_id                                 Name  Score  Genres
16191     40682                   Kingdom 3rd Season   8.81  Action
21676     50160                   Kingdom 4th Season   8.75  Action
9353      25781  Shingeki no Kyojin: Kuinaki Sentaku   8.41  Action
21530     49918     Boku no Hero Academia 6th Season   8.36  Action
7627      17389                   Kingdom 2nd Season   8.33  Action
