In [63]:
import pandas as pd
import numpy as np
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path

In [68]:
ROOT_DIR = Path.cwd()
data_dir = ROOT_DIR / 'top_anime_dataset_v2.csv'

In [None]:
pd.set_option("display.max.columns", None)
anime_master_df = pd.read_csv(data_dir)
anime_master_df.head()

In [None]:
print(anime_master_df['themes'].isnull().sum())
anime_master_df.info()

In [4]:
anime_master_df.drop_duplicates(inplace = True)

In [None]:
anime_master_df.info()

In [None]:
anime_master_df.columns

In [None]:
anime_master_df['synopsis'][1]

In [6]:
selected_features = ['genres', 'themes', 'demographics', 'synopsis', 'type', 'producers', 'source']

for col in selected_features:
    anime_master_df[col] = anime_master_df[col].fillna('')

In [None]:
anime_master_df.isnull().sum()

In [7]:
anime_master_df['combined_features'] = (
    anime_master_df['genres'] + ' ' +
    anime_master_df['themes'] + ' ' +
    anime_master_df['demographics'] + ' ' +
    anime_master_df['synopsis'] + ' ' +
    anime_master_df['type'] + ' ' +
    anime_master_df['producers'] + ' ' +
    anime_master_df['source']
)

In [8]:
def preprocess(text):
    if not isinstance(text, str):
        return ""
    ps = PorterStemmer()
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n', '', text)
    text = " ".join(ps.stem(word) for word in text.split())
    
    return text

anime_master_df['combined_features'] = anime_master_df['combined_features'].apply(preprocess)

In [9]:
anime_master_df['combined_features'][0]

'adventure, drama, fantasi shounen dure their decade-long quest to defeat the demon king, the member of the hero\' party—himmel himself, the priest heiter, the dwarf warrior eisen, and the elven mage frieren—forg bond through adventur and battles, creat unforgett preciou memori for most of them. however, the time that frieren spend with her comrad is equival to mere a fraction of her life, which ha last over a thousand years. when the parti disband after their victory, frieren casual return to her "usual" routin of collect spell across the continent. due to her differ sens of time, she seemingli hold no strong feel toward the experi she went through. as the year pass, frieren gradual realiz how her day in the hero\' parti truli impact her. wit the death of two of her former companions, frieren begin to regret have taken their presenc for granted; she vow to better understand human and creat real person connections. although the stori of that onc memor journey ha long ended, a new tale 

In [10]:
tf_vec = TfidfVectorizer(stop_words = 'english')
vectorized_features = tf_vec.fit_transform(anime_master_df['combined_features'])

In [11]:
similarity = cosine_similarity(vectorized_features)

In [13]:
movie = input('Movie: ')

Movie:  Bleach


In [14]:
input_idx = anime_master_df[anime_master_df['name'] == movie].index[0].item()
input_idx

724

In [18]:
# def recommend_anime(anime):
#     anime_index = animedf[animedf['name'] == anime].index[0].item()
#     distances = list(enumerate(similarity[anime_index]))
#     sorted_distances = sorted(distances, reverse=True, key=lambda x: x[1])

#     candidates = []
#     for i in sorted_distances[1:9]:
#         idx = i[0]
#         candidates.append({
#             'name': animedf.iloc[idx]['name'],
#             'poster_url': animedf.iloc[idx]['image_url'],
#             'anime_url': animedf.iloc[idx]['anime_url'],
#             'score': animedf.iloc[idx]['score'],
#         })
    
#     candidates = sorted(candidates, reverse=True, key=lambda x: x['score'])

#     recommended_anime_names = [c['name'] for c in candidates]

#     return recommended_anime_names

In [None]:
recommend_anime('One Piece Fan Letter')

In [29]:
anime_dict = {}
for i in range(similarity.shape[0]):
    top_idx = np.argsort(similarity[i])[-11:][::-1]
    top_idx = [idx for idx in top_idx if idx != i]
    anime_dict[i] = top_idx[:10]
    
recs_df = pd.DataFrame.from_dict(anime_dict, orient = 'index')

In [30]:
recs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2539,3416,754,1886,5257,935,14187,3104,4804,3393
1,525,2039,3187,5912,675,3027,6009,8214,14893,12905
2,126,2309,328,170,295,14629,8784,3057,20,9712
3,359,512,1021,915,54,416,10323,2315,607,6593
4,792,36,133,46,25,115,2415,86,1071,3507


In [46]:
def recommend(anime):
    anime_index = animedf[animedf['name'] == anime].index[0]
    neighbor_ids = recs_df.iloc[anime_index].dropna().astype(int).tolist()

    candidates = []
    for idx in neighbor_ids:
        candidates.append(
            {
                'name': animedf.iloc[idx]['name'],
                'poster_url': animedf.iloc[idx]['image_url'],
                'anime_url': animedf.iloc[idx]['anime_url'],
                'score': animedf.iloc[idx]['score']
            }
        )
    names = [c['name'] for c in candidates]

    return names

In [47]:
recommend('Fullmetal Alchemist: Brotherhood')

['Fullmetal Alchemist',
 'Fullmetal Alchemist: The Conqueror of Shamballa',
 'Fullmetal Alchemist: The Sacred Star of Milos',
 'Fullmetal Alchemist: The Sacred Star of Milos Specials',
 'Fullmetal Alchemist: Brotherhood Specials',
 'Fullmetal Alchemist: Premium Collection',
 'Kishin Taisen Gigantic Formula',
 'Yoroiden Samurai Troopers Kikoutei Densetsu',
 'Otona no Bouguya-san',
 'Garo: Guren no Tsuki']

In [61]:
recs_df.to_csv('artifacts/similarity_matrix.csv', index = False)

In [17]:
anime_cols = ['anime_id', 'anime_url', 'image_url', 'name', 'score', 'themes', 'demographics', 'synopsis', 'type', 'episodes', 'producers', 'source', 'combined_features']
animedf = anime_master_df[anime_cols]

In [60]:
animedf.to_csv('artifacts/anime_data.csv', index = False)

In [None]:
# import numpy as np
# np.save("artifacts/similarity_matrix.npy", similarity.astype("float32"))

In [40]:
import requests
import pandas as pd
url = 'https://api.jikan.moe/v4/seasons/now?limit=25'

response = requests.get(url)
data = response.json()
data['data']

new_anime_list = []
for item in data['data']:
    new_anime_list.append({
        'anime_id': item['mal_id'],
        'anime_url': item['url'],
        'image_url': item['images']['jpg']['image_url'],
        'name': item['title'],
        'score': item.get('score', None),
        'themes': [t["name"] for t in item.get("themes", [])],
        'demographics': [d["name"] for d in item.get("demographics", [])],
        'synopsis': item.get('synopsis', ''),
        'type': item.get('type', ''),
        'episodes': item.get("episodes", None),
        'producers': [p["name"] for p in item.get("producers", [])],
        'source': item.get("source", ""),
        'combined_features': None
    })

new_df = pd.DataFrame(new_anime_list)

In [None]:
new_df = new_df.sort_values(by = 'score', ascending = False).head()
new_df

In [None]:
new_df.to_csv('artifacts/trending_df.csv', index = False)

In [None]:
animedf.head()