In [16]:
import numpy as np 
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
def preprocess_text(text):
    # Ensure that the input is a string
    if isinstance(text, str):
        # Tokenize the text into words
        words = word_tokenize(text)
        
        # Apply re.sub to each word individually
        cleaned_words = [re.sub('[^A-Za-z0-9]+', '', word) for word in words]
        
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        cleaned_words = [word for word in cleaned_words if word not in stop_words]
        
        # Join the cleaned words with a space
        processed_text = ' '.join(cleaned_words)
        return processed_text
    else:
        return text  # Return the input as is if it's not a string

#cf['sypnopsis'] = cf['sypnopsis'].apply(preprocess_text)
#cf

In [90]:
class Content_Categorical:
    def __init__(self):
        cf = pd.read_csv("anime_with_synopsis.csv")
        cf.drop(["Score"], axis=1, inplace=True)
        cf['Genres'] = cf['Genres'].apply(lambda x: x.replace(",", "").lower())
        cf['sypnopsis'] = cf['sypnopsis'].apply(lambda x: str(x).replace(",", "").lower())
        cf['sypnopsis'] = cf['sypnopsis'].apply(preprocess_text)

        cf['tags'] = cf['Genres'] + cf['sypnopsis']
        cf.drop(['sypnopsis', 'Genres'], axis=1, inplace=True)
        self.cf = cf
        tfidf_vectorizer = TfidfVectorizer(max_features=20000, stop_words='english')
        tfidf_vectors = tfidf_vectorizer.fit_transform(cf['tags'])
        self.similarity = cosine_similarity(tfidf_vectors)
    
    def recommend(self, title):
        anime_id = self.cf[self.cf['Name'] == title]['MAL_ID'].values[0]  # Get MAL_ID

        if anime_id is not None:
            anime_index = self.cf[self.cf['MAL_ID'] == anime_id].index[0]
            distances = self.similarity[anime_index]
            anime_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

#            for i in anime_list:
#                anime_name = self.cf.iloc[i[0]]['Name']
#                print(f'{anime_name}: {i[1]}')  # Print anime name and similarity score

            recommended_anime_data = [(self.cf.iloc[i[0]]['MAL_ID'], i[1]) for i in anime_list]

            return recommended_anime_data
        else:
            print("Anime not found in the dataset")
            return None


In [87]:
df = pd.read_csv("anime.csv")

In [91]:
cc = Content_Categorical()

In [92]:
cc.recommend("Tengen Toppa Gurren Lagann")

Tengen Toppa Gurren Lagann: Ore no Gurren wa Pikka-Pika!!: 0.2695082677460388
Nagisa: 0.26602913315228793
Tengen Toppa Gurren Lagann: Mitee Mono wa Miteen da!!: 0.24622181795535764
Blue Remains: 0.17504351696649048
Nagi no Asu kara: 0.14075403611151388


[(10622, 0.2695082677460388),
 (32776, 0.26602913315228793),
 (3352, 0.24622181795535764),
 (4733, 0.17504351696649048),
 (16067, 0.14075403611151388)]

In [93]:
cc.recommend("Bakemonogatari")

Nisemonogatari: 0.4483851894978049
Nekomonogatari: Kuro: 0.4347018286480494
Owarimonogatari 2nd Season: 0.43256660677276476
Zoku Owarimonogatari: 0.3461945739952347
Kizumonogatari III: Reiketsu-hen: 0.3403874616485157


[(11597, 0.4483851894978049),
 (15689, 0.4347018286480494),
 (35247, 0.43256660677276476),
 (36999, 0.3461945739952347),
 (31758, 0.3403874616485157)]

In [None]:
cc.recommend("Dragon Ball")

(785, 0.35355764771870335)
Dragon Ball Movie 3: Makafushigi Daibouken
(3786, 0.35275410495008563)
Dragon Ball Kai
(193, 0.2946085406346619)
Dragon Ball GT
(458, 0.2922438253326253)
Dragon Ball Movie 1: Shen Long no Densetsu
(716, 0.28544348546307197)
Dragon Ball Z


[(785, 0.35355764771870335),
 (3786, 0.35275410495008563),
 (193, 0.2946085406346619),
 (458, 0.2922438253326253),
 (716, 0.28544348546307197)]