In [4]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
data = pd.read_csv('anime_with_synopsis.csv')
data["description"] = data["Genres"] + data["sypnopsis"]
print(data.shape)
display(data.head())

(16214, 6)


Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis,description
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...","Action, Adventure, Comedy, Drama, Sci-Fi, Spac..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...","Action, Drama, Mystery, Sci-Fi, Spaceother day..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...","Action, Sci-Fi, Adventure, Comedy, Drama, Shou..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,"Adventure, Fantasy, Shounen, SupernaturalIt is..."


### TF
TF-IDF stands for Term Frequency-Inverse Document Frequency and is a commonly used method for determining the importance of words in a text document. 

The term frequency (TF) of a word is simply the number of times it appears in a document, normalized by the total number of words in the document: 
$$\mathrm{tf}{i,j} = \frac{n{i,j}}{\sum_k n_{k,j}}$$ 
where $n_{i,j}$ is the number of occurrences of word $i$ in document $j$. 

### IDF
The inverse document frequency (IDF) of a word is a measure of how important it is in the corpus as a whole. It is calculated as the logarithm of the total number of documents in the corpus divided by the number of documents in which the word appears: 
$$\mathrm{idf}i = \log\frac{N}{df_i}$$
where $N$ is the total number of documents in the corpus and $df_i$ is the number of documents in which word $i$ appears. 

### TF-IDF
The TF-IDF score for a word in a document is then the product of its term frequency and its inverse document frequency: 
$$\mathrm{tfidf}{i,j} = \mathrm{tf}_{i,j} \cdot \mathrm{idf}_i$$
TF-IDF is useful for identifying words that are important to a particular document, and can be used to represent the document as a vector of TF-IDF scores for each word. This vector can then be used in a content-based recommender system to find similar documents based on their TF-IDF vectors.

In [18]:
# Preprocess data
data["description"] = data["description"].fillna("")
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(data["description"])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)
print(cosine_sim.shape)

[[1.         0.23140044 0.02657159 ... 0.         0.0060251  0.0213021 ]
 [0.23140044 1.         0.04508836 ... 0.01092745 0.0073287  0.01645327]
 [0.02657159 0.04508836 1.         ... 0.         0.0191878  0.00171378]
 ...
 [0.         0.01092745 0.         ... 1.         0.         0.        ]
 [0.0060251  0.0073287  0.0191878  ... 0.         1.         0.        ]
 [0.0213021  0.01645327 0.00171378 ... 0.         0.         1.        ]]
(16214, 16214)


In [19]:
def get_recommendations(title, cosine_sim=cosine_sim, data=data):
    # Get index of movie with given title
    idx = data[data["Name"] == title].index[0]

    # Get pairwise similarities of all movies with the given movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top 10 most similar movies
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return data["Name"].iloc[movie_indices]

# Example usage
for i in range(10):
    rec = get_recommendations(data["Name"].values[i]).values
    print(f"most similar movies to {data['Name'].values[i]} are: {', '.join(rec)}")
    print()


most similar movies to Cowboy Bebop are: Cowboy Bebop: Tengoku no Tobira, Ginga Senpuu Braiger, Seihou Bukyou Outlaw Star, Cowboy Bebop: Yose Atsume Blues, Cowboy Bebop: Ein no Natsuyasumi, Sol Bianca: Taiyou no Fune, Wrestler Gundan Seisenshi Robin Jr., Odin: Koushi Hansen Starlight, Happening Star ☆, Bounty Dog: Getsumen no Ibu

most similar movies to Cowboy Bebop: Tengoku no Tobira are: Cowboy Bebop, Cowboy Bebop: Ein no Natsuyasumi, Ushinawareta Choushoku, Cowboy Bebop: Yose Atsume Blues, Iria: Zeiram The Animation, Bounty Hunter: The Hard, One Piece Film: Gold, Tetsuwan Atom to Sagurou! Dosei wo Mawaru Shinpi no Hoshi Titan, Ladyspo, Shin Seiki Den Mars

most similar movies to Trigun are: Trigun: Badlands Rumble, Isewan Taifuu Monogatari, Zetsumetsu Kigu-shun. (2020), Platonic Chain: Ansatsu Jikkouchuu, Itsumo Kokoro ni Taiyou wo!, Hakaima Sadamitsu, Bonobono (TV 2016), Kukuriraige: Sanseitai Denki, Uobbuchou, Kyoto Animation Koushiki Twitter: Itsumo Arigatou

most similar movies 