In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Source tables, read from CSV files in the working directory:
# per-user ratings and the anime metadata they refer to.
rating = pd.read_csv(filepath_or_buffer='rating_complete.csv')
anime = pd.read_csv(filepath_or_buffer='anime.csv')

In [12]:
# Only the ID (join key for the ratings) and the English title are used downstream.
# Selecting the columns to keep, instead of dropping every other column in place,
# keeps this cell safe to re-run and independent of the other columns in the CSV.
anime = anime[['MAL_ID', 'English name']].copy()

anime.head()

Unnamed: 0,MAL_ID,English name
0,1,Cowboy Bebop
1,5,Cowboy Bebop:The Movie
2,6,Trigun
3,7,Witch Hunter Robin
4,8,Beet the Vandel Buster


In [18]:
# Show which columns remain in the anime table
anime.columns

Index(['MAL_ID', 'English name'], dtype='object')

In [9]:
# Preview the first five rating rows
rating.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,0,430,9.0
1,0,1004,5.0
2,0,3010,7.0
3,0,570,7.0
4,0,2762,9.0


In [13]:
# Attach the anime title to every rating (inner join: anime_id == MAL_ID)
rating_anime = pd.merge(
    rating,
    anime,
    how='inner',
    left_on='anime_id',
    right_on='MAL_ID',
)

In [26]:
# Preview the first five rows of the merged table
rating_anime.head(n=5)

Unnamed: 0,user_id,rating,English name,adjusted_rating
0,0,9.0,Fullmetal Alchemist:The Movie - Conqueror of S...,1.6
1,6,8.0,Fullmetal Alchemist:The Movie - Conqueror of S...,0.926045
2,18,10.0,Fullmetal Alchemist:The Movie - Conqueror of S...,0.292683
3,19,8.0,Fullmetal Alchemist:The Movie - Conqueror of S...,0.459499
4,33,4.0,Fullmetal Alchemist:The Movie - Conqueror of S...,-2.910448


In [15]:
# Both ID columns were only needed as join keys; from here on a title is identified
# by its 'English name'. Dropping them in one non-inplace call with errors='ignore'
# lets the cell be re-run without raising KeyError on the already-removed columns.
rating_anime = rating_anime.drop(columns=['MAL_ID', 'anime_id'], errors='ignore')

In [16]:
# Preview the merged table once the ID columns are gone
rating_anime.head(n=5)

Unnamed: 0,user_id,rating,English name
0,0,9.0,Fullmetal Alchemist:The Movie - Conqueror of S...
1,6,8.0,Fullmetal Alchemist:The Movie - Conqueror of S...
2,18,10.0,Fullmetal Alchemist:The Movie - Conqueror of S...
3,19,8.0,Fullmetal Alchemist:The Movie - Conqueror of S...
4,33,4.0,Fullmetal Alchemist:The Movie - Conqueror of S...


In [19]:
# Mean rating given by each user (Series indexed by user_id)
ratings_by_user = rating_anime.groupby(by='user_id')['rating']
user_avg_ratings = ratings_by_user.mean()

In [20]:
# Mean-centre every rating: subtract the rating user's own average so that users
# with different rating habits become comparable. The result is stored in a
# separate column. The per-user mean is looked up with a vectorised map instead of
# a row-wise apply, which yields the same values without a Python call per row.
rating_anime['adjusted_rating'] = (
    rating_anime['rating'] - rating_anime['user_id'].map(user_avg_ratings)
)

In [21]:
# Preview the table with the new adjusted_rating column
rating_anime.head(n=5)

Unnamed: 0,user_id,rating,English name,adjusted_rating
0,0,9.0,Fullmetal Alchemist:The Movie - Conqueror of S...,1.6
1,6,8.0,Fullmetal Alchemist:The Movie - Conqueror of S...,0.926045
2,18,10.0,Fullmetal Alchemist:The Movie - Conqueror of S...,0.292683
3,19,8.0,Fullmetal Alchemist:The Movie - Conqueror of S...,0.459499
4,33,4.0,Fullmetal Alchemist:The Movie - Conqueror of S...,-2.910448


In [22]:
# User x title matrix of adjusted ratings: one row per user_id, one column per
# 'English name'. pivot_table aggregates with the mean by default, so several
# ratings that share a user and a title are averaged into a single cell.
pivot_table = rating_anime.pivot_table(
    index='user_id',
    columns='English name',
    values='adjusted_rating',
)

In [27]:
# Summary statistics (count, mean, std, quartiles, min/max) for each title column
pivot_table.describe()

English name,"""Parade"" de Satie","""Star""t",-OutsideR:RequieM-,.Koni-chan,.hack//G.U. Trilogy,.hack//G.U. Trilogy:Parody Mode,.hack//Gift,.hack//Legend Of The Twilight,.hack//Legend of the Twilight Offline Meeting Special,.hack//Liminality,...,the Garden of sinners Chapter 7:Murder Speculation Part B,the Garden of sinners Chapter 8:The Final Chapter,the Garden of sinners Pre-show Reminder,the Garden of sinners Remix -Gate of seventh heaven-,tsuritama,xxxHOLiC,xxxHOLiC The Movie:A Midsummer Night's Dream,∀ Gundam,∀ Gundam I:Earth Light,∀ Gundam II:Moonlight Butterfly
count,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,...,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0,11576.0
mean,-0.000656,-9e-06,-0.000914,5.9e-05,-0.005074,-0.004371,-0.011637,-0.022321,-0.007393,-0.00995,...,0.094129,-0.011449,-0.005777,0.005777,0.011494,0.038808,0.011724,0.004958,-0.000429,-0.000114
std,0.041238,0.026671,0.04139,0.049738,0.157625,0.118369,0.205956,0.235533,0.149479,0.179198,...,0.47078,0.31193,0.15031,0.182,0.237666,0.33593,0.205724,0.156728,0.048649,0.046619
min,-2.818142,-1.333333,-3.035461,-2.601399,-4.864258,-5.987055,-7.093863,-4.053521,-5.577681,-5.791489,...,-6.395349,-5.8875,-6.678571,-5.239796,-3.577828,-5.909091,-4.019651,-4.748879,-2.179222,-2.179222
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.139608,1.614835,0.173252,3.035545,2.166012,2.166012,2.741782,1.814262,1.628512,2.190476,...,4.743533,3.481013,2.802111,3.334286,4.616781,4.236028,3.448276,3.693593,2.634304,2.634304


In [28]:
# Cosine similarity needs complete vectors, so every missing (NaN) cell is set to 0.
# Because the ratings are mean-centred, 0 corresponds to a user's own average rating.
pivot_table = pivot_table.fillna(value=0)

### TODO: predict the missing ratings with a DNN instead of filling them with 0s

In [29]:
# Ask for the title to base the recommendations on. The value is used verbatim as a
# column label of the pivot table, so surrounding whitespace is stripped to avoid a
# KeyError caused by a stray space or newline in the typed title.
input_anime_name = input('Enter movie title: ').strip()

Enter movie title: Cowboy Bebop


In [30]:
def calculate_cosine_similarity(input_ratings, other_ratings):
    """Return the cosine similarity between two rating vectors.

    Each vector is wrapped in a list so that scikit-learn's pairwise
    ``cosine_similarity`` sees two single-row inputs; the only entry of the
    resulting matrix is returned.
    """
    pairwise_matrix = cosine_similarity([input_ratings], [other_ratings])
    first_row = pairwise_matrix[0]
    return first_row[0]

In [31]:
def calculate_anime_similarity(input_movieId, pivot_table):
    """Cosine similarity between one pivot-table column and every other column.

    All similarities are computed in a single vectorised pass instead of one
    library call per column.

    Parameters
    ----------
    input_movieId : hashable
        Label of the column in ``pivot_table`` to compare against.
    pivot_table : pandas.DataFrame
        Numeric table with one column per title. It should contain no NaNs
        (fill them beforehand); a NaN in a column makes that column's
        similarity NaN.

    Returns
    -------
    pandas.DataFrame
        Columns ``anime_name`` and ``similarity``, one row for every column of
        ``pivot_table`` other than ``input_movieId``, in column order. A column
        whose vector is all zeros (or an all-zero input column) gets
        similarity 0.0.

    Raises
    ------
    KeyError
        If ``input_movieId`` is not a column of ``pivot_table``.
    """
    if input_movieId not in pivot_table.columns:
        raise KeyError(f"{input_movieId!r} is not a column of the pivot table")

    ratings_matrix = pivot_table.to_numpy(dtype=float)
    input_vector = pivot_table[input_movieId].to_numpy(dtype=float)

    # Dot product of the input column with every column at once.
    dot_products = input_vector @ ratings_matrix

    # Column-wise Euclidean norms; einsum avoids materialising a squared copy
    # of the (potentially very large) matrix.
    column_norms = np.sqrt(np.einsum('ij,ij->j', ratings_matrix, ratings_matrix))
    input_norm = np.sqrt(input_vector @ input_vector)
    denominators = input_norm * column_norms

    # Zero-length vectors have no direction: report similarity 0 rather than
    # dividing by zero.
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators != 0,
    )

    # Exclude the input title itself from the result.
    is_other = pivot_table.columns != input_movieId
    return pd.DataFrame({
        'anime_name': pivot_table.columns[is_other].to_numpy(),
        'similarity': similarities[is_other],
    })

In [32]:
# Column of the pivot table holding every user's value for the chosen title
input_anime_ratings = pivot_table.loc[:, input_anime_name]

In [33]:
# Score every other title in the pivot table against the chosen one
movie_similarity_df = calculate_anime_similarity(
    input_movieId=input_anime_name,
    pivot_table=pivot_table,
)

In [34]:
# Order the similarity table from most to least similar.
movie_similarity_df = movie_similarity_df.sort_values(by='similarity', ascending=False)

# Keep the three best matches. reset_index() renumbers the rows from 0 and
# keeps the previous row labels in an 'index' column.
top_3_recommendations = movie_similarity_df.nlargest(3, 'similarity').reset_index()

print("Top 3 movie recommendations for the user:\n")

# Report each recommended title together with its similarity score.
for title, score in zip(top_3_recommendations['anime_name'], top_3_recommendations['similarity']):
    print(f"Title: {title} - Similarity: {score}")

Top 3 movie recommendations for the user:

Title: Cowboy Bebop:The Movie - Similarity: 0.5423642554594847
Title: Samurai Champloo - Similarity: 0.4347941447747884
Title: Ghost in the Shell - Similarity: 0.3862780950883836
