In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

from surprise import Dataset, Reader, NormalPredictor, BaselineOnly
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split
from surprise.prediction_algorithms import SVD, KNNBasic, KNNBaseline
from surprise import accuracy
from tqdm import tqdm
from statistics import mean

In [118]:
anime = pd.read_csv('anime.csv')

In [119]:
anime.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [120]:
#checking null values
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

In [121]:
synopsis = pd.read_csv('anime_with_synopsis.csv')

In [122]:
synopsis.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [123]:
#dropping the name, score, genres of the synopsis and joining it with anime
synopsis = synopsis.drop(['Name', 'Score', 'Genres'], axis=1)

In [124]:
anime = anime.merge(synopsis, how='outer', on='MAL_ID' )

In [125]:
anime.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,It is the dark century and the people are suff...


In [126]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

In [127]:
#remove NA
anime = anime[anime['Episodes']!= 'Unknown']

In [128]:
anime['MAL_ID'].duplicated().any()

False

In [129]:
anime = anime.reset_index()
anime = anime.drop(['index'], axis =1)

In [130]:
anime['new_anime_index'] = anime.index
anime.shape

(17046, 37)

In [131]:
anime[['Score', 'Score-10','Score-9','Score-8','Score-7','Score-6','Score-5','Score-4','Score-3','Score-2','Score-1']] = anime[['Score', 'Score-10','Score-9','Score-8','Score-7','Score-6','Score-5','Score-4','Score-3','Score-2','Score-1']].replace('Unknown', 0)

anime[['Score', 'Score-10','Score-9','Score-8','Score-7','Score-6','Score-5','Score-4','Score-3','Score-2','Score-1']] = anime[['Score', 'Score-10','Score-9','Score-8','Score-7','Score-6','Score-5','Score-4','Score-3','Score-2','Score-1']].apply(pd.to_numeric)

anime['no_of_scores'] = anime['Score-10'] + anime['Score-9'] + anime['Score-8'] + anime['Score-7'] + anime['Score-6'] + anime['Score-5'] + anime['Score-4'] + anime['Score-3'] + anime['Score-2'] + anime['Score-1']


In [132]:
top15_score = anime.sort_values('Score', ascending=False).iloc[:15][['English name', 'Japanese name', 'Score', 'no_of_scores']]
top15_score

Unnamed: 0,English name,Japanese name,Score,no_of_scores
3964,Fullmetal Alchemist:Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,9.19,1438767.0
15740,Attack on Titan Final Season,進撃の巨人 The Final Season,9.17,288274.0
5672,Steins;Gate,STEINS;GATE,9.11,989905.0
14828,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,9.1,728435.0
9892,Gintama Season 4,銀魂°,9.1,161812.0
6462,Hunter x Hunter,HUNTER×HUNTER（ハンター×ハンター）,9.1,1026866.0
5995,Gintama Season 2,銀魂',9.08,162866.0
739,Legend of the Galactic Heroes,銀河英雄伝説,9.07,58655.0
7249,Gintama:Enchousen,銀魂' 延長戦,9.04,113662.0
9865,A Silent Voice,聲の形,9.0,940843.0


In [133]:
#Above is general recommendation using Popularity of the anime, based on average Score

In [134]:
top10_score_count = anime.sort_values('no_of_scores', ascending=False).iloc[:10][['English name', 'Japanese name', 'Score', 'no_of_scores']]
top10_score_count

Unnamed: 0,English name,Japanese name,Score,no_of_scores
1389,Death Note,デスノート,8.63,1826691.0
7437,Attack on Titan,進撃の巨人,8.48,1791099.0
6602,Sword Art Online,ソードアート・オンライン,7.25,1574372.0
10420,One Punch Man,ワンパンマン,8.57,1478645.0
3964,Fullmetal Alchemist:Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,9.19,1438767.0
11147,My Hero Academia,僕のヒーローアカデミア,8.11,1305467.0
10,Naruto,ナルト,7.91,1268593.0
8633,Tokyo Ghoul,東京喰種-トーキョーグール-,7.81,1260191.0
11269,Your Name.,君の名は。,8.96,1190759.0
8135,"No Game, No Life",ノーゲーム・ノーライフ,8.2,1137376.0


In [None]:
#This is based on number of scores

In [135]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17046 entries, 0 to 17045
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MAL_ID           17046 non-null  int64  
 1   Name             17046 non-null  object 
 2   Score            17046 non-null  float64
 3   Genres           17046 non-null  object 
 4   English name     17046 non-null  object 
 5   Japanese name    17046 non-null  object 
 6   Type             17046 non-null  object 
 7   Episodes         17046 non-null  object 
 8   Aired            17046 non-null  object 
 9   Premiered        17046 non-null  object 
 10  Producers        17046 non-null  object 
 11  Licensors        17046 non-null  object 
 12  Studios          17046 non-null  object 
 13  Source           17046 non-null  object 
 14  Duration         17046 non-null  object 
 15  Rating           17046 non-null  object 
 16  Ranked           17046 non-null  object 
 17  Popularity  

In [136]:
anime_features = anime[['new_anime_index', 'Name', 'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Genres']]
anime_features.head()

Unnamed: 0,new_anime_index,Name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Genres
0,0,Cowboy Bebop,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
1,1,Cowboy Bebop: Tengoku no Tobira,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,"Action, Drama, Mystery, Sci-Fi, Space"
2,2,Trigun,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen"
3,3,Witch Hunter Robin,TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,"Action, Mystery, Police, Supernatural, Drama, ..."
4,4,Bouken Ou Beet,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,"Adventure, Fantasy, Shounen, Supernatural"


In [137]:
anime_features = anime_features.drop(['Premiered', 'Producers', 'Duration', 'Licensors'], axis=1)

In [138]:
anime_features.head()

Unnamed: 0,new_anime_index,Name,Type,Episodes,Aired,Studios,Source,Genres
0,0,Cowboy Bebop,TV,26,"Apr 3, 1998 to Apr 24, 1999",Sunrise,Original,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
1,1,Cowboy Bebop: Tengoku no Tobira,Movie,1,"Sep 1, 2001",Bones,Original,"Action, Drama, Mystery, Sci-Fi, Space"
2,2,Trigun,TV,26,"Apr 1, 1998 to Sep 30, 1998",Madhouse,Manga,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen"
3,3,Witch Hunter Robin,TV,26,"Jul 2, 2002 to Dec 24, 2002",Sunrise,Original,"Action, Mystery, Police, Supernatural, Drama, ..."
4,4,Bouken Ou Beet,TV,52,"Sep 30, 2004 to Sep 29, 2005",Toei Animation,Manga,"Adventure, Fantasy, Shounen, Supernatural"


In [139]:
anime_features.Type.unique()

array(['TV', 'Movie', 'OVA', 'Special', 'ONA', 'Music', 'Unknown'],
      dtype=object)

In [140]:
anime_features.Episodes.unique()

array(['26', '1', '52', '145', '24', '74', '220', '178', '12', '22', '69',
       '25', '4', '94', '5', '3', '13', '23', '43', '6', '50', '47', '51',
       '49', '39', '8', '7', '75', '62', '14', '44', '45', '64', '101',
       '27', '161', '2', '153', '70', '78', '42', '11', '167', '150',
       '366', '9', '16', '38', '48', '10', '76', '40', '20', '37', '41',
       '112', '224', '180', '296', '358', '63', '276', '46', '54', '15',
       '21', '35', '124', '86', '102', '36', '67', '291', '110', '29',
       '55', '201', '142', '109', '34', '136', '32', '73', '114', '19',
       '195', '58', '155', '96', '103', '113', '104', '192', '191', '203',
       '56', '500', '80', '172', '65', '117', '28', '61', '30', '148',
       '128', '100', '17', '243', '92', '105', '79', '31', '1787', '53',
       '33', '130', '18', '97', '193', '115', '170', '66', '330', '108',
       '68', '119', '95', '137', '60', '77', '72', '127', '99', '373',
       '300', '163', '91', '88', '154', '156', '694', '8

In [141]:
anime_features['Episodes'] = pd.to_numeric(anime_features['Episodes'].copy())

In [142]:
max_episodes = anime_features['Episodes'].max()

In [143]:
anime_features['Episodes'] = anime_features['Episodes'].copy().replace(1, "One Episode")
anime_features['Episodes'] = anime_features['Episodes'].copy().replace(range(2,11), "Short")
anime_features['Episodes'] = anime_features['Episodes'].copy().replace(range(11,15), "3-months")
anime_features['Episodes'] = anime_features['Episodes'].copy().replace(range(15,21), "4-5-months")
anime_features['Episodes'] = anime_features['Episodes'].copy().replace(range(21,27), "6-months")
anime_features['Episodes'] = anime_features['Episodes'].copy().replace(range(27,51), "Long")
anime_features['Episodes'] = anime_features['Episodes'].copy().replace(range(51,max_episodes+1), "Very Long")

anime_features.Episodes.unique()

array(['6-months', 'One Episode', 'Very Long', '3-months', 'Short',
       'Long', '4-5-months'], dtype=object)

In [144]:
from datetime import datetime

lstAired = []

for date in anime_features.Aired:

    #seperate the date
    x = date.split(' to ')

    #if the date follows the following format: eg. Jan 01, 2000
    try:
        date_object = datetime.strptime(x[0], "%b %d, %Y")
    except:

        #if the date follows the following format: eg. Jan, 2000
        try:
            date_object = datetime.strptime(x[0], "%b, %Y")
        except:

            #if the date follows the following format: eg. 2000
            try:
                date_object = datetime.strptime(x[0], "%Y")
            except:

                #append unknown values to the list
                lstAired.append(x[0])
                continue

    #append the year value to the list
    lstAired.append(date_object.year)

In [145]:
lstEra = []

for t in lstAired:

    if t == 'Unknown':
        lstEra.append("Unknown")
        continue

    if t < 2000:
        lstEra.append("Very Old")
    elif t < 2005:
        lstEra.append("Old")
    elif t < 2010:
        lstEra.append("Modern")
    elif t < 2015:
        lstEra.append("Recent")
    else:
        lstEra.append("New")

anime_features['Era'] = lstEra
anime_features

Unnamed: 0,new_anime_index,Name,Type,Episodes,Aired,Studios,Source,Genres,Era
0,0,Cowboy Bebop,TV,6-months,"Apr 3, 1998 to Apr 24, 1999",Sunrise,Original,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Very Old
1,1,Cowboy Bebop: Tengoku no Tobira,Movie,One Episode,"Sep 1, 2001",Bones,Original,"Action, Drama, Mystery, Sci-Fi, Space",Old
2,2,Trigun,TV,6-months,"Apr 1, 1998 to Sep 30, 1998",Madhouse,Manga,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Very Old
3,3,Witch Hunter Robin,TV,6-months,"Jul 2, 2002 to Dec 24, 2002",Sunrise,Original,"Action, Mystery, Police, Supernatural, Drama, ...",Old
4,4,Bouken Ou Beet,TV,Very Long,"Sep 30, 2004 to Sep 29, 2005",Toei Animation,Manga,"Adventure, Fantasy, Shounen, Supernatural",Old
...,...,...,...,...,...,...,...,...,...
17041,17041,Kitarou Tanjou: Gegege no Nazo,Movie,One Episode,Unknown,Unknown,Manga,"Comedy, Demons, Supernatural, Shounen",Unknown
17042,17042,"The Sun, Moon and Stars",Music,One Episode,"Jan 18, 2021",Unknown,Other,Music,New
17043,17043,Mahoutsukai no Yome: Nishi no Shounen to Seira...,OVA,Short,"Sep 10, 2021 to ?",Studio Kafka,Manga,"Slice of Life, Magic, Fantasy, Shounen",New
17044,17044,SK∞: Crazy Rock Jam,Special,One Episode,"Mar 14, 2021",Bones,Original,"Comedy, Sports",New


In [146]:
anime_features.Source.unique()

array(['Original', 'Manga', 'Light novel', 'Game', 'Visual novel',
       '4-koma manga', 'Novel', 'Unknown', 'Other', 'Picture book',
       'Web manga', 'Music', 'Radio', 'Book', 'Card game',
       'Digital manga'], dtype=object)

In [147]:
#break studios into lists within a list
studio_breakdown = []

for studio in anime_features.Studios:
    studio_lst = studio.split(', ')
    studio_breakdown.append(studio_lst)

In [148]:
studio_breakdown[0:20]

[['Sunrise'],
 ['Bones'],
 ['Madhouse'],
 ['Sunrise'],
 ['Toei Animation'],
 ['Gallop'],
 ['J.C.Staff'],
 ['Nippon Animation'],
 ['A.C.G.T.'],
 ['Madhouse'],
 ['Studio Pierrot'],
 ['Trans Arts'],
 ['Toei Animation'],
 ['Studio Comet'],
 ['Gonzo'],
 ['Madhouse'],
 ['Gonzo'],
 ['Sunrise'],
 ['Studio Deen'],
 ['Gainax', 'Tatsunoko Production']]

In [149]:
genres_breakdown = []

for genre in anime_features.Genres:
    genre_lst = genre.split(', ')
    genres_breakdown.append(genre_lst)

In [150]:
genres_breakdown[0:5]

[['Action', 'Adventure', 'Comedy', 'Drama', 'Sci-Fi', 'Space'],
 ['Action', 'Drama', 'Mystery', 'Sci-Fi', 'Space'],
 ['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'Drama', 'Shounen'],
 ['Action', 'Mystery', 'Police', 'Supernatural', 'Drama', 'Magic'],
 ['Adventure', 'Fantasy', 'Shounen', 'Supernatural']]

In [151]:
mlb_studio = MultiLabelBinarizer()

studio_breakdown_series = pd.Series(studio_breakdown)

studio_encoded = pd.DataFrame(mlb_studio.fit_transform(studio_breakdown_series),
                   columns=mlb_studio.classes_,
                   index=studio_breakdown_series.index)

studio_encoded.head()

Unnamed: 0,10Gauge,1IN,2:10 AM Animation,33 Collective,3xCube,81 Produce,8bit,A-1 Pictures,A-Line,A-Real,...,foodunited.,helo.inc,iDRAGONS Creative Studio,ixtl,l-a-unch・BOX,monofilmo,pH Studio,production doA,teamKG,ufotable
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [152]:
anime_features = pd.concat([anime_features, studio_encoded], axis=1)

In [153]:
anime_features.head()

Unnamed: 0,new_anime_index,Name,Type,Episodes,Aired,Studios,Source,Genres,Era,10Gauge,...,foodunited.,helo.inc,iDRAGONS Creative Studio,ixtl,l-a-unch・BOX,monofilmo,pH Studio,production doA,teamKG,ufotable
0,0,Cowboy Bebop,TV,6-months,"Apr 3, 1998 to Apr 24, 1999",Sunrise,Original,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Very Old,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Cowboy Bebop: Tengoku no Tobira,Movie,One Episode,"Sep 1, 2001",Bones,Original,"Action, Drama, Mystery, Sci-Fi, Space",Old,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Trigun,TV,6-months,"Apr 1, 1998 to Sep 30, 1998",Madhouse,Manga,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Very Old,0,...,0,0,0,0,0,0,0,0,0,0
3,3,Witch Hunter Robin,TV,6-months,"Jul 2, 2002 to Dec 24, 2002",Sunrise,Original,"Action, Mystery, Police, Supernatural, Drama, ...",Old,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Bouken Ou Beet,TV,Very Long,"Sep 30, 2004 to Sep 29, 2005",Toei Animation,Manga,"Adventure, Fantasy, Shounen, Supernatural",Old,0,...,0,0,0,0,0,0,0,0,0,0


In [154]:
mlb_genre = MultiLabelBinarizer()

genre_breakdown_series = pd.Series(genres_breakdown)

genre_encoded = pd.DataFrame(mlb_genre.fit_transform(genre_breakdown_series),
                   columns=mlb_genre.classes_,
                   index=genre_breakdown_series.index)

#concatenate anime feature dataframe with the genre encoded dataframe
anime_features = pd.concat([anime_features, genre_encoded], axis=1)

In [155]:
cat_variables = anime_features[['Type','Episodes', 'Source', 'Era']]
cat_dummies = pd.get_dummies(cat_variables)

#drop the original variables
anime_features = anime_features.drop(['Type','Episodes', 'Aired', 'Studios', 'Source', 'Era', 'Genres'], axis=1)
anime_features = pd.concat([anime_features, cat_dummies], axis=1)

In [156]:
anime_features.head()

Unnamed: 0,new_anime_index,Name,10Gauge,1IN,2:10 AM Animation,33 Collective,3xCube,81 Produce,8bit,A-1 Pictures,...,Source_Radio,Source_Unknown,Source_Visual novel,Source_Web manga,Era_Modern,Era_New,Era_Old,Era_Recent,Era_Unknown,Era_Very Old
0,0,Cowboy Bebop,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
1,1,Cowboy Bebop: Tengoku no Tobira,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2,2,Trigun,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
3,3,Witch Hunter Robin,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
4,4,Bouken Ou Beet,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False


In [157]:
content_variables = anime_features.drop(['Name', 'new_anime_index'], axis=1)

content_variables.head()

Unnamed: 0,10Gauge,1IN,2:10 AM Animation,33 Collective,3xCube,81 Produce,8bit,A-1 Pictures,A-Line,A-Real,...,Source_Radio,Source_Unknown,Source_Visual novel,Source_Web manga,Era_Modern,Era_New,Era_Old,Era_Recent,Era_Unknown,Era_Very Old
0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
1,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
3,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
4,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False


In [158]:
cosine_sim_content = cosine_similarity(content_variables, content_variables)

cosine_sim_content.shape

(17046, 17046)

In [159]:
indices = pd.Series(anime_features.index, index=anime_features['Name']).drop_duplicates()
indices

Name
Cowboy Bebop                                                    0
Cowboy Bebop: Tengoku no Tobira                                 1
Trigun                                                          2
Witch Hunter Robin                                              3
Bouken Ou Beet                                                  4
                                                            ...  
Kitarou Tanjou: Gegege no Nazo                              17041
The Sun, Moon and Stars                                     17042
Mahoutsukai no Yome: Nishi no Shounen to Seiran no Kishi    17043
SK∞: Crazy Rock Jam                                         17044
Wan Jie Shen Zhu 3rd Season                                 17045
Length: 17046, dtype: int64

In [160]:
def get_recommendations(title, cosine_sim):

    # Get the index of the input anime
    idx = indices[title]

    # Get the similarity scores of all movies with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the animes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar animes, sim_scores[0] would be the anime itself
    sim_scores = sim_scores[1:11]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return anime_features['Name'].iloc[anime_indices]

In [161]:
get_recommendations('Naruto', cosine_sim_content)

243                                                 Bleach
1570                                    Naruto: Shippuuden
131                                            Shaman King
732                                          Dragon Ball Z
4419                                       Dragon Ball Kai
8760                                Dragon Ball Kai (2014)
10590                                    Dragon Ball Super
212                                         Rekka no Honoo
555      Naruto: Takigakure no Shitou - Ore ga Eiyuu Da...
2867                                    Asobot Senki Gokuu
Name: Name, dtype: object

In [57]:
#We got the recommendation system

In [50]:
anime_synopsis = anime[['new_anime_index', 'Name', 'English name', 'Japanese name', 'sypnopsis']]
anime_synopsis.head()

Unnamed: 0,new_anime_index,Name,English name,Japanese name,sypnopsis
0,0,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,"In the year 2071, humanity has colonized sever..."
1,1,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,"other day, another bounty—such is the life of ..."
2,2,Trigun,Trigun,トライガン,"Vash the Stampede is the man with a $$60,000,0..."
3,3,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),ches are individuals with special powers like ...
4,4,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,It is the dark century and the people are suff...


In [51]:
anime_synopsis.shape

(17046, 5)

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
anime_synopsis['sypnopsis'] = anime_synopsis['sypnopsis'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(anime_synopsis['sypnopsis'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime_synopsis['sypnopsis'] = anime_synopsis['sypnopsis'].fillna('')


(17046, 44477)

In [53]:
cosine_sim_synopsis = linear_kernel(tfidf_matrix, tfidf_matrix)

In [55]:
get_recommendations('Noragami', cosine_sim_synopsis)

10540                                    Noragami Aragoto
8316                                         Noragami OVA
10663                                Noragami Aragoto OVA
941                                        Mizuiro (2003)
7731     Saikin, Imouto no Yousu ga Chotto Okashiinda ga.
1176                                         Afro Samurai
5512                                              Hiyokoi
8685                                              Washimo
5894          Sora no Otoshimono: Tokeijikake no Angeloid
7355                          Kamisama no Inai Nichiyoubi
Name: Name, dtype: object

In [63]:
#Now this is based on synposis as previous was based on genre

In [57]:
anime_index_df = anime[['MAL_ID', 'new_anime_index']]
anime_index_df.head()

Unnamed: 0,MAL_ID,new_anime_index
0,1,0
1,5,1
2,6,2
3,7,3
4,8,4


In [58]:
anime_rating = pd.read_csv("rating_complete.csv")
anime_rating.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [59]:
anime_rating = anime_rating.merge(anime_index_df, how='left', left_on='anime_id', right_on='MAL_ID')
anime_rating.head()

Unnamed: 0,user_id,anime_id,rating,MAL_ID,new_anime_index
0,0,430,9,430.0,401.0
1,0,1004,5,1004.0,906.0
2,0,3010,7,3010.0,2739.0
3,0,570,7,570.0,533.0
4,0,2762,9,2762.0,2538.0


In [60]:
anime_rating = anime_rating.drop(['anime_id', 'MAL_ID'], axis=1)
anime_rating.head()

Unnamed: 0,user_id,rating,new_anime_index
0,0,9,401.0
1,0,5,906.0
2,0,7,2739.0
3,0,7,533.0
4,0,9,2538.0


In [63]:
anime_rating_cut = anime_rating[anime_rating['user_id'].isin(list(range(0,6000)))]

In [64]:
anime_rating_cut.info()

<class 'pandas.core.frame.DataFrame'>
Index: 931948 entries, 0 to 931947
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user_id          931948 non-null  int64  
 1   rating           931948 non-null  int64  
 2   new_anime_index  931946 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 28.4 MB


In [65]:
#animes with missing new_anime_index, these animes were removed from the dataframe in the start for having 0 episodes
anime_rating_cut[anime_rating_cut['new_anime_index'].isna()]

Unnamed: 0,user_id,rating,new_anime_index
155111,943,7,
742432,4892,5,


In [66]:
#select the rows where there are no missing value
anime_rating_cut = anime_rating_cut.loc[anime_rating_cut['new_anime_index'].notnull()]
#chagne data type
anime_rating_cut['new_anime_index'] = anime_rating_cut['new_anime_index'].astype(int)

In [67]:
#define rating scale
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(anime_rating_cut[["user_id", "new_anime_index", "rating"]], reader)

In [105]:
anime.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,sypnopsis,new_anime_index,no_of_scores
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"In the year 2071, humanity has colonized sever...",0,641705.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"other day, another bounty—such is the life of ...",1,160349.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"Vash the Stampede is the man with a $$60,000,0...",2,286146.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,ches are individuals with special powers like ...,3,39094.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,It is the dark century and the people are suff...,4,5923.0


In [68]:
trainset, testset = train_test_split(data, test_size=0.25)

In [71]:
models = {"Normal Predictor": NormalPredictor(),
          "Baseline Only": BaselineOnly(),
          "KNNBasic": KNNBasic(),
          "KNNBaseline": KNNBaseline(),
          "SVD": SVD()}

#create dataframe to save results
model_comparison_df = pd.DataFrame(columns=['model_name', 'test_rmse', 'test_mae', 'fit_time', 'test_time'])

#loop over each algorithm
for algo_name, algo in tqdm(models.items()):

    #perform cross validation
    cv_dict = cross_validate(algo, data, cv=3)

    #save results into dataframe
    model_comparison_df = pd.concat([model_comparison_df, pd.DataFrame({'model_name': algo_name,
                                                                    'test_rmse': mean(cv_dict['test_rmse']),
                                                                    'test_mae': mean(cv_dict['test_mae']),
                                                                    'fit_time': mean(cv_dict['fit_time']),
                                                                    'test_time': mean(cv_dict['test_time'])}, index=[0])], ignore_index=True)


  model_comparison_df = pd.concat([model_comparison_df, pd.DataFrame({'model_name': algo_name,
 20%|██        | 1/5 [00:12<00:49, 12.32s/it]

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


 40%|████      | 2/5 [00:29<00:46, 15.46s/it]

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


 60%|██████    | 3/5 [05:52<05:10, 155.47s/it]

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


100%|██████████| 5/5 [12:44<00:00, 152.97s/it]


In [72]:
model_comparison_df

Unnamed: 0,model_name,test_rmse,test_mae,fit_time,test_time
0,Normal Predictor,2.305572,1.825231,0.774533,1.657546
1,Baseline Only,1.249874,0.944297,2.479343,1.868961
2,KNNBasic,1.328454,0.990854,17.851231,88.238242
3,KNNBaseline,1.224248,0.922146,20.806385,102.909499
4,SVD,1.223157,0.919524,8.007857,2.895013


In [73]:
svd_param_grid = {'n_factors': [150, 200],
                  'n_epochs': [20, 30],
                  'lr_all': [0.005],
                  'reg_all': [0.02, 0.05],
                  'biased': [True],
                  'random_state': [42]}

svd_gs = GridSearchCV(SVD, svd_param_grid, measures=["rmse", "mae"], cv=3, n_jobs=-1, joblib_verbose=3)

svd_gs.fit(data)

# best RMSE score
print(svd_gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(svd_gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:  1.6min remaining:   31.6s


1.1906168722585404
{'n_factors': 200, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.05, 'biased': True, 'random_state': 42}


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.8min finished


In [74]:
best_SVD = svd_gs.best_estimator["rmse"]

#train the model with the training dataset
best_SVD.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23924506090>

In [75]:
predictions = best_SVD.test(testset)
predictions

[Prediction(uid=299, iid=2265, r_ui=10.0, est=7.813867252943984, details={'was_impossible': False}),
 Prediction(uid=5982, iid=812, r_ui=8.0, est=8.126719317634976, details={'was_impossible': False}),
 Prediction(uid=3061, iid=8356, r_ui=7.0, est=7.134536122488264, details={'was_impossible': False}),
 Prediction(uid=4831, iid=13053, r_ui=9.0, est=8.555011030191663, details={'was_impossible': False}),
 Prediction(uid=5354, iid=199, r_ui=10.0, est=9.203274249027295, details={'was_impossible': False}),
 Prediction(uid=54, iid=12461, r_ui=8.0, est=8.963719377579995, details={'was_impossible': False}),
 Prediction(uid=2292, iid=6728, r_ui=9.0, est=8.378932542838447, details={'was_impossible': False}),
 Prediction(uid=2189, iid=790, r_ui=8.0, est=7.810212414322203, details={'was_impossible': False}),
 Prediction(uid=1967, iid=7271, r_ui=4.0, est=4.704274929203445, details={'was_impossible': False}),
 Prediction(uid=5874, iid=6393, r_ui=7.0, est=6.434067695283032, details={'was_impossible': F

In [109]:
anime.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,sypnopsis,new_anime_index,no_of_scores
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"In the year 2071, humanity has colonized sever...",0,641705.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"other day, another bounty—such is the life of ...",1,160349.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"Vash the Stampede is the man with a $$60,000,0...",2,286146.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,ches are individuals with special powers like ...,3,39094.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,It is the dark century and the people are suff...,4,5923.0


In [108]:
import pickle

In [110]:
pickle.dump(anime_features, open('anime.pkl', 'wb'))

In [111]:
pickle.dump(cosine_similarity, open('cosine_sim.pkl', 'wb'))

In [112]:
with open('cosine_sim_content.pkl', 'wb') as f:
    pickle.dump(cosine_sim_content, f)

In [113]:
indices = pd.Series(anime_features.index, index=anime_features['Name']).drop_duplicates()
with open('indices.pkl', 'wb') as f:
    pickle.dump(indices, f)

In [162]:
anime_features.head()

Unnamed: 0,new_anime_index,Name,10Gauge,1IN,2:10 AM Animation,33 Collective,3xCube,81 Produce,8bit,A-1 Pictures,...,Source_Radio,Source_Unknown,Source_Visual novel,Source_Web manga,Era_Modern,Era_New,Era_Old,Era_Recent,Era_Unknown,Era_Very Old
0,0,Cowboy Bebop,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
1,1,Cowboy Bebop: Tengoku no Tobira,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2,2,Trigun,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
3,3,Witch Hunter Robin,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
4,4,Bouken Ou Beet,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False


In [163]:
pickle.dump(anime_features, open('anime_features.pkl', 'wb'))

In [164]:
anime_features.head()

Unnamed: 0,new_anime_index,Name,10Gauge,1IN,2:10 AM Animation,33 Collective,3xCube,81 Produce,8bit,A-1 Pictures,...,Source_Radio,Source_Unknown,Source_Visual novel,Source_Web manga,Era_Modern,Era_New,Era_Old,Era_Recent,Era_Unknown,Era_Very Old
0,0,Cowboy Bebop,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
1,1,Cowboy Bebop: Tengoku no Tobira,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2,2,Trigun,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
3,3,Witch Hunter Robin,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
4,4,Bouken Ou Beet,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False


In [165]:
anime_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17046 entries, 0 to 17045
Columns: 790 entries, new_anime_index to Era_Very Old
dtypes: bool(36), int32(752), int64(1), object(1)
memory usage: 49.7+ MB
