In [44]:
import pandas as pd
from math import sqrt
import numpy as np

In [45]:
anime_df = pd.read_csv('anime.csv')
rating_df = pd.read_csv('rating.csv')
print(anime_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None


In [46]:
userInput = [{'name':'Yosuga no Sora: In Solitude, Where We Are Least Alone.', 'rating':6.72},
             {'name':'Boku no Pico', 'rating':5.13},
             {'name':'Ore no Imouto ga Konnani Kawaii Wake ga Nai', 'rating':7.49},
             {'name':'Code Geass: Hangyaku no Lelouch R2', 'rating':8.98},
             {'name':'Initial D Final Stage', 'rating':8.29}]
inputAnimes = pd.DataFrame(userInput)
print(inputAnimes)

                                                name  rating
0  Yosuga no Sora: In Solitude, Where We Are Leas...    6.72
1                                       Boku no Pico    5.13
2        Ore no Imouto ga Konnani Kawaii Wake ga Nai    7.49
3                 Code Geass: Hangyaku no Lelouch R2    8.98
4                              Initial D Final Stage    8.29


In [47]:
inputId = anime_df[anime_df['name'].isin(inputAnimes['name'].tolist())]
inputAnimes = pd.merge(inputId, inputAnimes)
#inputAnimes = inputAnimes.drop('members') #we don't really need this at the moment
inputAnimes = inputAnimes[['anime_id','name','rating']]
print(inputAnimes)

   anime_id                                               name  rating
0      2904                 Code Geass: Hangyaku no Lelouch R2    8.98
1     22507                              Initial D Final Stage    8.29
2      8769        Ore no Imouto ga Konnani Kawaii Wake ga Nai    7.49
3      8861  Yosuga no Sora: In Solitude, Where We Are Leas...    6.72
4      1639                                       Boku no Pico    5.13


In [48]:
userSubset = rating_df[rating_df['anime_id'].isin(inputAnimes['anime_id'].tolist())]
print(userSubset.groupby('anime_id').count())

          user_id  rating
anime_id                 
1639         2475    2475
2904        24242   24242
8769        13825   13825
8861         6687    6687
22507         646     646


In [49]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['user_id'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[((8094,),         user_id  anime_id  rating
889958     8094      1639      10
890033     8094      2904       9
890367     8094      8769       5
890371     8094      8861       6
890747     8094     22507       8), ((10283,),          user_id  anime_id  rating
1065794    10283      1639      -1
1065923    10283      2904      -1
1066300    10283      8769      -1
1066306    10283      8861      -1
1066868    10283     22507      -1), ((10654,),          user_id  anime_id  rating
1110889    10654      1639       5
1110948    10654      2904       6
1111160    10654      8769       5
1111163    10654      8861       6
1111427    10654     22507       7), ((11780,),          user_id  anime_id  rating
1248220    11780      1639       1
1248235    11780      2904      10
1248306    11780      8769       6
1248309    11780      8861       7
1248452    11780     22507       8), ((23496,),          user_id  anime_id  rating
2467456    23496      1639       6
2467518    23496      2904       

In [50]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='anime_id')
    inputAnimes = inputAnimes.sort_values(by='anime_id')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputAnimes[inputAnimes['anime_id'].isin(group['anime_id'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [51]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex   user_id
0        -0.163920   (8094,)
1         0.000000  (10283,)
2         0.599984  (10654,)
3         0.944598  (11780,)
4         0.647309  (23496,)


In [52]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex   user_id
41         0.999551   (7297,)
87         0.984858  (22434,)
17         0.984858    (958,)
64         0.980610  (16572,)
13         0.965954  (66390,)


In [53]:


# Datatypes were not compatable for doing ".merge", so i had to change the type
topUsers['user_id'] = topUsers['user_id'].astype(str).str.extract('(\d+)').astype('int64')

topUsersRating = pd.merge(topUsers, rating_df, on='user_id', how='inner')
print(topUsersRating.head(100))

    similarityIndex  user_id  anime_id  rating
0          0.999551     7297         1       9
1          0.999551     7297         5       9
2          0.999551     7297         6       8
3          0.999551     7297        16       8
4          0.999551     7297        18      10
..              ...      ...       ...     ...
95         0.999551     7297       355       7
96         0.999551     7297       356       8
97         0.999551     7297       357       5
98         0.999551     7297       376       8
99         0.999551     7297       379       9

[100 rows x 4 columns]


In [54]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  user_id  anime_id  rating  weightedRating
0         0.999551     7297         1       9        8.995961
1         0.999551     7297         5       9        8.995961
2         0.999551     7297         6       8        7.996410
3         0.999551     7297        16       8        7.996410
4         0.999551     7297        18      10        9.995513


In [55]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('anime_id').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

          sum_similarityIndex  sum_weightedRating
anime_id                                         
1                   22.752870          184.359253
5                   13.365848           96.172147
6                   21.934254          170.967695
7                    6.122495           36.624501
8                    2.629749           16.680777


In [56]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['anime_id'] = tempTopUsersRating.index
print(recommendation_df.head(10))

          weighted average recommendation score  anime_id
anime_id                                                 
1                                      8.102681         1
5                                      7.195364         5
6                                      7.794553         6
7                                      5.981957         7
8                                      6.343106         8
15                                     8.502588        15
16                                     7.790033        16
17                                     8.317239        17
18                                     8.688118        18
19                                     9.092398        19


In [57]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

          weighted average recommendation score  anime_id
anime_id                                                 
32547                                      10.0     32547
27881                                      10.0     27881
2269                                       10.0      2269
7578                                       10.0      7578
7416                                       10.0      7416
...                                         ...       ...
17823                                      -1.0     17823
31927                                      -1.0     31927
11969                                      -1.0     11969
556                                        -1.0       556
2571                                       -1.0      2571

[6093 rows x 2 columns]


In [58]:
recommended_anime=anime_df.loc[anime_df['anime_id'].isin(recommendation_df['anime_id'])]

#we don't want to recommend the same movie
recommended_anime=recommended_anime.loc[~recommended_anime.anime_id.isin(userSubset['anime_id'])]

print(recommended_anime)

       anime_id                              name  \
0         32281                    Kimi no Na wa.   
1          5114  Fullmetal Alchemist: Brotherhood   
2         28977                          Gintama°   
3          9253                       Steins;Gate   
4          9969                     Gintama&#039;   
...         ...                               ...   
12221      4369        Kunoichi Gakuen Ninpouchou   
12241      4833             Esper Bishoujo Manami   
12242      6025                       Battle Can²   
12248      5569                      Tsui no Sora   
12262     17823                        Ikenai Boy   

                                                   genre   type episodes  \
0                   Drama, Romance, School, Supernatural  Movie        1   
1      Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64   
2      Action, Comedy, Historical, Parody, Samurai, S...     TV       51   
3                                       Sci-Fi, Thriller   