In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

In [48]:
movies = pd.read_csv('movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [49]:
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres']

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [50]:
ratings = pd.read_csv('ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [4]:
links = pd.read_csv('links.csv')
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [7]:
links.isna().sum()

movieId    0
imdbId     0
tmdbId     8
dtype: int64

In [22]:
links.fillna(0, inplace=True)
links.isna().sum()

movieId    0
imdbId     0
tmdbId     0
dtype: int64

In [29]:
tags = pd.read_csv('tags.csv')
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [30]:
tags['tag'] = tags['tag'].apply(lambda x: x.lower())
tags['tag']

0                  funny
1        highly quotable
2           will ferrell
3           boxing story
4                    mma
              ...       
3678           for katie
3679             austere
3680              gun fu
3681    heroic bloodshed
3682    heroic bloodshed
Name: tag, Length: 3683, dtype: object

In [34]:
result_df = pd.merge(ratings, movies, on='movieId', how='inner')
result_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,5,1,4.0,847434962,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,7,1,4.5,1106635946,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
3,15,1,2.5,1510577970,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
4,17,1,4.5,1305696483,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),"[Action, Thriller]"
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),"[Action, Crime, Drama]"
100833,610,160836,3.0,1493844794,Hazard (2005),"[Action, Drama, Thriller]"
100834,610,163937,3.5,1493848789,Blair Witch (2016),"[Horror, Thriller]"


In [56]:
from surprise import Dataset
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader

In [42]:
# from sklearn.neighbors import NearestNeighbors
# nn_algo = NearestNeighbors(metric='cosine')
# nn_algo.fit(rating_pivot)

In [61]:
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
train_df, test_df = train_test_split(data, test_size=0.25, random_state=42)

In [64]:
knn = KNNBasic()
knn.fit(train_df)
knn_pred = knn.test(test_df)
for pred in knn_pred[:10]:
    print(pred)
print('RMSE:', accuracy.rmse(knn_pred))

Computing the msd similarity matrix...
Done computing similarity matrix.
user: 50         item: 4282       r_ui = 3.50   est = 3.50   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 603        item: 2993       r_ui = 3.00   est = 3.45   {'actual_k': 14, 'was_impossible': False}
user: 140        item: 11         r_ui = 4.00   est = 3.78   {'actual_k': 40, 'was_impossible': False}
user: 262        item: 497        r_ui = 4.00   est = 3.85   {'actual_k': 30, 'was_impossible': False}
user: 492        item: 1363       r_ui = 4.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 117        item: 282        r_ui = 3.00   est = 3.17   {'actual_k': 24, 'was_impossible': False}
user: 136        item: 185        r_ui = 3.00   est = 3.01   {'actual_k': 40, 'was_impossible': False}
user: 232        item: 58293      r_ui = 2.00   est = 2.76   {'actual_k': 14, 'was_impossible': False}
user: 288        item: 2420       r_ui = 3.00   est = 3.61   {'actual_k': 28, '

In [66]:
knnWM = KNNWithMeans()
knnWM.fit(train_df)
knnWM_pred = knnWM.test(test_df)
for pred in knnWM_pred[:10]:
    print(pred)
print('RMSE:', accuracy.rmse(knnWM_pred))

Computing the msd similarity matrix...
Done computing similarity matrix.
user: 50         item: 4282       r_ui = 3.50   est = 3.50   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 603        item: 2993       r_ui = 3.00   est = 3.57   {'actual_k': 14, 'was_impossible': False}
user: 140        item: 11         r_ui = 4.00   est = 3.67   {'actual_k': 40, 'was_impossible': False}
user: 262        item: 497        r_ui = 4.00   est = 3.58   {'actual_k': 30, 'was_impossible': False}
user: 492        item: 1363       r_ui = 4.00   est = 4.04   {'actual_k': 1, 'was_impossible': False}
user: 117        item: 282        r_ui = 3.00   est = 3.08   {'actual_k': 24, 'was_impossible': False}
user: 136        item: 185        r_ui = 3.00   est = 2.78   {'actual_k': 40, 'was_impossible': False}
user: 232        item: 58293      r_ui = 2.00   est = 2.46   {'actual_k': 14, 'was_impossible': False}
user: 288        item: 2420       r_ui = 3.00   est = 3.27   {'actual_k': 28, '

In [65]:
svd = SVD()
svd.fit(train_df)
svd_pred = svd.test(test_df)
for pred in svd_pred[:10]:
    print(pred)
print('RMSE:', accuracy.rmse(svd_pred))

user: 50         item: 4282       r_ui = 3.50   est = 2.68   {'was_impossible': False}
user: 603        item: 2993       r_ui = 3.00   est = 3.27   {'was_impossible': False}
user: 140        item: 11         r_ui = 4.00   est = 3.83   {'was_impossible': False}
user: 262        item: 497        r_ui = 4.00   est = 3.49   {'was_impossible': False}
user: 492        item: 1363       r_ui = 4.00   est = 4.15   {'was_impossible': False}
user: 117        item: 282        r_ui = 3.00   est = 3.11   {'was_impossible': False}
user: 136        item: 185        r_ui = 3.00   est = 3.19   {'was_impossible': False}
user: 232        item: 58293      r_ui = 2.00   est = 2.89   {'was_impossible': False}
user: 288        item: 2420       r_ui = 3.00   est = 3.28   {'was_impossible': False}
user: 357        item: 2968       r_ui = 3.00   est = 4.06   {'was_impossible': False}
RMSE: 0.8823
RMSE: 0.8822733049571
