In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
movies = pd.read_csv('dataset/movies.csv')
movie_ratings = pd.read_csv('dataset/ratings.csv')
tags = pd.read_csv('dataset/tags.csv')

In [34]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [35]:
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [37]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [38]:
movies['genres'] = movies['genres'].str.replace('|',' ')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [39]:
len(movies.movieId.unique())

9742

In [40]:
len(movie_ratings.movieId.unique())

9724

In [41]:
ratings_f = movie_ratings.groupby('userId').filter(lambda x: len(x) >= 25)

movie_list_rating = ratings_f.movieId.unique().tolist()

In [42]:
movies = movies[movies.movieId.isin(movie_list_rating)]
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [43]:
Mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))

In [44]:
tags.drop(['timestamp'],1, inplace = True)
ratings_f.drop(['timestamp'],1, inplace = True)

In [46]:
mixed = pd.merge(movies, tags, on = 'movieId', how = 'left')
mixed.head()

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game


In [58]:
mixed.fillna("", inplace = True)
mixed = pd.DataFrame(mixed.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))

Final = pd.merge(movies, mixed, on ='movieId', how = 'left')
Final ['metadata'] = Final[['tag', 'genres']].apply(lambda x: "%s" % ' '.join(x), axis = 1)
Final [['movieId', 'title', 'metadata']].head()

Unnamed: 0,movieId,title,metadata
0,1,Toy Story (1995),pixar pixar fun Adventure Animation Children C...
1,2,Jumanji (1995),fantasy magic board game Robin Williams game A...
2,3,Grumpier Old Men (1995),moldy old Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),pregnancy remake Comedy


In [60]:
Final.shape

(9710, 5)

In [74]:
Final.loc[0, "metadata"]

'pixar pixar fun Adventure Animation Children Comedy Fantasy'

In [68]:
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(Final['metadata'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index = Final.index.tolist())
tfidf_df.shape

(9710, 1675)

In [75]:
svd = TruncatedSVD(n_components = 200)
latent_matrix = svd.fit_transform(tfidf_df)

In [90]:
n = 200
latent_matrix_1_df = pd.DataFrame(latent_matrix[:,0:n], index=Final.title.tolist())
latent_matrix_1_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
Toy Story (1995),0.102158,0.083047,0.069289,0.083118,-0.156903,0.011974,0.235969,-0.009242,-0.121719,-0.007529,...,0.197122,-0.136922,-0.004078,0.066743,0.065500,0.085366,-0.032406,-0.016378,-0.007165,-0.114275
Jumanji (1995),0.032512,0.008901,0.058912,0.085837,-0.111296,0.007452,0.189158,-0.006138,-0.074941,0.002341,...,-0.001020,-0.002153,0.012328,0.006079,-0.021275,-0.002568,0.034804,0.012707,-0.003943,-0.023319
Grumpier Old Men (1995),0.147929,0.105287,-0.042869,0.127408,0.091656,-0.037793,-0.011973,0.000654,0.004599,-0.002348,...,0.008500,0.013643,-0.006800,-0.014172,-0.006000,-0.004957,0.015421,0.017552,-0.018041,-0.004478
Waiting to Exhale (1995),0.825540,0.065848,-0.291994,0.391761,0.252713,-0.076026,-0.046193,-0.000831,0.027587,-0.000996,...,-0.000171,0.000127,-0.000294,-0.000040,-0.000477,-0.000022,0.000071,-0.000044,-0.000065,-0.000391
Father of the Bride Part II (1995),0.112902,0.121176,-0.006429,-0.037674,-0.005234,0.010013,-0.009192,-0.002280,0.003953,-0.011451,...,-0.002695,0.001568,0.022343,0.013517,0.017488,0.006365,-0.003982,-0.008544,-0.007223,-0.011605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.329171,0.223252,0.299166,0.141833,-0.339033,-0.097167,0.313752,-0.015254,0.045440,0.079517,...,-0.000226,0.000087,0.000228,-0.000378,-0.000018,-0.000651,0.000013,-0.000275,0.000095,0.000156
No Game No Life: Zero (2017),0.296554,0.276316,0.097337,0.098611,-0.234767,0.058731,0.403508,-0.017741,-0.239908,0.016441,...,-0.000226,0.000067,0.000331,-0.000375,-0.000104,-0.000565,0.000169,-0.000535,0.000109,0.000273
Flint (2017),0.634674,-0.666189,-0.297193,-0.136463,-0.160514,0.126531,-0.007345,-0.006789,0.023932,0.015826,...,-0.000087,-0.000039,-0.000194,0.000039,-0.000028,0.000135,0.000150,0.000077,-0.000046,0.000107
Bungo Stray Dogs: Dead Apple (2018),0.113440,-0.011129,0.338671,0.159746,-0.320756,-0.183555,0.176042,-0.006810,0.142285,0.098964,...,-0.000312,0.000226,-0.000178,-0.000094,-0.000249,-0.000833,0.000104,0.000081,-0.000007,0.000225


In [82]:
#collaborative filtering
ratings_f.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [88]:
ratings_f1 = pd.merge(movies[['movieId']], rating_f, on = "movieId", how = "right")
ratings_f2 = ratings_f1.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
ratings_f2.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
svd = TruncatedSVD(n_components = 200)
latent_matrix_2 = svd.fit_transform(ratings_f2)

In [91]:
latent_matrix_2_df = pd.DataFrame(latent_matrix_2, index=Final.title.tolist())
latent_matrix_2_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
Toy Story (1995),37.585275,-6.155013,15.178465,0.648999,-2.155746,4.309913,8.827334,-2.007243,1.225567,1.420622,...,0.053744,0.557751,-2.867201,2.370011,1.241344,3.147764,-1.544856,1.335073,-0.204222,1.338050
Jumanji (1995),20.597430,-0.377713,11.171060,-8.412376,-3.262701,-1.017596,4.342062,-4.025292,-3.277967,0.285509,...,0.898516,-0.035493,0.073464,0.433294,-0.965159,-0.916174,-1.561707,0.666125,-2.721541,0.170480
Grumpier Old Men (1995),8.491590,-5.660520,3.662447,-6.338464,-0.065760,-0.958899,-1.445177,-0.497552,3.263445,1.925484,...,0.635624,-0.170522,-0.079647,-0.277091,1.124030,-0.460307,1.021475,-0.082483,1.320864,1.439337
Waiting to Exhale (1995),0.465413,-0.818077,0.971509,-0.218960,-0.719577,-0.944040,0.205715,-0.548523,0.097428,0.028577,...,0.231586,-0.119650,-0.021619,-0.012028,-0.037836,0.043073,-0.066907,0.042441,-0.243716,0.024605
Father of the Bride Part II (1995),6.042813,-2.574425,4.788804,-4.298315,-3.807926,-1.995422,1.238832,-1.358939,1.353073,1.267100,...,0.519129,0.532371,0.215535,-0.362412,-0.350177,0.181602,-0.849322,0.267871,0.783636,0.354465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.039574,0.158111,-0.023642,0.009110,0.029973,-0.164117,0.181472,0.071945,0.005506,-0.047007,...,-0.498326,0.447624,0.630257,0.133991,-0.134691,-0.111665,0.299534,-0.133751,-0.172619,0.049423
No Game No Life: Zero (2017),0.034627,0.138347,-0.020686,0.007971,0.026226,-0.143602,0.158788,0.062952,0.004818,-0.041132,...,-0.436035,0.391671,0.551475,0.117242,-0.117855,-0.097707,0.262092,-0.117032,-0.151041,0.043245
Flint (2017),0.034627,0.138347,-0.020686,0.007971,0.026226,-0.143602,0.158788,0.062952,0.004818,-0.041132,...,-0.436035,0.391671,0.551475,0.117242,-0.117855,-0.097707,0.262092,-0.117032,-0.151041,0.043245
Bungo Stray Dogs: Dead Apple (2018),0.034627,0.138347,-0.020686,0.007971,0.026226,-0.143602,0.158788,0.062952,0.004818,-0.041132,...,-0.436035,0.391671,0.551475,0.117242,-0.117855,-0.097707,0.262092,-0.117032,-0.151041,0.043245


In [106]:
a_1 = np.array(latent_matrix_1_df.loc['Toy Story (1995)']).reshape(1, -1)
a_2 = np.array(latent_matrix_2_df.loc['Toy Story (1995)']).reshape(1,-1)

score_1 = cosine_similarity(latent_matrix_1_df, a_1).reshape(1, -1)
score_2 = cosine_similarity(latent_matrix_2_df, a_2).reshape(1, -1)

hybrid = ((score_1 + score_2)/2.0)

dictDf = {'content': score_1, 'collaborative': score_2, 'hybrid':hybrid}
similar = pd.DataFrame(dictDf, index = latent_matrix_1_df.index)

Exception: Data must be 1-dimensional