## Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

## Importing the Whole Dataset

In [2]:
links = pd.read_csv('links.csv')

In [3]:
ratings = pd.read_csv('ratings.csv')

In [4]:
tags = pd.read_csv('tags.csv')

In [5]:
movies = pd.read_csv('movies.csv')

## User-Genre Matrix

#### All Genres in the Dataset

In [6]:
genres = []
for i in movies.index:
    genres.extend(movies['genres'][i].split('|'))
genres = list(set(genres))
genres.remove('(no genres listed)')

In [7]:
len(genres)

19

In [8]:
users = list(ratings['userId'].unique())

In [9]:
usergenre_matrix = pd.DataFrame(columns = genres, index = users)

In [10]:
movies_temp = movies.copy()
for i in range(9742):
    movies_temp['genres'].iloc[i] = (movies_temp['genres'].iloc[i]).split('|')
movies_temp = movies_temp.explode('genres')
movies_temp.index = np.arange(0,len(movies_temp))
movies_temp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
1,1,Toy Story (1995),Animation
2,1,Toy Story (1995),Children
3,1,Toy Story (1995),Comedy
4,1,Toy Story (1995),Fantasy
...,...,...,...
22079,193583,No Game No Life: Zero (2017),Fantasy
22080,193585,Flint (2017),Drama
22081,193587,Bungo Stray Dogs: Dead Apple (2018),Action
22082,193587,Bungo Stray Dogs: Dead Apple (2018),Animation


In [11]:
result = pd.merge(movies_temp, ratings, left_on = 'movieId', right_on = 'movieId')
result

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure,1,4.0,964982703
1,1,Toy Story (1995),Adventure,5,4.0,847434962
2,1,Toy Story (1995),Adventure,7,4.5,1106635946
3,1,Toy Story (1995),Adventure,15,2.5,1510577970
4,1,Toy Story (1995),Adventure,17,4.5,1305696483
...,...,...,...,...,...,...
274475,193583,No Game No Life: Zero (2017),Fantasy,184,3.5,1537109545
274476,193585,Flint (2017),Drama,184,3.5,1537109805
274477,193587,Bungo Stray Dogs: Dead Apple (2018),Action,184,3.5,1537110021
274478,193587,Bungo Stray Dogs: Dead Apple (2018),Animation,184,3.5,1537110021


In [12]:
x = result.groupby(['userId', 'genres']).mean()
x = x.drop(columns = ['movieId', 'timestamp'])
x

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,genres,Unnamed: 2_level_1
1,Action,4.322222
1,Adventure,4.388235
1,Animation,4.689655
1,Children,4.547619
1,Comedy,4.277108
...,...,...
610,Romance,3.731092
610,Sci-Fi,3.659363
610,Thriller,3.573529
610,War,3.776596


In [13]:
for i in genres:
    for j in users:
        try:
            usergenre_matrix[i][j] = x.loc[j, i][0]
        except:
            usergenre_matrix[i][j] = 0.0

In [14]:
usergenre_matrix

Unnamed: 0,Musical,Sci-Fi,Film-Noir,War,Children,Western,Thriller,Mystery,Adventure,Romance,IMAX,Drama,Comedy,Fantasy,Action,Horror,Documentary,Crime,Animation
1,4.68182,4.225,5,4.5,4.54762,4.28571,4.14545,4.16667,4.38824,4.30769,0,4.52941,4.27711,4.29787,4.32222,3.47059,0,4.35556,4.68966
2,0,3.875,0,4.5,0,3.5,3.7,4,4.16667,4.5,3.75,3.88235,4,0,3.95455,3,4.33333,3.8,0
3,0.5,4.2,0,0.5,0.5,0,4.14286,5,2.72727,0.5,0,0.75,1,3.375,3.57143,4.6875,0,0.5,0.5
4,4,2.83333,4,3.57143,3.8,3.8,3.55263,3.47826,3.65517,3.37931,3,3.48333,3.50962,3.68421,3.32,4.25,4,3.81481,4
5,4.4,2.5,0,3.33333,4.11111,3,3.55556,4,3.25,3.09091,3.66667,3.8,3.46667,4.14286,3.11111,3,0,3.83333,4.33333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.72727,3.55696,3.8125,3.79231,3.44898,3.41176,3.52513,3.79121,3.5034,3.74085,3.0625,3.78797,3.56532,3.59794,3.17881,3.34615,3.8,3.65414,3.71429
607,3.6,3.25,0,4.16667,3.42105,4,4.11475,4.64706,3.46667,3.51724,5,4.0122,3.32727,3.57143,3.72222,4.11429,0,3.81481,3.33333
608,2.75758,3.29641,3.75,3.57895,2.46023,2.63636,3.53668,3.55072,3.22099,2.88679,4,3.4375,2.73662,3,3.33032,3.31959,3,3.61301,3.11818
609,0,3,0,3.5,3,4,3.28571,0,3.2,3.2,3,3.36842,3.28571,3,3.09091,3.5,3,3.5,3


##### Normalize the User-Genre Matrix

In [15]:
usergenre_matrix=(usergenre_matrix-usergenre_matrix.min())/(usergenre_matrix.max()-usergenre_matrix.min())

In [16]:
usergenre_matrix

Unnamed: 0,Musical,Sci-Fi,Film-Noir,War,Children,Western,Thriller,Mystery,Adventure,Romance,IMAX,Drama,Comedy,Fantasy,Action,Horror,Documentary,Crime,Animation
1,0.936364,0.845,1,0.9,0.909524,0.857143,0.829091,0.833333,0.877647,0.861538,0,0.889273,0.855422,0.859574,0.864444,0.694118,0,0.871111,0.937931
2,0,0.775,0,0.9,0,0.7,0.74,0.8,0.833333,0.9,0.75,0.737024,0.8,0,0.790909,0.6,0.866667,0.76,0
3,0.1,0.84,0,0.1,0.1,0,0.828571,1,0.545455,0.1,0,0,0.2,0.675,0.714286,0.9375,0,0.1,0.1
4,0.8,0.566667,0.8,0.714286,0.76,0.76,0.710526,0.695652,0.731034,0.675862,0.6,0.643137,0.701923,0.736842,0.664,0.85,0.8,0.762963,0.8
5,0.88,0.5,0,0.666667,0.822222,0.6,0.711111,0.8,0.65,0.618182,0.733333,0.717647,0.693333,0.828571,0.622222,0.6,0,0.766667,0.866667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.745455,0.711392,0.7625,0.758462,0.689796,0.682353,0.705025,0.758242,0.70068,0.748169,0.6125,0.714815,0.713064,0.719588,0.635762,0.669231,0.76,0.730827,0.742857
607,0.72,0.65,0,0.833333,0.684211,0.8,0.822951,0.929412,0.693333,0.703448,1,0.767575,0.665455,0.714286,0.744444,0.822857,0,0.762963,0.666667
608,0.551515,0.659281,0.75,0.715789,0.492045,0.527273,0.707336,0.710145,0.644199,0.577358,0.8,0.632353,0.547324,0.6,0.666065,0.663918,0.6,0.722603,0.623636
609,0,0.6,0,0.7,0.6,0.8,0.657143,0,0.64,0.64,0.6,0.616099,0.657143,0.6,0.618182,0.7,0.6,0.7,0.6


## Movie Co-occurance Matrix (With Jaccard Similarity)

In [17]:
movie_matrix = pd.DataFrame(columns = movies.movieId, index = movies.movieId)

In [18]:
dff = ratings.copy()
dff =  ratings.groupby(['movieId'])['userId'].unique()
dff

movieId
1         [1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...
2         [6, 8, 18, 19, 20, 21, 27, 51, 62, 68, 82, 91,...
3         [1, 6, 19, 32, 42, 43, 44, 51, 58, 64, 68, 91,...
4                           [6, 14, 84, 162, 262, 411, 600]
5         [6, 31, 43, 45, 58, 66, 68, 84, 103, 107, 111,...
                                ...                        
193581                                                [184]
193583                                                [184]
193585                                                [184]
193587                                                [184]
193609                                                [331]
Name: userId, Length: 9724, dtype: object

In [19]:
'''for i in movie_matrix.index:
    for j in movie_matrix.index:
        try:
            print(i, j)
            movie_matrix[str(i)][j] = round(len(set(dff[i]).intersection(set(dff[j])))/len(set(dff[i]).union(set(dff[j]))), 2)
        except KeyError:
            movie_matrix[str(i)][j] = 0.0'''

'for i in movie_matrix.index:\n    for j in movie_matrix.index:\n        try:\n            print(i, j)\n            movie_matrix[str(i)][j] = round(len(set(dff[i]).intersection(set(dff[j])))/len(set(dff[i]).union(set(dff[j]))), 2)\n        except KeyError:\n            movie_matrix[str(i)][j] = 0.0'

In [20]:
movie_matrix = pd.read_csv('movie_matrix_final.csv')

In [21]:
movie_matrix.set_index(movies.movieId, inplace = True)

In [22]:
movie_matrix

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,0,0,0,0,0,0,0,0,0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193583,0,0,0,0,0,0,0,0,0,0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193585,0,0,0,0,0,0,0,0,0,0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193587,0,0,0,0,0,0,0,0,0,0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## Movie-Genre Matrix

In [23]:
movies_temp = movies.copy()

In [24]:
movies_temp = movies_temp.set_index('movieId')

In [25]:
movies_temp

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [26]:
movie_genre = pd.DataFrame(columns = genres, index = movies.movieId)

In [27]:
movie_genre[genres] = 0

In [28]:
for i in movies.movieId:
    for j in movies_temp['genres'][i].split('|'):
        try:
            movie_genre[j][i] = 1
        except KeyError:
            pass

In [29]:
movie_genre

Unnamed: 0_level_0,Musical,Sci-Fi,Film-Noir,War,Children,Western,Thriller,Mystery,Adventure,Romance,IMAX,Drama,Comedy,Fantasy,Action,Horror,Documentary,Crime,Animation
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1
2,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1
193583,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1
193585,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
193587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


##### Predicting Top 10 Movies For each Users

In [30]:
user_movie_pred = {}
for i in users:
    user_movie_pred[i] = list(pd.DataFrame(map(float, movie_genre.dot(usergenre_matrix.loc[i]))).nlargest(columns = 0, n = 10).index)

In [31]:
user_movie_pred

{1: [7441, 5556, 1390, 6626, 2250, 3460, 6462, 7170, 4631, 6145],
 2: [7441, 7372, 5476, 5774, 3608, 7170, 400, 6358, 3460, 7770],
 3: [5802, 6145, 5980, 7441, 5593, 2354, 4631, 6462, 2869, 1972],
 4: [7441, 5556, 6626, 2250, 1390, 7467, 3460, 5819, 6462, 6145],
 5: [7441, 7467, 2250, 6626, 1390, 5819, 3460, 5556, 6462, 4631],
 6: [7441, 5556, 7467, 7550, 5819, 6626, 1390, 6462, 7372, 2250],
 7: [7441, 5556, 6145, 2250, 6462, 6626, 1390, 4631, 3460, 7550],
 8: [7441, 7467, 1390, 6626, 2250, 7550, 5819, 5556, 7372, 3460],
 9: [7441, 2250, 5556, 6626, 6462, 5819, 7467, 4631, 1390, 7550],
 10: [6626, 7467, 1390, 7441, 7550, 5819, 2250, 3460, 6462, 7170],
 11: [7441, 7372, 6145, 7170, 4631, 6797, 5980, 4843, 5673, 5476],
 12: [7170, 3460, 5476, 4631, 3608, 9394, 8597, 6267, 9358, 400],
 13: [7441, 6145, 7170, 4631, 5980, 5802, 7770, 167, 5556, 7372],
 14: [7441, 7372, 6145, 6797, 5980, 4631, 7170, 5203, 5673, 167],
 15: [7441, 7372, 3460, 6145, 7170, 5556, 4631, 6462, 1390, 7550],
 16: [74

In [32]:
pd.DataFrame(map(float, movie_genre.dot(usergenre_matrix.loc[1]))).nlargest(columns = 0, n = 10)

Unnamed: 0,0
7441,8.571582
5556,7.070184
1390,6.267699
6626,6.238
2250,6.144543
3460,6.108811
6462,6.073352
7170,6.026569
4631,5.989341
6145,5.814834


In [33]:
max(list(movie_genre.dot(usergenre_matrix.loc[1])))

8.57158240415338