# Data source: MovieLens 20M dataset (GroupLens) — https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset

In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from sklearn.neighbors import NearestNeighbors

from pandas.api.types import CategoricalDtype

# **Preparing dataset**

In [2]:
# Load the two dataset tables; both CSV files are read from the working directory.
# movies is read first, then ratings.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ('movie.csv', 'rating.csv'))

In [3]:
# (rows, columns) of the ratings table, followed by (rows, columns) of the movies table
ratings.shape, movies.shape

((20000263, 4), (27278, 3))

In [4]:
# Print the column labels of both tables: movies first, then ratings.
for frame in (movies, ratings):
    print(frame.columns)

Index(['movieId', 'title', 'genres'], dtype='object')
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [5]:
# Keep only the columns the recommender uses: drop 'genres' from movies and
# 'timestamp' from ratings.
# errors='ignore' makes this cell safe to re-run: the frames are reassigned in
# place, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
movies = movies.drop(columns='genres', errors='ignore')
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [6]:
# Build the movie x user rating table as a sparse DataFrame:
# rows = movieId, columns = userId, values = rating.
# (movie, user) pairs without a rating are simply not stored and read back as 0
# instead of NaN.
rcLabel, vLabel = ('movieId', 'userId'), 'rating'

# For each axis label, build an ordered categorical dtype over its sorted
# unique ids, then encode the ratings column as integer codes, i.e. the
# row / column coordinates of every rating in the matrix.
rcCat = []
rc = []
for axis_label in rcLabel:
    axis_dtype = CategoricalDtype(sorted(ratings[axis_label].unique()), ordered=True)
    rcCat.append(axis_dtype)
    rc.append(ratings[axis_label].astype(axis_dtype).cat.codes)

# Matrix shape = (number of distinct movies, number of distinct users).
mat_shape = tuple(axis_dtype.categories.size for axis_dtype in rcCat)
mat = csr_matrix((ratings[vLabel], rc), shape=mat_shape)

# Wrap the matrix in a DataFrame labelled with the real movie / user ids.
dfPivot = pd.DataFrame.sparse.from_spmatrix(
    mat,
    index=rcCat[0].categories,
    columns=rcCat[1].categories,
)

# **Filtering**

In [7]:
# Number of ratings submitted by each user (indexed by userId).
users_votes = ratings.groupby('userId')['rating'].count()

# Number of ratings received by each movie (indexed by movieId).
movies_votes = ratings.groupby('movieId')['rating'].count()

# Despite the "mask" names these are label indexes, not boolean masks:
# the ids of users with more than 50 ratings and of movies with more than 10.
user_votes_mask = users_votes.loc[users_votes.gt(50)].index
movie_votes_mask = movies_votes.loc[movies_votes.gt(10)].index

# Keep only the selected movies (rows), then only the selected users (columns).
dfPivot = dfPivot.loc[movie_votes_mask, :].loc[:, user_votes_mask]

In [8]:
# shape of the pivot table after filtering: (movies kept, users kept)
dfPivot.shape

(15079, 84056)

In [9]:
# Build the CSR (compressed sparse row) matrix the model is fitted on.
# dfPivot was created with DataFrame.sparse.from_spmatrix, so it is converted
# sparse-to-sparse here. Going through dfPivot.values would first materialise a
# dense movies x users array (billions of cells) only to compress it again.
# Row and column order of dfPivot is preserved.
# NOTE: this must run before a non-sparse column (e.g. movieId via
# reset_index) is added to dfPivot; the .sparse accessor requires all columns
# to be sparse.
csr_data = dfPivot.sparse.to_coo().tocsr()

In [10]:
# Clear the name of the column axis and move the movieId index into a regular
# 'movieId' column, leaving a default 0..n-1 row index.
# NOTE: dfPivot is reassigned, so this cell is meant to be run once per kernel
# session; running it again would reset the index a second time.
dfPivot = (
    dfPivot
    .rename_axis(None, axis=1)
    .reset_index()
)
dfPivot.head()

Unnamed: 0,movieId,1,2,3,5,7,8,11,13,14,...,138474,138475,138477,138483,138484,138486,138487,138490,138492,138493
0,1,0.0,0.0,4.0,0.0,0.0,4.0,4.5,4.0,4.5,...,5.0,0.0,3.0,4.0,0.0,5.0,0.0,0.0,0.0,3.5
1,2,3.5,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,...,4.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,4.0
2,3,0.0,4.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **Modelling**

In [11]:
# Nearest-neighbour model over movie rating vectors: brute-force search with
# cosine distance. n_neighbors here is only the model default; the query below
# passes its own value.
knn_params = {
    'metric': 'cosine',
    'algorithm': 'brute',
    'n_neighbors': 20,
    'n_jobs': -1,
}
knn = NearestNeighbors(**knn_params)

# Fit on the sparse movie x user matrix (one row per movie).
knn.fit(csr_data)

# **Example of recommendation**

In [12]:
# Demo query: the title text to look for and how many similar movies to list.
search_word, recommendations = 'Spider-Man', 10

In [13]:
# Find every movie whose title contains search_word (case-sensitive).
# regex=False: search_word is plain text, not a pattern. With the default
# regex=True, characters that are common in titles such as '(', ')', '?', '+'
# or '.' would be interpreted as regex syntax and either mis-match or raise.
title_matches = movies['title'].str.contains(search_word, regex=False)
movie_search = movies[title_matches]
movie_search

Unnamed: 0,movieId,title
5252,5349,Spider-Man (2002)
7953,8636,Spider-Man 2 (2004)
11826,52722,Spider-Man 3 (2007)
15085,76709,Spider-Man: The Ultimate Villain Showdown (2002)
19204,95510,"Amazing Spider-Man, The (2012)"
23215,110553,The Amazing Spider-Man 2 (2014)


In [14]:
# movieId of the first search hit; fail with a clear message if nothing matched.
if movie_search.empty:
    raise IndexError('no movie title contains {!r}'.format(search_word))
searched_movie_id = movie_search['movieId'].iloc[0]

# Row position of that movie in dfPivot, which is also its row in csr_data
# (csr_data was built from dfPivot in the same row order).
# Only the movieId column is compared, so the very wide pivot table is not
# filtered and copied just to read a single index value.
matching_rows = np.flatnonzero((dfPivot['movieId'] == searched_movie_id).to_numpy())
if matching_rows.size == 0:
    # The movie exists in `movies` but is not a row of dfPivot, e.g. because it
    # did not pass the rating-count filter.
    raise IndexError(
        'movieId {} is not present in the rating matrix'.format(searched_movie_id)
    )

# movie_id is the matrix row index used by the following cells.
movie_id = matching_rows[0]
movie_id

5182

In [15]:
# Query the fitted model with the searched movie's rating row.
# One extra neighbour is requested because the searched movie is expected to
# come back as its own nearest neighbour and is discarded afterwards.
query_row = csr_data[movie_id]
neighbors_to_fetch = recommendations + 1
distances, indices = knn.kneighbors(query_row, n_neighbors=neighbors_to_fetch)

In [16]:
# Flatten the kneighbors results to plain Python lists.
# ravel() is used instead of squeeze(): when only one neighbour is requested
# (recommendations = 0) squeeze() collapses the array to 0-d, tolist() then
# returns a bare scalar and zip() raises TypeError.
indices_list = indices.ravel().tolist()
distances_list = distances.ravel().tolist()

# Pair each neighbour's matrix row index with its distance to the query movie.
indices_distances = list(zip(indices_list, distances_list))

In [17]:
# Order the (row index, distance) pairs from closest to farthest.
ranked_neighbors = sorted(indices_distances, key=lambda pair: pair[1])

# Remove the searched movie by its row index rather than by assuming it is the
# first element: if another movie ties with it at distance 0, dropping
# position 0 could discard a real recommendation and keep the query movie.
# The slice caps the result at the requested number of recommendations.
indices_distances_sorted = [
    pair for pair in ranked_neighbors if pair[0] != movie_id
][:recommendations]
indices_distances_sorted

[(7768, 0.2894297796567148),
 (4834, 0.3297190588602036),
 (3656, 0.3366138195276729),
 (5766, 0.3375502708243109),
 (5277, 0.34085385441992766),
 (6322, 0.3437572558732621),
 (4159, 0.35086730473939565),
 (6137, 0.351684191144065),
 (4804, 0.3590024710760876),
 (6906, 0.3665289807731361)]

In [18]:
# movieId of every pivot row. Reading this single column avoids materialising
# a whole pivot row (one value per user) just to get one cell.
pivot_movie_ids = dfPivot['movieId']

# collect one {'Title': ...} record per recommended movie
recommendation_list = []

# walk through the (row index, distance) tuples one by one
for row_position, _distance in indices_distances_sorted:

    # movieId stored at this row of the pivot table
    matrix_movie_id = pivot_movie_ids.iloc[row_position]

    # Title of the first movie with that movieId. The lookup is by value, so it
    # does not depend on `movies` having a default 0..n-1 index, and it no
    # longer binds the name `id`, which shadows the builtin.
    title = movies.loc[movies['movieId'] == matrix_movie_id, 'title'].iloc[0]

    recommendation_list.append({'Title': title})

In [19]:
# recommended titles, in the order of indices_distances_sorted (closest first)
recommendation_list

[{'Title': 'Spider-Man 2 (2004)'},
 {'Title': 'Lord of the Rings: The Fellowship of the Ring, The (2001)'},
 {'Title': 'X-Men (2000)'},
 {'Title': 'Lord of the Rings: The Two Towers, The (2002)'},
 {'Title': 'Minority Report (2002)'},
 {'Title': 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)'},
 {'Title': 'Shrek (2001)'},
 {'Title': 'X2: X-Men United (2003)'},
 {'Title': "Ocean's Eleven (2001)"},
 {'Title': 'Lord of the Rings: The Return of the King, The (2003)'}]