In [77]:
# List Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from timeit import default_timer


In [None]:
# Load the movie metadata table from the ml-20m data directory (presumably the
# MovieLens 20M dataset -- confirm). The path is relative to the notebook's
# working directory.
df_movies = pd.read_csv('../data/ml-20m/movies.csv')
# Preview the first rows as this cell's output.
df_movies.head()

In [None]:
# Summarise the movies frame: column names, dtypes and memory footprint.
df_movies.info()

In [None]:
# Load the ratings table with explicit, compact dtypes.
# header=0 combined with names= discards the file's own header row and uses
# the column names listed here instead.
df_ratings = pd.read_csv(
    '../data/ml-20m/ratings.csv',
    sep=',',
    header=0,
    names=['userId', 'movieId', 'rating', 'timestamp'],
    dtype={
        'userId': np.int32,
        'movieId': np.int32,
        'rating': np.float64,
        'timestamp': np.int64,
    },
    skipinitialspace=True,
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # on_bad_lines='warn' is the documented replacement: malformed rows are
    # still skipped, and a warning is emitted for each one (matching the old
    # default of warn_bad_lines=True).
    on_bad_lines='warn',
    # The default C parser supports every option used here and is far faster
    # than engine='python' on a file of this size.
)
# Preview the first rows as this cell's output.
df_ratings.head()

In [62]:
# Summarise the ratings frame: row count, column dtypes and memory footprint.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
userId       int32
movieId      int32
rating       float64
timestamp    int64
dtypes: float64(1), int32(2), int64(1)
memory usage: 457.8 MB


In [None]:
# How is the number of ratings distributed across users?

# One row per userId; the single column counts that user's rated movies.
CntRatingsPerUser = (
    df_ratings[['userId', 'movieId']]
    .groupby('userId')
    .count()
    .rename(columns={'movieId': 'no_ratings'})
)

# Scatter of rating count against userId, with both axes anchored at zero.
fig, ax = plt.subplots()
ax.scatter(
    CntRatingsPerUser.index,
    CntRatingsPerUser['no_ratings'],
    edgecolor='black',
)
ax.set_xlim(0, len(CntRatingsPerUser.index))
ax.set_ylim(0,)
ax.set_title('No.Of.Ratings')
ax.set_xlabel('userId')
ax.set_ylabel('Number of ratings given')
plt.show()

In [None]:
# Partition the ratings into train / validation / test sets.
# Both splits are stratified on userId and use a fixed random_state so the
# partitions are reproducible.

# Stage 1: hold out 20% of all ratings as the test set.
df_train_ratings, df_test_ratings = train_test_split(
    df_ratings,
    test_size=0.2,
    random_state=1,
    stratify=df_ratings['userId'],
)

# Stage 2: hold out 20% of the remaining ratings (16% of the total) as the
# validation set; what is left becomes the final training set.
df_train_ratings, df_validation_ratings = train_test_split(
    df_train_ratings,
    test_size=0.2,
    random_state=1,
    stratify=df_train_ratings['userId'],
)

# Report the size and dtypes of each partition.
for df_split in (df_train_ratings, df_test_ratings, df_validation_ratings):
    df_split.info()

In [14]:
# Build the sparse movie-by-user ratings matrix from the training split.

# Sorted distinct ids; an id's position in its list is its matrix index.
lst_unique_users = sorted(df_train_ratings['userId'].unique())
lst_unique_movies = sorted(df_train_ratings['movieId'].unique())

# Translate every rating's raw ids into those positions by casting to a
# categorical with the sorted ids as categories and reading the codes.
user_dtype = pd.api.types.CategoricalDtype(categories=lst_unique_users)
movie_dtype = pd.api.types.CategoricalDtype(categories=lst_unique_movies)
encodedUserId = df_train_ratings['userId'].astype(user_dtype).cat.codes
encodedMovieId = df_train_ratings['movieId'].astype(movie_dtype).cat.codes

lst_ratings = df_train_ratings['rating'].tolist()

# Rows are movies, columns are users; cells without a rating stay implicit.
n_movies = len(lst_unique_movies)
n_users = len(lst_unique_users)
mat_ratings = csr_matrix(
    (lst_ratings, (encodedMovieId, encodedUserId)),
    shape=(n_movies, n_users),
)
mat_ratings.shape

(24945, 138493)

In [75]:
# Remove user bias: L1-normalise each column of the movie-by-user matrix so
# that every user's ratings sum to 1.
mat_ratings = normalize(
    mat_ratings,
    norm='l1',
    axis=0,
    copy=False,
)

# Sanity check on the first user column: its entries should add up to 1.
first_user_column = mat_ratings[:, 0].toarray()[:, 0]
sum(first_user_column)

1.0

In [81]:
# Pairwise cosine similarity between the rows of mat_ratings, i.e. between
# movies (the row index was built from encodedMovieId).
# NOTE(review): the result is 24945 x 24945; if returned as a dense float64
# array that is roughly 5 GB -- confirm available memory, or consider
# cosine_similarity's dense_output option if downstream code allows it.
mat_movie_similarity = cosine_similarity(mat_ratings)

In [82]:
# Sanity check: the similarity matrix should be square, one row and one
# column per movie in mat_ratings.
mat_movie_similarity.shape

(24945, 24945)