## Advanced Recommendation Systems

In [1]:
# Written by Tyler Beetle for Python for Data Science and Machine Learning Bootcamp Udemy Course.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
column_names = ['user_id', 'item_id', 'rating', 'timestamp']

In [4]:
df = pd.read_csv('../data/u.data', sep='\t', names=column_names)

In [5]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [6]:
movie_titles = pd.read_csv('../data/Movie_Id_Titles.txt')
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [8]:
df = pd.merge(df,movie_titles,on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title_x,title_y
0,0,50,5,881250949,Star Wars (1977),Star Wars (1977)
1,0,172,5,881250949,"Empire Strikes Back, The (1980)","Empire Strikes Back, The (1980)"
2,0,133,1,881250949,Gone with the Wind (1939),Gone with the Wind (1939)
3,196,242,3,881250949,Kolya (1996),Kolya (1996)
4,186,302,3,891717742,L.A. Confidential (1997),L.A. Confidential (1997)


In [11]:
num_users = df['user_id'].nunique()
num_movies = df['item_id'].nunique()

In [12]:
print('User count:', num_users)
print('Movie count:', num_movies)

User count: 944
Movie count: 1682


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test = train_test_split(df, random_state=42)

In [18]:
train_data_matrix = np.zeros((num_users, num_movies))
test_data_matrix = np.zeros((num_users, num_movies))

for row in X_train.itertuples():
    train_data_matrix[row[1]-1, row[2]-1] = row[3]
for row in X_test.itertuples():
    test_data_matrix[row[1]-1, row[2]-1] = row[3]

In [21]:
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
movie_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [30]:
def prediction(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'movie':
        pred = ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [36]:
movie_pred = prediction (train_data_matrix,movie_similarity, type='movie')
user_pred = prediction (train_data_matrix,user_similarity, type='user')

In [37]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [38]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [39]:
print('User based RMSE:', str(rmse(user_pred, test_data_matrix)))
print('Movie based RMSE:', str(rmse(movie_pred, test_data_matrix)))

User based RMSE: 3.12915767349857
Movie based RMSE: 3.4570185026548415


In [43]:
sparsity=round(1.0-len(df)/float(num_users*num_movies), 3)
print('The sparsity level of MovieLens100K:', str(sparsity*100) + '%')

The sparsity level of MovieLens100K: 93.7%


In [45]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [47]:
u, s, vt = svds(train_data_matrix, k =20)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User based CF MSE:', str(rmse(X_pred, test_data_matrix)))

User based CF MSE: 2.732271803634331
