In [4]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import accuracy
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.model_selection import KFold
from surprise import CoClustering
from collections import deque
import random
import pickle

In [9]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 3 GPU(s) available.
We will use the GPU: NVIDIA A30


In [10]:
def summary_report(data):
    r, c = data.shape
    t_sparce = 0
    t_nunique = 0
    for n in data.columns:
        col = data[n]
        sparce = round(round(col.isna().sum() / r, 2) * 100, 2)
        nunique = col.nunique()
        
        t_sparce = t_sparce + sparce
        t_nunique = t_nunique + nunique
        
        print(str(n) + " is " + str(sparce) + "% empty. (" + str(data[n].isna().sum()) + " null values)")
        print(f"The column has " + str(nunique) + " unique elements.")
        print("\n")
    
    data.info()
    print(f"The shape of the dataset is " + str(r) + " rows and " + str(c) + " columns.")
    print(f"The dataset is " + str(round(c / t_sparce, 2)) + "% dense and contains " + str(t_nunique) + " unique elements." )

In [11]:
movie_titles = pd.read_csv('movie_titles.csv', encoding = 'ISO-8859-1', header = None, names = ['Id', 'Year', 'Name']).set_index('Id')

In [12]:
df1 = pd.read_csv('combined_data_1.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1], low_memory=False)
df2 = pd.read_csv('combined_data_2.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1], low_memory=False)
df3 = pd.read_csv('combined_data_3.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1], low_memory=False)
df4 = pd.read_csv('combined_data_4.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1], low_memory=False)

df1['Rating'] = df1['Rating'].astype(float)
df2['Rating'] = df2['Rating'].astype(float)
df3['Rating'] = df3['Rating'].astype(float)
df4['Rating'] = df4['Rating'].astype(float)

print('Dataset 1 shape: {}'.format(df1.shape))
print('Dataset 2 shape: {}'.format(df2.shape))
print('Dataset 3 shape: {}'.format(df3.shape))
print('Dataset 4 shape: {}'.format(df4.shape))

frames = [df1, df2, df3, df4]
df = pd.concat(frames)

df.index = np.arange(0,len(df))
df = df[1:len(df)]
tmp_movies = df[df['Rating'].isna()]['Cust_Id'].reset_index()
movie_indices = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]
shifted_movie_indices = deque(movie_indices)
shifted_movie_indices.rotate(-1)

user_data = []

for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_indices, shifted_movie_indices):
    
    if df_id_1<df_id_2:
        tmp_df = df.loc[df_id_1+1:df_id_2-1].copy()
    else:
        tmp_df = df.loc[df_id_1+1:].copy()
        
    tmp_df['Movie'] = movie_id
    user_data.append(tmp_df)

rating = pd.concat(user_data)
del user_data, df, tmp_movies, tmp_df, shifted_movie_indices, movie_indices, df_id_1, movie_id, df_id_2, next_movie_id
print('Shape Cust_Id-Ratings:\t{}'.format(rating.shape))
rating.sample(5)

Dataset 1 shape: (24058263, 2)
Dataset 2 shape: (26982302, 2)
Dataset 3 shape: (22605786, 2)
Dataset 4 shape: (26851926, 2)
Shape Cust_Id-Ratings:	(100479960, 3)


Unnamed: 0,Cust_Id,Rating,Movie
85310698,2415196,4.0,15132
35737473,2431797,3.0,6362
89674938,21677,5.0,15922
28293428,857406,4.0,5226
2248272,2136889,3.0,424


In [13]:
ratings = rating[['Cust_Id','Movie','Rating']]
ratings.columns = ['userId','movieId','rating']
ratings_dict = {'itemID': list(ratings.movieId),
                'userID': list(ratings.userId),
                'rating': list(ratings.rating)}

df = pd.DataFrame(ratings_dict)
df.shape

(100479960, 3)

In [14]:
customers = df.userID
ratings_count = dict()
for customer in customers:
    if customer in ratings_count:
        ratings_count[customer] += 1
    else:
        ratings_count[customer] = 1

In [15]:
lower = 10
upper = 400
remove_customer = []
for customer, num_ratings in ratings_count.items():
    if num_ratings < lower or num_ratings > upper:
        remove_customer.append(customer)
df = df.loc[~df.userID.isin(remove_customer)]

In [16]:
customers_edit = df.userID
ratings_count_edit = dict()
for customer in customers_edit:
    if customer in ratings_count_edit:
        ratings_count_edit[customer] += 1
    else:
        ratings_count_edit[customer] = 1

In [17]:
movies = df.itemID
ratings_count = dict()
for movie in movies:
    if movie in ratings_count:
        ratings_count[movie] += 1
    else:
        ratings_count[movie] = 1

In [18]:
lower = 10
upper = 10000
remove_item = []
for item, num_ratings in ratings_count.items():
    if num_ratings < lower or num_ratings > upper:
        remove_item.append(item)
df = df.loc[~df.itemID.isin(remove_item)]

In [19]:
movies_edit = df.itemID
ratings_count_edit = dict()
for movie in movies_edit:
    if movie in ratings_count_edit:
        ratings_count_edit[movie] += 1
    else:
        ratings_count_edit[movie] = 1

In [20]:
summary_report(df)

itemID is 0.0% empty. (0 null values)
The column has 16801 unique elements.


userID is 0.0% empty. (0 null values)
The column has 375898 unique elements.


rating is 0.0% empty. (0 null values)
The column has 5 unique elements.


<class 'pandas.core.frame.DataFrame'>
Index: 12669476 entries, 0 to 100479953
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   itemID  int64  
 1   userID  object 
 2   rating  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 386.6+ MB
The shape of the dataset is 12669476 rows and 3 columns.
The dataset is inf% dense and contains 392704 unique elements.


In [21]:
reader = Reader(rating_scale = (0,5))
data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)
trainset, testset = train_test_split(data, test_size=.2, random_state=42) # Take test_size=0.4

In [26]:
def precision_recall_at_k(model, k=30, threshold=1.5):
    user_est_true = defaultdict(list)
    predictions=model.test(testset)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    precision = round((sum(prec for prec in precisions.values()) / len(precisions)),3)
    recall = round((sum(rec for rec in recalls.values()) / len(recalls)),3)
    
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('F_1 score: ', round((2*precision*recall)/(precision+recall),3))

In [24]:
filename = 'user_user_model.pkl'
sim_user_user = pickle.load(open(filename, 'rb'))

In [27]:
precision_recall_at_k(sim_user_user)

RMSE: 0.9423
MAE:  0.7374
Precision:  0.934
Recall:  0.98
F_1 score:  0.956


In [28]:
filename = 'svd_model.pkl'
svd = pickle.load(open(filename, 'rb'))

In [29]:
precision_recall_at_k(svd)

RMSE: 0.6865
MAE:  0.5331
Precision:  0.939
Recall:  0.98
F_1 score:  0.959


In [30]:
filename = 'cocluster_model.pkl'
CoCluster = pickle.load(open(filename, 'rb'))

In [31]:
precision_recall_at_k(CoCluster)

RMSE: 0.9475
MAE:  0.7325
Precision:  0.934
Recall:  0.975
F_1 score:  0.954
