In [1]:
import pandas as pd
import numpy as np
import imp

In [4]:
# Load the raw training ratings (path is relative to the notebook's working directory).
train_data = pd.read_csv('../../data/serendipity-sac2018/training.csv')

# Preview the first rows to check that the file parsed as expected.
train_data.head()

In [12]:
# Size of the raw table and the number of distinct users / movies in it.
print('The dataset train has {:,} rows and {:,} columns'.format(train_data.shape[0],train_data.shape[1]))

print('Number of unqiue userIDs are {:,} and number of movies are {:,}'.format(len(train_data['userId'].unique()),
                                                                     len(train_data['movieId'].unique())))

from datetime import datetime  # no longer used in this cell; kept so the name stays available to later cells

# `timestamp` is treated as epoch milliseconds (the previous code divided it by 1000).
# pd.to_datetime converts the whole column at once and yields UTC-based values, whereas
# datetime.fromtimestamp used the local timezone of whichever machine ran the notebook,
# so ratings near a year/month boundary could land in different buckets on different hosts.
train_data['timestamp'] = pd.to_datetime(train_data['timestamp'], unit='ms')

# Calendar features used by the year filter further down.
train_data['year'] = train_data['timestamp'].dt.year
train_data['month'] = train_data['timestamp'].dt.month

The dataset train has 9,997,850 rows and 4 columns
Number of unqiue userIDs are 104,661 and number of movies are 49,151


---
**Filtering the data by year <br>
Years considered: {2016, 2017 and 2018}; the cells below keep only 2017**

In [4]:
# Share of ratings that fall outside 2017, i.e. what the year filter below removes.
rows_total = len(train_data)
rows_outside_2017 = int((train_data['year'] != 2017).sum())
pct_removed = round(rows_outside_2017 * 100 / rows_total, 3)
print('The data will reduce by {} % if we filter for only year 2017'.format(pct_removed))

# Keep only the ratings made in 2017.
in_2017 = train_data['year'] == 2017
train_filtered = train_data[in_2017]

The data will reduce by 80.064 % if we filter for only year 2017


In [5]:
# Before/after sizes of the 2017 filter: ratings, distinct movies, distinct users.
ratings_kept, ratings_all = train_filtered.shape[0], train_data.shape[0]
movies_kept = len(train_filtered['movieId'].unique())
movies_all = len(train_data['movieId'].unique())
users_kept = len(train_filtered['userId'].unique())
users_all = len(train_data['userId'].unique())

summary_template = ('The filtered data has {:,} ratings from {:,} ratings \n'
                    'The movies are reduced to {:,} from {:,} and \n'
                    'The users have reduced to {:,} from {:,}')
print(summary_template.format(ratings_kept, ratings_all,
                              movies_kept, movies_all,
                              users_kept, users_all))

The filtered data has 1,993,188 ratings from 9,997,850 ratings 
The movies are reduced to 36,536 from 49,151 and 
The users have reduced to 19,657 from 104,661


---

In [5]:
# Pivot the 2017 ratings into a dense user x movie matrix (rows = userId, columns = movieId,
# cells = rating); user/movie pairs without a rating become NaN.
# NOTE(review): at the (19657, 36536) shape reported below this is roughly 5.7 GB if stored as
# float64 — confirm the machine has the memory for it.
UM_matrix = np.matrix(train_filtered.pivot(index='userId', columns='movieId', values='rating'))

In [79]:
# Dimensions of the dense matrix: (number of users, number of movies).
UM_matrix.shape

(19657, 36536)

In [6]:
# Take a small window of the matrix for inspection.
# NOTE(review): both slices start at 1, so the first row and the first column are left out
# (the window is 499 x 14) — confirm that is intended rather than [0:500, 0:15].
UM_matrix_sub = UM_matrix[1:500,1:15]

In [7]:
# (row, column) positions inside the window that hold NaN, i.e. user/movie pairs with no rating.
np.argwhere(np.isnan(UM_matrix_sub)) 

array([[  0,   0],
       [  0,   1],
       [  0,   2],
       ...,
       [498,  11],
       [498,  12],
       [498,  13]], dtype=int64)

## Modeling using the SVD function 

In [2]:
# Load the previously saved ratings subset; the first CSV column is used as the DataFrame index.
train_subset = pd.read_csv('../Intermediate_data/train_subset.csv',index_col= 0)

In [3]:
# Load the previously saved movie table; the first CSV column is used as the DataFrame index.
# The merge below joins it to the ratings on `movieId`.
movies_filtered = pd.read_csv('../Intermediate_data/filtered_movies_genre.csv',index_col= 0)

In [4]:
# Inner join on movieId: keep only the ratings whose movie also appears in movies_filtered.
train_new = pd.merge(left=train_subset, right=movies_filtered, on='movieId', how='inner')

In [5]:
# User x movie rating table built from the merged ratings (NaN where a user has not rated a movie).
UM = train_new.pivot(values='rating', index='userId', columns='movieId')

# Same values as a NumPy matrix for the factorisation below; this rebinds the UM_matrix name
# that the 2017-filter section used earlier.
UM_matrix = np.matrix(UM.to_numpy())

In [6]:
# Distinct ids from train_subset, in order of first appearance.
userId_unique = list(train_subset['userId'].unique())
movieId_unique = list(train_subset['movieId'].unique())

# Positions of one sample user and one sample movie within those lists.
# NOTE(review): these are list positions, not row/column positions of UM — UM is pivoted from
# train_new and its index is displayed in ascending id order further down. Confirm before
# using these numbers to index UM_matrix.
userId_unique.index(127137), movieId_unique.index(71057)

(0, 3)

In [14]:
from CollabFiltering import SVD as svd

In [7]:
%load_ext autoreload
%autoreload 2

In [15]:
# Factorise the rating matrix with the project's own SVD routine (CollabFiltering.SVD).
# It returns two arrays whose product p · qᵀ is used below as the predicted rating matrix.
# NOTE(review): hiddenk presumably sets the number of latent factors and epoch the number of
# training passes — confirm against CollabFiltering.SVD.
p,q = svd(UM_matrix,hiddenk = 20, epoch = 10)

In [None]:
# Display the first array returned by svd (this cell has no execution count in the saved run).
p

In [None]:
# Display the second array returned by svd (this cell has no execution count in the saved run).
q

In [16]:
# Predicted rating matrix: the product p · qᵀ. It is compared element-wise with UM_matrix below,
# so it is expected to have the same user x movie layout.
UM_matrix_pred = np.dot(p,q.T)

In [19]:
def rmse(UM, UM_pred):
    """Root-mean-square error between two equally shaped rating arrays.

    Positions where the element-wise difference is NaN (for example,
    unrated cells in `UM`) are left out of the average.

    Parameters
    ----------
    UM : array-like
        Observed ratings; may contain NaN.
    UM_pred : array-like
        Predicted ratings with the same shape as `UM`.

    Returns
    -------
    float
        Square root of the mean squared difference over the non-NaN positions.
    """
    residuals = np.subtract(UM, UM_pred)

    # Keep only the positions where a difference could actually be computed.
    is_observed = ~np.isnan(residuals)
    squared_errors = np.square(residuals[is_observed])

    mean_squared_error = np.mean(squared_errors)
    return np.sqrt(mean_squared_error)

In [20]:
# RMSE of the SVD reconstruction, measured on the same rating matrix that was passed to svd
# (no held-out split is involved in this number).
RMSE_svd = rmse(UM_matrix,UM_matrix_pred)

In [21]:
# Show the RMSE computed in the previous cell.
RMSE_svd

1.12583421346865

In [24]:
# Row labels (userId) and column labels (movieId) of UM, used to translate matrix positions
# back into ids when building recommendations.
user_Ids = UM.index
movie_Ids = UM.columns

In [33]:
# Inspect the userId labels of the UM rows.
user_Ids

Int64Index([100032, 100036, 100053, 100057, 100058, 100067, 100076, 100093,
            100119, 100143,
            ...
            206902, 206903, 206905, 206916, 206921, 206948, 206951, 206968,
            206981, 206984],
           dtype='int64', name='userId', length=19203)

In [41]:
# For every user, collect the 20 highest-predicted movies among those the user has not rated.
# Result: {str(userId): {'unseen_movies': [movieId, ...], 'prediction': [score, ...]}},
# both lists ordered from highest to lowest predicted score.
top_n_predictn_all = {}

# Work on a plain 2-D ndarray so that row i is 1-D whether the prediction is an ndarray or an np.matrix.
predicted_ratings = np.asarray(UM_matrix_pred)

for i in range(predicted_ratings.shape[0]):

    # Row i of UM carries the label user_Ids[i]; the prediction matrix is assumed to follow the
    # same row order because it was factorised from UM's values.
    # NaN in UM marks a movie this user has not rated. `.iloc` replaces `DataFrame.ix`
    # (removed in pandas 1.0), and the mask is now computed once per user instead of twice.
    unseen = UM.iloc[i].isna().to_numpy()

    recommendations = pd.DataFrame({ 'unseen_movies': movie_Ids[unseen],
                        'prediction' :  predicted_ratings[i][unseen]})

    top_20 = recommendations.sort_values(by='prediction', ascending=False)[:20]

    # Plain Python lists keep the structure JSON-serialisable.
    top_n_predictn_all[str(user_Ids[i])] = {'unseen_movies':top_20['unseen_movies'].tolist(),'prediction' :  top_20['prediction'].tolist() }

In [39]:
# Recommendations stored for one user; the dictionary is keyed by the userId as a string.
# NOTE(review): the saved output below shows pandas Series, while the loop above stores plain
# lists — the output appears to come from an earlier version of that cell (execution count 39
# vs 41). Re-run to refresh it.
top_n_predictn_all['206903']

{'unseen_movies': 2737       6428
 9989     119218
 4211      27869
 6315      72142
 61          114
 9353     111778
 420         940
 4628      39183
 1454       3429
 8987     107410
 14112    164917
 107         230
 9154     109370
 9773     116797
 1909       4506
 931        2176
 7764      93457
 8247      99214
 12015    142424
 14497    167990
 Name: unseen_movies, dtype: int64, 'prediction': 2737     7.459418
 9989     6.659542
 4211     6.632183
 6315     6.394575
 61       6.382227
 9353     6.352705
 420      6.281009
 4628     6.280025
 1454     6.242516
 8987     6.194366
 14112    6.180800
 107      6.155717
 9154     6.139015
 9773     6.134705
 1909     6.122736
 931      6.121447
 7764     6.119085
 8247     6.118456
 12015    6.118084
 14497    6.074558
 Name: prediction, dtype: float64}

In [44]:
import json

# Persist the per-user top-20 recommendations as JSON.
# json.dump writes straight to the file, so the `json` module name is no longer rebound to a
# string (which made a second run of this cell fail), and the `with` block closes the file
# even if serialisation raises.
with open("../Intermediate_data/top_20_predictn_svd.json","w") as f:
    json.dump(top_n_predictn_all, f)

---
## KPIs (for Surprise predictions)

In [25]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [26]:
def MAR(predictions, top_n = 5):
    """Mean recall across users, measured on each user's top-`top_n` predictions.

    For every user the `top_n` highest-estimated items are selected with
    `get_top_n`. Estimated and actual ratings are then binarised with the rule
    "rating > 3.5 counts as relevant", recall is computed inside that list,
    and the per-user recalls are averaged.

    Parameters
    ----------
    predictions : iterable
        5-element prediction tuples `(uid, iid, r_ui, est, details)` as
        consumed by `get_top_n`.
    top_n : int, default 5
        Number of highest-estimated items considered per user.

    Returns
    -------
    float
        Average of the per-user recall values.
    """
    top_items = get_top_n(predictions, top_n)
    recalls = []
    for ranked in top_items.values():
        # Each entry is (iid, est, r_ui): position 1 is the estimate, position 2 the actual rating.
        predicted = [1 if est > 3.5 else 0 for _, est, _ in ranked]
        actual = [1 if r_ui > 3.5 else 0 for _, _, r_ui in ranked]

        # scikit-learn's signature is (y_true, y_pred). The previous call passed the estimates
        # first, which made this function return precision instead of recall.
        recalls.append(recall_score(actual, predicted))

    return(np.average(recalls))

In [27]:
def MAP(predictions, top_n = 5):
    """Mean precision across users, measured on each user's top-`top_n` predictions.

    For every user the `top_n` highest-estimated items are selected with
    `get_top_n`. Estimated and actual ratings are then binarised with the rule
    "rating > 3.5 counts as relevant", precision is computed inside that list,
    and the per-user precisions are averaged.

    Parameters
    ----------
    predictions : iterable
        5-element prediction tuples `(uid, iid, r_ui, est, details)` as
        consumed by `get_top_n`.
    top_n : int, default 5
        Number of highest-estimated items considered per user.

    Returns
    -------
    float
        Average of the per-user precision values.
    """
    top_items = get_top_n(predictions, top_n)
    precisions = []
    for ranked in top_items.values():
        # Each entry is (iid, est, r_ui): position 1 is the estimate, position 2 the actual rating.
        predicted = [1 if est > 3.5 else 0 for _, est, _ in ranked]
        actual = [1 if r_ui > 3.5 else 0 for _, _, r_ui in ranked]

        # scikit-learn's signature is (y_true, y_pred). The previous call passed the estimates
        # first, which made this function return recall instead of precision.
        precisions.append(precision_score(actual, predicted))

    return(np.average(precisions))

In [28]:
def ADCG(predictions, top_n = 5):
    """Average discounted cumulative gain over users' top-`top_n` lists.

    Each user's `top_n` highest-estimated items are taken from `get_top_n`.
    The actual rating at the first rank counts in full; the actual rating at
    zero-based rank i >= 1 is divided by log2(i + 1). The per-user sums are
    then averaged.

    Parameters
    ----------
    predictions : iterable
        Prediction tuples as consumed by `get_top_n`.
    top_n : int, default 5
        Number of highest-estimated items considered per user.

    Returns
    -------
    float
        Mean of the per-user DCG values.
    """
    ranked_by_user = get_top_n(predictions, top_n)
    dcg_per_user = []
    for ranked in ranked_by_user.values():
        # Column 2 of the (iid, est, r_ui) tuples holds the actual ratings, in ranked order.
        actual_ratings = pd.DataFrame(ranked)[2]
        gains = [
            rating if position == 0 else rating / np.log2(position + 1)
            for position, rating in enumerate(actual_ratings)
        ]
        dcg_per_user.append(sum(gains))

    return np.average(dcg_per_user)

-----

## Top predictions (for Surprise)

In [54]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's `n` highest estimates.

    Parameters
    ----------
    predictions : iterable
        5-element tuples `(uid, iid, r_ui, est, details)`; the last element is ignored.
    n : int, default 10
        Maximum number of items kept per user.

    Returns
    -------
    defaultdict(list)
        Maps each uid to a list of `(iid, est, r_ui)` tuples ordered by `est`
        from highest to lowest; ties keep their original relative order.
    """
    def by_estimate(entry):
        # entry is (iid, est, r_ui); rank on the estimated rating.
        return entry[1]

    per_user = defaultdict(list)

    # Bucket every prediction under its user.
    for uid, iid, true_rating, estimate, _details in predictions:
        per_user[uid].append((iid, estimate, true_rating))

    # Rank each user's bucket and cut it down to the first n entries.
    for uid in per_user:
        ranked = per_user[uid]
        ranked.sort(key=by_estimate, reverse=True)
        ranked[:] = ranked[:n]

    return per_user

---

# Using surprise package

**Collaborative Filtering**

In [31]:
from surprise import Reader, Dataset,accuracy
import random

In [62]:
# Tell Surprise that ratings lie on a 0.0–5.0 scale.
reader = Reader(rating_scale=(0.0, 5.0))

In [63]:
# Build the Surprise dataset from the (userId, movieId, rating) columns of train_subset.
# Note this uses train_subset directly, not the merged train_new that fed the SVD section.
cf_matrix  = Dataset.load_from_df(train_subset[['userId','movieId','rating']], reader)

In [64]:
# Seed Python's built-in `random` module.
# NOTE(review): Surprise's splitters appear to draw from NumPy's random generator when no
# random_state is given, so this seed alone may not make the train/test split reproducible —
# confirm against the Surprise documentation.
random.seed(9999)

In [65]:
from surprise.model_selection import train_test_split

# Hold out 20% of the ratings for evaluation.
# random_state pins the shuffle so the split (and every metric computed from it) is the same
# on each run; seeding Python's `random` module does not control it, because Surprise takes
# its randomness from NumPy unless a random_state is passed.
train, test = train_test_split(cf_matrix, test_size= .2, random_state=9999)

## Building a KNN model

In [35]:
from surprise import KNNBasic
# User-based k-nearest-neighbours collaborative filter. No similarity measure is specified,
# so the library default applies (the fit log below reports "msd").
Knn_cf = KNNBasic(sim_options={'user_based':True})

In [36]:
# Fit on the training split, then predict every rating in the held-out test split.
Knn_cf.fit(train)
predictions_knn = Knn_cf.test(test)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [38]:
# RMSE of the KNN predictions on the held-out test split (printed and returned).
accuracy.rmse(predictions_knn)

RMSE: 0.9057


0.9057040030448762

In [55]:
# Up to 20 highest-estimated test-set items per user, as (iid, est, r_ui) tuples.
top_pred_knn = get_top_n(predictions_knn,20)

In [142]:
# Top predictions for one user. Only items that landed in the test split can appear here,
# which is why the list can be much shorter than 20.
top_pred_knn[100032]

[(5618, 4.4125, 4.0)]