In [1]:
import pandas as pd
import numpy as np

In [2]:
# train_data = pd.read_csv('//Users//bagades//Desktop//RS//serendipity-sac2018//training.csv')
train_data = pd.read_csv('../../serendipity-sac2018/training.csv')

train_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,142882,91658,2.5,1515209647000
1,142882,4344,1.0,1515209646000
2,142882,45720,2.0,1515209643000
3,142882,4734,2.0,1515209641000
4,142882,91542,2.0,1515209637000


In [3]:
print('The dataset train has {:,} rows and {:,} columns'.format(train_data.shape[0],train_data.shape[1]))

print('Number of unqiue userIDs are {:,} and number of movies are {:,}'.format(len(train_data['userId'].unique()),
                                                                     len(train_data['movieId'].unique())))

from datetime import datetime
train_data['timestamp'] = train_data['timestamp'].apply(lambda x: datetime.fromtimestamp(x/1000))

train_data['year'] = train_data['timestamp'].apply(lambda x: x.year)
train_data['month'] = train_data['timestamp'].apply(lambda x: x.month)

The dataset train has 9,997,850 rows and 4 columns
Number of unqiue userIDs are 104,661 and number of movies are 49,151


---
**Filtering the data by year <br>
Considering years {2016, 2017 and 2018}**

In [4]:
print('The data will reduce by {} % if we filter for only year 2017'.format(
    round(len(train_data[train_data['year'] != 2017])*100/len(train_data),3)))

train_filtered = train_data[train_data['year'] == 2017]

The data will reduce by 80.064 % if we filter for only year 2017


In [5]:
print('The filtered data has {:,} ratings from {:,} ratings \nThe movies are reduced to {:,} from {:,} and \nThe users have reduced to {:,} from {:,}'.format(train_filtered.shape[0], train_data.shape[0],
len(train_filtered['movieId'].unique()),len(train_data['movieId'].unique()),
len(train_filtered['userId'].unique()),len(train_data['userId'].unique())   ))

The filtered data has 1,993,188 ratings from 9,997,850 ratings 
The movies are reduced to 36,536 from 49,151 and 
The users have reduced to 19,657 from 104,661


---

In [6]:
UM_matrix = np.matrix(train_filtered.pivot(index='userId', columns='movieId', values='rating'))

In [7]:
UM_matrix.shape

(19657, 36536)

In [8]:
UM_matrix[19656,36535]

nan

In [9]:
UM_matrix_sub = UM_matrix[1:1000,1:100]

In [10]:
from CollabFiltering import SVD as svd

In [11]:
output01 = svd(UM_matrix_sub)

ValueError: shapes (999,10) and (99,10) not aligned: 10 (dim 1) != 99 (dim 0)

In [None]:
output

---
## KPI's ( for surprise )

In [7]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [8]:
def MAR(predictions, top_n = 5):

    Top_5 = get_top_n(predictions,top_n)
    All_recalls = list()
    for uid in Top_5.keys():

        df = pd.DataFrame(Top_5[uid])
        est = [1 if x > 3.5 else 0 for x in df[1] ]
        act = [1 if x > 3.5 else 0 for x in df[2] ]
        All_recalls.append(recall_score(est,act))
        
    return(np.average(All_recalls))

In [9]:
def MAP(predictions, top_n = 5):

    Top_5 = get_top_n(predictions,top_n)
    All_recalls = list()
    for uid in Top_5.keys():

        df = pd.DataFrame(Top_5[uid])
        est = [1 if x > 3.5 else 0 for x in df[1] ]
        act = [1 if x > 3.5 else 0 for x in df[2] ]
        All_recalls.append(precision_score(est,act))
        
    return(np.average(All_recalls))

In [10]:

def ADCG(predictions, top_n = 5):

    Top_5 = get_top_n(predictions,top_n)
    All_DCG = list()
    for uid in Top_5.keys():
        
        df = pd.DataFrame(Top_5[uid])
        DCG = list()
        for i in range(len(df)):
            if i ==0:
                DCG.append(df[2][i])
            else:
                DCG.append(df[2][i]/np.log2(i+1))
        
        
        All_DCG.append(sum(DCG))
        
    return(np.average(All_DCG))

-----

## Top predictions ( for surprise )

In [11]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, est,r_ui,_ in predictions:
        top_n[uid].append((iid, est, r_ui))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

---

# Using surprise package

**Collaborative Filtering**

In [12]:
from surprise import Reader, Dataset,accuracy
import random

In [13]:
reader = Reader(rating_scale=(0.5, 5.0))

In [14]:
cf_matrix  = Dataset.load_from_df(train_filtered[['userId','movieId','rating']], reader)

In [15]:
random.seed(9999)

In [16]:
from surprise.model_selection import train_test_split
train, test = train_test_split(cf_matrix, test_size= .2)

## Building a KNN model

In [17]:
from surprise import KNNBasic
Knn_cf = KNNBasic(sim_options={'user_based':True})

In [18]:
Knn_cf.fit(train)
predictions_knn = Knn_cf.test(test)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [19]:
accuracy.rmse(predictions_knn)

RMSE: 0.9066


0.9065819286848903

In [20]:
top_pred_knn = get_top_n(predictions_knn)

In [21]:
top_pred_knn

defaultdict(list,
            {141900: [(54503, 5.0, 3.999245493812584),
              (34405, 5.0, 4.310285332615162),
              (2997, 5.0, 4.23921094040358),
              (480, 5.0, 3.922594874679557),
              (1278, 5.0, 4.5447934127062535),
              (7099, 5.0, 4.628562239850793),
              (1246, 5.0, 4.484013636761981),
              (65261, 5.0, 4.2763095695624775),
              (162606, 5.0, 3.762786709952013),
              (41569, 5.0, 3.2071365620425336)],
             135280: [(122886, 5.0, 4.102151027127655),
              (88129, 5.0, 4.1034796301832035),
              (152081, 5.0, 3.933290835318882),
              (589, 5.0, 4.53806946051565),
              (122918, 5.0, 4.162984595047379),
              (2571, 5.0, 4.5375),
              (108932, 5.0, 4.045614701579621),
              (8368, 5.0, 4.48454046681964),
              (1240, 4.5, 4.440963477402945),
              (162350, 4.5, 3.6196591174427413)],
             142032: [(1225, 5.0, 4.36