<a href="https://colab.research.google.com/github/Shubhammawa/PredictivEye-Internship/blob/master/Recommendation_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing necessary libraries

In [0]:
import numpy as np
import pandas as pd
import time
import datetime

%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
!unzip "/content/drive/My Drive/ecommerce-dataset.zip"

Archive:  /content/drive/My Drive/ecommerce-dataset.zip
  inflating: category_tree.csv       
  inflating: events.csv              
  inflating: item_properties_part1.csv  
  inflating: item_properties_part2.csv  


In [0]:
events_df = pd.read_csv('events.csv')
category_tree_df = pd.read_csv('category_tree.csv')
item_properties_1_df = pd.read_csv('item_properties_part1.csv')
item_properties_2_df = pd.read_csv('item_properties_part2.csv')
items_df = pd.concat([item_properties_1_df,item_properties_2_df])

In [3]:
data = events_df
data = data.reindex(columns=['visitorid', 'itemid', 'event', 'timestamp'])
data.head()

Unnamed: 0,visitorid,itemid,event,timestamp
0,257597,355908,view,1433221332117
1,992329,248676,view,1433224214164
2,111016,318965,view,1433221999827
3,483717,253185,view,1433221955914
4,951259,367447,view,1433221337106


Converting events into ratings/importance

In [0]:
data['event'] = data['event'].replace('view', 1)
data['event'] = data['event'].replace('addtocart', 3)
data['event'] = data['event'].replace('transaction', 5)

In [5]:
data.head()

Unnamed: 0,visitorid,itemid,event,timestamp
0,257597,355908,1,1433221332117
1,992329,248676,1,1433224214164
2,111016,318965,1,1433221999827
3,483717,253185,1,1433221955914
4,951259,367447,1,1433221337106


In [6]:
!pip install surprise



In [0]:
from surprise import Reader, Dataset
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

In [0]:
ratings_dict = {'itemID': list(data.itemid),
                'userID': list(data.visitorid),
                'rating': list(data.event)}
df = pd.DataFrame(ratings_dict)

reader = Reader(rating_scale= (1,5))

data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

In [9]:
from surprise import BaselineOnly
kf = KFold(n_splits=3)
algo = BaselineOnly()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

Estimating biases using als...
RMSE: 0.4469
MAE:  0.1571
Estimating biases using als...
RMSE: 0.4482
MAE:  0.1572
Estimating biases using als...
RMSE: 0.4470
MAE:  0.1570


In [10]:
kf = KFold(n_splits=3)
algo = SVD()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

RMSE: 0.4506
MAE:  0.1578
RMSE: 0.4497
MAE:  0.1576
RMSE: 0.4523
MAE:  0.1585


In [0]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [12]:
best_predictions.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
619942,1248857,449418,1.0,1.0,{'was_impossible': False},1,2,0.0
472334,657412,330647,1.0,1.0,{'was_impossible': False},3,234,0.0
576851,1003686,324803,1.0,1.0,{'was_impossible': False},0,32,0.0
799459,968984,359214,1.0,1.0,{'was_impossible': False},4,83,0.0
799457,1386595,126503,1.0,1.0,{'was_impossible': False},2,21,0.0


In [13]:
worst_predictions.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
313900,244756,398004,5.0,1.0,{'was_impossible': False},302,14,4.0
39505,1170959,147053,5.0,1.0,{'was_impossible': False},21,7,4.0
657137,1186376,132074,5.0,1.0,{'was_impossible': False},2,20,4.0
896068,1161163,275339,5.0,1.0,{'was_impossible': False},1090,88,4.0
151221,505455,70314,5.0,1.0,{'was_impossible': False},3,16,4.0


After Testing different algorithms, we fit the model on the complete dataset to have estimated ratings for all user-id pairs.

In [14]:
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f81443bf4a8>

In [0]:
testet = trainset.build_testset
predictions = algo.test(testset)

In [0]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [0]:
top_n = get_top_n(predictions, n=10)

In [0]:
# # Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

In [0]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [28]:
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

0.9999971667421294
0.9921290634362936
0.9999950897270543
0.992282721090217
0.9999967320955601
0.9923065656804972
