In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

import numpy as np # linear algebra
import scipy.sparse as sps
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import surprise as sp
import os
from sklearn.preprocessing import normalize
import cmath
from numpy.linalg import norm
import surprise as sp
from sklearn.metrics.pairwise import cosine_similarity
import math

# Input data files are available in the "../input/" directory.
# For example, running this will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

def read_data(filename):
    dataset = pd.read_csv(filename)
    return dataset

#Evaluation Metrics

#Precision
def get_precision(hits,k,test):
    precision = hits/k
    P_c = precision[test.sort_values('visitorid').visitorid.unique()]
    no_of_test_users = len(test.visitorid.unique())
    avg_precision = sum(P_c)/no_of_test_users
    print(avg_precision)
    return avg_precision,P_c

#Recall
def get_recall(hits,test):
    no_interactions = list(test.sort_values('visitorid').groupby('visitorid').count().event)
    recall = hits[list(test.sort_values('visitorid').visitorid.unique())]/no_interactions
    R_c = recall
    no_of_test_users = len(test.visitorid.unique())
    avg_recall = sum(R_c)/no_of_test_users
    print(avg_recall)
    return avg_recall, R_c

#F-measure
def get_fmeasure(P_c,R_c):
    F = 2*P_c*R_c/(P_c+R_c)
    nan_index = np.isnan(F)
    F[nan_index] = 0
    avg_F = sum(F)/len(F)
    print(avg_F)
    return avg_F


category_tree.csv
events.csv
item_properties_part1.csv
item_properties_part2.csv
rating_matrix.csv



In [None]:
# Read the dataset
category_tree = read_data("../input/category_tree.csv")
events = read_data("../input/events.csv")
item_1 = read_data("../input/item_properties_part1.csv")
item_2 = read_data("../input/item_properties_part2.csv")

print("Read all data in pandas object complete")

In [13]:
# PRE PROCESSING
# combine item matrix
item_properties = pd.concat(([item_1, item_2]), ignore_index=True)
events_full = events
test, train = pd.DataFrame([]), pd.DataFrame([])
n = events.shape[0]

# use only transaction data for now
events = events[events.event=='transaction']
events = events.sort_values('timestamp')
events = events[events.groupby('visitorid').visitorid.transform(len) > 1]

users = sorted(events.visitorid.unique())
items = sorted(item_properties.itemid.unique())
groupby_users = events.groupby('visitorid')

# Storing number of transactions for each user
counts = events.groupby('visitorid').count()['timestamp'].tolist()
ind = 0

# Iterate over all users with sorted transaction history and put top 80% in train and the latter transactions in test
for name, group in events.groupby('visitorid'):
    train_size = int(math.ceil(0.7*n))
    test_size = int(math.floor(0.3*n))
    if test_size == 0:
        test_size = 1
    train = train.append(group.head(train_size))
    test = test.append(group.tail(test_size))
    ind +=1
print('Train and test constructed with sizes',train.shape[0],test.shape[0])

# create a user-item interaction matrix
rows, cols = max(events.visitorid.unique()), max(item_properties.itemid.unique())
print(rows,cols)
ui_matrix = sps.lil_matrix((rows+2, cols+2), dtype=int) # empty matrix
sampled_train = sps.lil_matrix((rows+2, cols+2), dtype=int) # empty matrix

# Create a confidence matrix with binary values, 1 for transaction and 0 for none.
for index, row in train.iterrows():
    if row['event'] == 'transaction': #remove this to add views and add to cart event
        ui_matrix[row['visitorid'],row['itemid']] = 1
sample = []
row_sums = ui_matrix[users,:].sum(axis=1)

# Consider only those users who have 5 or more purchases.
sample = [i for i in range(len(row_sums)) if row_sums[i] >=5]
sampled_train = []
for user_index in sample:
    sampled_train.append(list(ui_matrix[users[user_index],:].A[0]))
print('Sampled_train constructed, contains item purchases (5+ items per row)')


('Train and test constructed with sizes', 13314, 13314)
(1406087, 466866)
here 1
here 2
Sampled_train constructed, contains item purchases (5+ items per row)


In [5]:
# Get ground truth using future transactions that exist in test
# Storing a mapping between the total users in the sparse matrix and the selected subsample (5 transcations)
user_dict = {}
for i,v in enumerate(sample):
    if users[v] not in user_dict:
        user_dict[users[v]]=i
print('Dict done')


Dict done


In [15]:
# LINK ANALYSIS
print('Commence link analysis')

gamma = 0.9
A = sampled_train
B = []
lol = []


for i,user in enumerate(sample):
    B.append(A[i]/row_sums[user,0]**gamma)
A = np.array(A)
B = np.array(B)
CR_prev=np.identity(len(sample))
CR_0=CR_prev
counter = 0
print('Begin loop')
while(True):
    PR = np.dot(A.T,CR_prev)
    CR_new = np.dot(B,PR)
    r_s = CR_new.sum(axis=1)
    new_matrix = CR_new / r_s[:, np.newaxis]   
    CR_new = new_matrix+CR_0
    if(np.linalg.norm(CR_new-CR_prev))<=10: #Lower the difference, more accurate results, time to run increases
        break
    CR_prev = CR_new
    counter += 1 # How many iterations took place

print('Link Analysis Done')
PR=PR.T
#Get recommendations
rec = []
k = 5
#PR -> NxM (M-users, N-items) 
#Get top k items columnwise
for arr in PR:
    rec.append(arr.argsort()[-k:][::-1])


Commence link analysis
Begin loop
Link Analysis Done


In [16]:
# Hits for link analysis
test_users = test['visitorid']
hits = np.zeros((max(users))+1)
for ind,row in test.iterrows():
    tu = row['visitorid']
    if tu in user_dict:
        algo_rec = rec[user_dict[tu]]
        if row['itemid'] in algo_rec:
            hits[tu]+=1
print("Results for link analysis")
avg_precision,P_c = get_precision(hits,k,test)
avg_recall, R_c = get_recall(hits,test)
avg_F = get_fmeasure(P_c,R_c)

Results for link analysis
0.144099378882
0.0754860656057
0.0894631818492




In [None]:
#User Based
A = sampled_train
WC = cosine_similarity(A)
user_prod = WC.dot(A)
user_rec = []
k = 5
#PR -> NxM (M-users, N-items) 
#Get top k items columnwise
for arr in user_prod:
    user_rec.append(arr.argsort()[-k:][::-1])

test_users = test['visitorid']
user_hits = np.zeros((max(users))+1)
for ind,row in test.iterrows():
    tu = row['visitorid']
    if tu in user_dict:
        algo_rec = user_rec[user_dict[tu]]
        if row['itemid'] in algo_rec:
            user_hits[tu]+=1

avg_precision,P_c = get_precision(user_hits,k,test)
avg_recall, R_c = get_recall(user_hits,test)
avg_F = get_fmeasure(P_c,R_c)

In [11]:
A = sampled_train
A =  sps.lil_matrix(np.array(A).T)
WP = cosine_similarity(A,dense_output=False)
WP.shape[1]

466868

In [None]:
#Item Based
A_org = np.array(sampled_train)
A_T =  sps.lil_matrix((A_org).T)
WP = cosine_similarity(A_T,dense_output=False)
item_prod = A_org.dot(WP)
item_rec = []
k = 5
#PR -> NxM (M-users, N-items) 
#Get top k items columnwise
for arr in user_prod:
    item_rec.append(arr.argsort()[-k:][::-1])

test_users = test['visitorid']
item_hits = np.zeros((max(users))+1)
for ind,row in test.iterrows():
    tu = row['visitorid']
    if tu in user_dict:
        algo_rec = item_rec[user_dict[tu]]
        if row['itemid'] in algo_rec:
            item_hits[tu]+=1

avg_precision,P_c = get_precision(item_hits,k,test)
avg_recall, R_c = get_recall(item_hits,test)
avg_F = get_fmeasure(P_c,R_c)

In [None]:
len(hits)

len(events.visitorid.unique()),len(test.visitorid.unique())

len(users)

len(train.visitorid.unique())

In [None]:
#Check sparsity of the matrix
sparsity = 100*(1 - float( len(C) )/(len(events.visitorid.unique())*len(item_properties.itemid.unique())))
sparsity
#len(R)