In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load dataset
dfpath = os.path.join(os.getcwd(), "data", "filtered_long.csv")
print("Loading file from: {}".format(dfpath))

df = pd.read_csv(dfpath, header=0)
df.session = df.session.map(lambda x: x.strip("()").split(','))
df.head()

Loading file from: D:\GroupAssignment\data\filtered_long.csv


Unnamed: 0.1,Unnamed: 0,session,purchase,len,add
0,0,"[[1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, ...",0,18,2
1,1,"[[1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, ...",0,139,18
2,2,"[[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, ...",0,41,16
3,3,"[[1, 2, 3, 1, 1, 2, 3, 1, 1, 1, 1, ...",0,16,2
4,4,"[[1, 1, 1, 1, 2, 3, 4, 1, 2, 1, 1, ...",1,29,5


In [3]:
# Split set into training and validation set
from sklearn.model_selection import train_test_split

# No real reason for the selected ratio. In PS they used 70/30. Don't forget to set state.
training_set, eval_set = train_test_split(df, test_size=0.2, random_state=123, shuffle=False)
eval_set.head()

Unnamed: 0.1,Unnamed: 0,session,purchase,len,add
129080,129080,"[[1, 1, 2, 1, 1, 2, 3, 1, 1, 1, 1, ...",0,17,6
129081,129081,"[[1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, ...",0,16,14
129082,129082,"[[1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, ...",0,38,19
129083,129083,"[[1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, ...",1,13,5
129084,129084,"[[1, 1, 2, 3, 1, 1, 2, 3, 1, 1, 2, ...",1,20,3


In [21]:
import numpy as np

def ngram_featurizer(session, n):
    
    """takes in a list and an integer defining the size of ngrams.
     Returns the ngrams of desired size in the input string"""
    session = ['#']*(n-1) + session + ['+']*(n-1)
    ngrams = [tuple(session[i:i+n]) for i in range(len(session)-n+1)]
    
    return ngrams
    

def encode_sessions(sessions, n, mapping=None):
    
    """
    Takes in a list of lists, an integer indicating the character ngrams' size,
    and a dictionary mapping ngrams to numerical indices. If no dictionary is passed,
    one is created inside the function.
    The function outputs a 2d NumPy array with as many rows as there are strings in 
    the input list, and the mapping from ngrams to indices, representing the columns 
    of the NumPy array.
    """
    
    if not mapping:
        all_ngrams = set()
        for session in sessions:
            all_ngrams = all_ngrams.union(set(ngram_featurizer(session, n)))
    
        mapping = {ngram: i for i, ngram in enumerate(all_ngrams)}
    
    X = np.zeros((len(sessions), len(mapping)))
    for i, session in enumerate(sessions):
        for ngram in ngram_featurizer(session, n):
            try:
                X[i, mapping[ngram]] += 1
            except KeyError:
                pass
    
    return X, mapping

In [43]:
def subset(raw_set, max_clicks):
    subset = raw_set[(raw_set['len'] - raw_set['add']) > max_clicks]
    subset.sessions = raw_set.apply(lambda x: x.session[:(x['add'] + max_clicks) + 1], axis=1)
    return subset

training_subset = subset(training_set, 15)
eval_subset = subset(eval_set, 15)

  subset.sessions = raw_set.apply(lambda x: x.session[:(x['add'] + max_clicks) + 1], axis=1)


In [44]:
Xtrain, mapping = encode_sessions(training_subset.session, 3)
Xtest, _ = encode_sessions(eval_subset.session, 3, mapping)

In [45]:
# fit NB classifier (Complement)
from sklearn.naive_bayes import ComplementNB

NB = ComplementNB(alpha=1, fit_prior=True)
NB.fit(Xtrain,training_subset.purchase)
bayes_predictions = NB.predict(Xtest)

In [17]:
# fit NB classifier
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB(alpha=1, fit_prior=True)
NB.fit(Xtrain,training_subset.purchase.to_list())
bayes_predictions = NB.predict(Xtest)

In [35]:
from sklearn import metrics

acc = metrics.accuracy_score(eval_subset.purchase.to_list(), bayes_predictions)
acc

0.7052527254707631

In [46]:
f1 = metrics.f1_score(eval_subset.purchase.to_list(), bayes_predictions)
f1

0.5068762278978389

In [37]:
majority = 1 - np.mean(eval_set.purchase.to_list())
majority

0.7642702200185931