In [3]:
import numpy as np
import pandas as pd
#import csv
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from util.data_utils import create_seq_db_filter_top_k, sequences_to_spfm_format
from util.split import last_session_out_split
from util.metrics import precision, recall, mrr
from util import evaluation
from recommenders.FPMCRecommender import FPMCRecommender

In [5]:
import datetime

In [8]:
def get_test_sequences_and_users(test_data, given_k, train_users):
    """Select the test sequences (and their users) that can be evaluated.

    A row of ``test_data`` is kept when its ``sequence`` holds more than
    ``abs(given_k)`` elements and its ``user_id`` appears in ``train_users``.

    Returns a ``(sequences, users)`` pair of NumPy arrays taken from the
    ``sequence`` and ``user_id`` columns of the kept rows, aligned row by row.
    """
    min_length = abs(given_k)
    long_enough = test_data['sequence'].map(len) > min_length
    known_user = test_data['user_id'].isin(train_users)
    eligible = test_data[long_enough & known_user]
    return eligible['sequence'].values, eligible['user_id'].values

# 1. Load the dataset

For this hands-on session we will use the Ta Feng dataset.

In [9]:
# Read the merged Ta Feng CSV and discard the columns that are not needed
# for building per-user purchase sequences.
dataset_path = r'D:\work\ML_experiment\ta_feng_all_months_merged.csv'
data = pd.read_csv(dataset_path).drop(
    columns=['AGE_GROUP', 'PIN_CODE', 'PRODUCT_SUBCLASS', 'AMOUNT', 'ASSET', 'SALES_PRICE']
)

In [10]:
# Map the raw column names to the short names used in the rest of the notebook.
data = data.rename(columns={
    'TRANSACTION_DT': 'ts',
    'CUSTOMER_ID': 'user_id',
    'PRODUCT_ID': 'item_id',
})

In [11]:
# Preview the first rows after dropping and renaming columns.
data.head()

Unnamed: 0,ts,user_id,item_id
0,11/1/2000,1104905,4710199010372
1,11/1/2000,418683,4710857472535
2,11/1/2000,1057331,4710043654103
3,11/1/2000,1849332,4710126092129
4,11/1/2000,1981995,4710176021445


In [12]:
# Parse the month/day/year strings in `ts` into pandas datetimes.
data = data.assign(ts=pd.to_datetime(data['ts'], format='%m/%d/%Y'))

In [13]:
# Order the transactions by user, then chronologically within each user.
data = data.sort_values(['user_id', 'ts'])

In [14]:
# Display-only preview with a fresh 0..4 index; the result is not assigned,
# so `data` itself keeps its original index labels.
data.head().reset_index(drop=True)

Unnamed: 0,ts,user_id,item_id
0,2000-11-13,1069,9556439880610
1,2000-11-13,1069,4710176008699
2,2001-01-21,1069,4710320224661
3,2001-01-21,1069,4710022101208
4,2001-01-21,1069,4712603661644


In [15]:
# One group per (user, date) pair; as_index=False keeps the keys as columns.
groups = data.groupby(by=['user_id', 'ts'], as_index=False)

In [16]:
# Collapse every (user_id, ts) group into one row whose `sequence` column holds
# the group's item ids, in row order, converted to strings.
# The dict form `agg({'sequence': func})` on a single column was deprecated and
# then removed in pandas 1.0 (it raises SpecificationError), so aggregate the
# column first and rename it afterwards; this works on old and new pandas alike.
aggregated = (
    groups['item_id']
    .agg(lambda items: [str(item) for item in items])
    .rename(columns={'item_id': 'sequence'})
)

In [18]:
# Preview the per-(user, date) item sequences.
aggregated.head()

Unnamed: 0,user_id,ts,sequence
0,1069,2000-11-13,"[9556439880610, 4710176008699]"
1,1069,2001-01-21,"[4710320224661, 4710022101208, 4712603661644]"
2,1069,2001-02-03,"[4710088620156, 4710176008699, 22000167620, 47..."
3,1069,2001-02-10,[4712162000038]
4,1113,2000-11-12,"[4902105011621, 4711271000014]"


In [17]:
# Left disabled: at this point `aggregated` has no 'session_id' column (it is
# only added further down), so running the drop raises the KeyError shown below.
# aggregated.drop(['session_id'], axis=1, inplace=True)

KeyError: "['session_id'] not found in axis"

In [19]:
# Number of sessions = number of rows in `aggregated`.
# `aggregated.size / 3` only equals the row count while the frame has exactly
# three columns, so re-running this after `session_id` is added gives a wrong,
# non-integer value. `len()` is independent of the column count.
# Kept as a NumPy scalar so NumPy scalar methods remain available on `length`.
length = np.int64(len(aggregated))

In [20]:
# Convert the session count to a 32-bit integer.
# `np.int32(...)` accepts plain Python numbers as well as NumPy scalars, whereas
# `.astype` exists only on NumPy scalars and fails with AttributeError when
# `DataFrame.size` is a built-in int (as in recent pandas releases).
length = np.int32(length)

In [21]:
# Give every row (one per user and date) a running integer id starting at 0.
aggregated = aggregated.assign(session_id=np.arange(length))

In [22]:
# `dataset` is a second name for the same DataFrame object; no copy is made.
dataset = aggregated

In [23]:
# Preview the final session table, now including `session_id`.
dataset.head()

Unnamed: 0,user_id,ts,sequence,session_id
0,1069,2000-11-13,"[9556439880610, 4710176008699]",0
1,1069,2001-01-21,"[4710320224661, 4710022101208, 4712603661644]",1
2,1069,2001-02-03,"[4710088620156, 4710176008699, 22000167620, 47...",2
3,1069,2001-02-10,[4712162000038],3
4,1113,2000-11-12,"[4902105011621, 4711271000014]",4


In [24]:
# Persist the session table as a comma-separated file (the row index is written too).
target_path = r'D:\work\ML_experiment\dataset_ta_feng_all_months_merged.csv'
dataset.to_csv(target_path)

# 2. Split the dataset

In [25]:
# Split into train and test sessions with the project helper
# (presumably each user's last session becomes the test session -- confirm in util.split).
train_data, test_data = last_session_out_split(dataset)
print(
    "Train sessions: {} - Test sessions: {}".format(
        len(train_data),
        len(test_data),
    )
)

Train sessions: 87312 - Test sessions: 32266


In [26]:
# Preview the training sessions.
train_data.head()

Unnamed: 0,user_id,ts,sequence,session_id
0,1069,2000-11-13,"[9556439880610, 4710176008699]",0
1,1069,2001-01-21,"[4710320224661, 4710022101208, 4712603661644]",1
2,1069,2001-02-03,"[4710088620156, 4710176008699, 22000167620, 47...",2
4,1113,2000-11-12,"[4902105011621, 4711271000014]",4
5,1113,2000-11-26,"[4902105011621, 7616100830794, 4710892632017]",5


In [27]:
# Preview the held-out test sessions.
test_data.head()

Unnamed: 0,user_id,ts,sequence,session_id
3,1069,2001-02-10,[4712162000038],3
7,1113,2001-01-06,"[4710254015014, 4710008251125, 4710254015021, ...",7
9,1250,2001-02-10,"[4710015104841, 4710176001829, 4710176001812, ...",9
10,1359,2000-12-04,"[4710088410139, 4710017008123, 5010415080073]",10
13,1823,2001-01-24,"[78698703015, 4710114128038, 4710126392175]",13


# 3. Fitting the recommender

In [28]:
# Build the FPMC recommender and train it on the training sessions.
# NOTE(review): n_factor / n_epoch are presumably the latent dimension and the
# number of training epochs -- confirm against FPMCRecommender.
recommender = FPMCRecommender(n_factor=16, n_epoch=10)
recommender.fit(train_data)

2019-11-14 12:31:33,279 - INFO - epoch 0 done
2019-11-14 12:31:37,670 - INFO - epoch 1 done
2019-11-14 12:31:41,773 - INFO - epoch 2 done
2019-11-14 12:31:45,839 - INFO - epoch 3 done
2019-11-14 12:31:50,857 - INFO - epoch 4 done
2019-11-14 12:31:55,509 - INFO - epoch 5 done
2019-11-14 12:32:03,709 - INFO - epoch 6 done
2019-11-14 12:32:09,264 - INFO - epoch 7 done
2019-11-14 12:32:14,247 - INFO - epoch 8 done
2019-11-14 12:32:19,350 - INFO - epoch 9 done


# 4. Sequential evaluation

In [31]:
# Settings forwarded to the sequential evaluation below.
# NOTE(review): exact semantics live in util.evaluation -- confirm there.
GIVEN_K = 1          # passed as `given_k`; also the minimum-length threshold for test sequences
LOOK_AHEAD = 'all'   # passed as `look_ahead`
STEP = 1             # passed as `step`

In [32]:
TOPN = 5  # length of the recommendation list

# Metric name -> metric function, in the order they are handed to the evaluation.
METRICS = dict(precision=precision, recall=recall, mrr=mrr)

In [33]:
# Keep only test sequences that are long enough and belong to users seen in training.
test_sequences, test_users = get_test_sequences_and_users(
    test_data,
    GIVEN_K,
    train_data['user_id'].values,  # we need user ids now!
)
print(
    '{} sequences available for evaluation ({} users)'.format(
        len(test_sequences),
        len(np.unique(test_users)),
    )
)

17324 sequences available for evaluation (17324 users)


In [34]:
# Inspect the selected test sequences (an object array of item-id lists).
test_sequences

array([list(['4710254015014', '4710008251125', '4710254015021', '37000440147', '4901734003182', '9556001589101', '4710088620750']),
       list(['4710015104841', '4710176001829', '4710176001812', '723125488026', '20480349', '4710085120703', '4710085120628', '4710085150328', '4710085120710', '4710085120093']),
       list(['78698703015', '4710114128038', '4710126392175']), ...,
       list(['8712045000151', '8853301130622', '4710363352000', '4710043004236']),
       list(['4710626111351', '4710943109376', '4710363352000', '4710247007613']),
       list(['4710085120680', '4710085120697'])], dtype=object)

In [None]:
# Run the sequential evaluation over all selected test sequences.
# This is slow: the recorded progress bar shows roughly 1.3 s per sequence.
results = evaluation.sequential_evaluation(
    recommender,
    test_sequences=test_sequences,
    users=test_users,
    given_k=GIVEN_K,
    look_ahead=LOOK_AHEAD,
    evaluation_functions=METRICS.values(),
    top_n=TOPN,
    scroll=False,  # scrolling averages metrics over all profile lengths
    step=STEP,
)

  3%|██                                                                          | 459/17324 [10:42<5:57:37,  1.27s/it]