## Part 2. The previous model does not account for the class imbalance caused by time-based division.

Written by Sundong Kim (sundong.kim@kaist.ac.kr), Jan 14th, 2019.

In this notebook, we describe our more realistic benchmark dataset.

* First, we show the ineffectiveness of our old model.
    * We first generate basic statistical features from each visit.
        * total dwell time
        * the number of areas
        * the number of unique areas
    * Then we perform binary classification to predict customer revisit intention.
* Second, we show how to deal with this issue by introducing time-related features, and show its drawbacks.

In [1]:
import pandas as pd
import numpy as np
import os
import random

# pre_release_path = '../data_sample/indoor/store_A/'

In [2]:
# This tutorial uses the store_A dataset.
#   /data: 50,000 user dataset    /data_sample: 500 user sample dataset
pre_release_path = '../data/indoor/store_A/'


def _read_table(file_name):
    """Read one tab-separated table from the dataset directory."""
    return pd.read_csv(pre_release_path+file_name, sep='\t')


# Labels and visits for both splits
train_labels = _read_table('train_labels.tsv')
test_labels = _read_table('test_labels.tsv')
train_visits = _read_table('train_visits.tsv')
test_visits = _read_table('test_visits.tsv')

# Wi-Fi sessions are keyed by their 'index' column so visits can look them up by id
wifi_sessions = _read_table('wifi_sessions.tsv').set_index('index')

In [3]:
### Before feature engineering, querying some useful information from wifi-sessions data, and add to the dataframe.
import time
def add_infos(df, sessions=None):
    """Attach per-visit session details looked up from the wifi-sessions table.

    The ';'-separated ids in ``df['indices']`` are parsed into lists of ints
    and used to look up rows of ``sessions``. Four columns are added to ``df``
    in place:

    * ``l_index``     - list of session ids of the visit
    * ``dwell_times`` - list of the sessions' ``dwell_time`` values
    * ``areas``       - list of the sessions' ``area`` values
    * ``ts_end``      - the largest ``ts + dwell_time`` among the visit's sessions

    Two elapsed times (parsing, lookup) are printed, as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Visits table with a string column ``indices``.
    sessions : pandas.DataFrame, optional
        Table indexed by session id with columns ``area``, ``dwell_time`` and
        ``ts``. Defaults to the notebook-level ``wifi_sessions``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns.

    Raises
    ------
    ValueError
        If the lookup does not return exactly one row per session id (for
        example when the session index contains duplicate labels), because the
        per-visit slices would otherwise be silently misaligned.
    """
    if sessions is None:
        sessions = wifi_sessions  # table loaded earlier in the notebook

    tst = time.time()
    df['l_index'] = df['indices'].apply(lambda x: [int(y) for y in x.split(';')])
    t1 = time.time()
    print(t1-tst)

    # A single bulk .loc lookup over the flattened ids replaces the per-id
    # lookups of the earlier (very slow) approach.
    flat_ids = [idx for visit_ids in df['l_index'] for idx in visit_ids]
    looked_up = sessions.loc[flat_ids]
    if len(looked_up) != len(flat_ids):
        raise ValueError(
            'session lookup returned {} rows for {} ids; '
            'the session index must be unique'.format(len(looked_up), len(flat_ids)))

    flat_dwell = list(looked_up['dwell_time'])
    flat_areas = list(looked_up['area'])
    flat_ts_end = list(np.array(looked_up['ts']) + np.array(flat_dwell))  # end time

    # Cut the flat lists back into one slice per visit.
    rslt_dt = []
    rslt_areas = []
    rslt_ts_end = []
    start = 0
    for length in df['l_index'].apply(len):
        stop = start + length
        rslt_dt.append(flat_dwell[start:stop])
        rslt_areas.append(flat_areas[start:stop])
        rslt_ts_end.append(max(flat_ts_end[start:stop]))
        start = stop

    df['dwell_times'] = rslt_dt
    df['areas'] = rslt_areas
    df['ts_end'] = rslt_ts_end

    t2 = time.time()
    print(t2-t1)
    return df

In [4]:
# Enrich both splits with per-visit dwell times, areas and end timestamps.
# Each call prints two elapsed times (index parsing, then session lookup).
train_visits = add_infos(train_visits)
test_visits = add_infos(test_visits)

0.08652567863464355
0.40163564682006836
0.04873180389404297
0.21306967735290527


In [5]:
# Preview the per-visit lists of dwell times added by add_infos.
train_visits.dwell_times.head(3)

0                                       [134, 42, 134]
1                         [298, 354, 59, 169, 110, 49]
2    [43, 47, 1141, 263, 233, 133, 143, 116, 798, 7...
Name: dwell_times, dtype: object

In [6]:
### Sample code to generate features 

def statistical_feature_generator(x):
    """Compute the three basic statistics of a single visit.

    ``x`` is one visit (anything indexable by 'dwell_times' and 'areas').
    Returns ``[total dwell time, number of areas, number of unique areas]``.
    """
    dwell_times = x['dwell_times']
    return [
        sum(dwell_times),        # total dwell time
        len(dwell_times),        # number of areas in the trajectory
        len(set(x['areas'])),    # number of unique areas sensed
    ]


def add_statistical_features(train_visits):
    """Return a copy of the visits table with the statistical feature columns appended.

    Every row is passed to ``statistical_feature_generator`` and the resulting
    values become the columns 'total_dwell_time', 'num_area' and
    'num_unique_area'. The input frame itself is not modified.
    """
    feature_names = ['total_dwell_time', 'num_area', 'num_unique_area']
    visits = train_visits.copy()

    # One list of feature values per visit, keyed by the visit's index label
    per_visit = visits.apply(statistical_feature_generator, axis=1)
    feature_frame = pd.DataFrame(
        np.asarray(per_visit).tolist(),
        index=per_visit.index,
        columns=feature_names,
    )

    # Combine feature values with the original columns
    return pd.concat([visits, feature_frame], axis=1)

In [7]:
# Append total_dwell_time / num_area / num_unique_area to both splits.
train_visits = add_statistical_features(train_visits)
test_visits = add_statistical_features(test_visits)

In [8]:
# Express visit dates relative to the earliest date in the training split.
# The same origin is used for the test split so both share one time axis.
first_train_date = min(train_visits.date)
train_visits['date_rel'] = train_visits['date'] - first_train_date
test_visits['date_rel'] = test_visits['date'] - first_train_date

In [9]:
# Preview the visits table with the engineered columns.
train_visits.head(3)

Unnamed: 0,visit_id,wifi_id,date,indices,l_index,dwell_times,areas,ts_end,total_dwell_time,num_area,num_unique_area,date_rel
0,v0,4,17173,36717;36720;36722,"[36717, 36720, 36722]","[134, 42, 134]","[1f-d, 1f-c, 1f-e]",1483786916,310,3,3,6
1,v1,7,17323,490000;490008;490019;490260;490262;490265,"[490000, 490008, 490019, 490260, 490262, 490265]","[298, 354, 59, 169, 110, 49]","[1f-d, 1f-c, 1f-f, 1f-d, 1f-e, 1f-f]",1496746715,1039,6,4,156
2,v2,8,17201,155980;155981;155992;156014;156024;156039;1560...,"[155980, 155981, 155992, 156014, 156024, 15603...","[43, 47, 1141, 263, 233, 133, 143, 116, 798, 7...","[1f-d, 1f-c, b1_only, 2f-e, 2f-a, 2f-b, 2f-d, ...",1486187329,4488,16,14,34


### 3. Revisit Prediction (Binary classification)

In [10]:
import time
import xgboost as xgb
from sklearn import metrics
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

def show_intention_classification_result(y_pred, y_test):
    """Return the accuracy of predicted labels ``y_pred`` against true labels ``y_test``.

    Note the argument order: predictions first, ground truth second; they are
    swapped when forwarded to ``metrics.accuracy_score``.
    """
    return metrics.accuracy_score(y_test, y_pred)

def show_interval_regression_result(y_pred, y_test):
    """Return the mean squared error of predictions ``y_pred`` against targets ``y_test``.

    Note the argument order: predictions first, ground truth second; they are
    swapped when forwarded to ``metrics.mean_squared_error``.
    """
    return metrics.mean_squared_error(y_test, y_pred)

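Downsampling for measuring binary classification accuracy (currently disabled: `label_balancing` below returns the data unchanged, so the classes stay imbalanced)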

In [11]:
# Put the two label columns next to the visit features, side by side.
# NOTE(review): axis=1 concatenation aligns on the row index, so this assumes
# the visits and labels tables list the same visits under the same index - confirm.
label_columns = ['revisit_intention', 'revisit_interval']
df_train = pd.concat([train_visits, train_labels[label_columns]], axis=1)
df_test = pd.concat([test_visits, test_labels[label_columns]], axis=1)

In [12]:
## Generate 'suppress_time' column for evaluation
def generate_suppress_time_col(df):
    """Add a 'suppress_time' column to ``df`` (in place) and return ``df``.

    For a row with a ``revisit_interval`` the value is the larger of that
    interval and 0. For a row without one, the value is the gap between the
    row's ``ts_end`` and the latest ``ts_end`` in ``df``, divided by 86400
    (i.e. in days, assuming ``ts_end`` is in seconds).
    """
    seconds_per_day = 86400
    latest_end = max(df['ts_end'])

    # Time left between each visit's end and the last observed visit end
    remaining_days = pd.Series(
        [(latest_end - ts_end) / seconds_per_day for ts_end in df['ts_end']],
        index=df.index,
    )

    interval = df['revisit_interval']
    observed_part = interval.fillna(0)                    # 0 where no revisit was recorded
    censored_part = interval.isnull() * remaining_days    # 0 where a revisit was recorded
    df['suppress_time'] = np.maximum(observed_part, censored_part)
    return df
    
# Each split is processed on its own, so the "latest ts_end" reference point
# is taken from within that split.
df_train = generate_suppress_time_col(df_train)
df_test = generate_suppress_time_col(df_test)

In [13]:
### Retain only feature values

def remove_unnecessary_features(df, unnecessary_attributes=None):
    """Return a new frame without the helper columns that are not model inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is not modified.
    unnecessary_attributes : list of str, optional
        Columns to remove. Defaults to the raw/helper columns produced during
        feature engineering. 'wifi_id' is not in the default list, so it is
        retained.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the listed columns. Listed columns that are absent
        from ``df`` are ignored.
    """
    if unnecessary_attributes is None:
        unnecessary_attributes = ['visit_id', 'indices', 'l_index', 'dwell_times', 'areas', 'ts_end']

    # errors='ignore' skips only missing column labels, unlike the previous
    # bare `except: pass`, which would have hidden any kind of error.
    return df.drop(columns=list(unnecessary_attributes), errors='ignore')

# Keep only the columns used for modelling and evaluation in both splits.
df_train = remove_unnecessary_features(df_train)
df_test = remove_unnecessary_features(df_test)

In [14]:
def label_balancing(df, name_target_column, downsample=False, random_state=None):
    """Optionally balance the classes of ``df`` by random 1:1 downsampling.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the target column.
    name_target_column : str
        Name of the label column to balance on.
    downsample : bool, default False
        When False (the default, matching the previous behaviour) ``df`` is
        returned unchanged. When True, every label value is randomly reduced
        to the size of the rarest label.
    random_state : int or None, default None
        Seed for the random permutation, for reproducible downsampling.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``downsample`` is False or ``df`` is empty;
        otherwise a new frame with equally sized classes, sorted by index.
    """
    ## No downsampling
    if not downsample or df.empty:
        return df

    ## 1:1 Downsampling
    labels = df[name_target_column]
    minimum_label_num = labels.value_counts().min()
    rng = np.random.RandomState(random_state)

    df_list = []
    for value in labels.unique():
        sub_df = df.loc[labels == value]
        # Random downsampling according to the smallest label size
        keep = rng.permutation(len(sub_df))[:minimum_label_num]
        df_list.append(sub_df.iloc[keep])

    return pd.concat(df_list).sort_index()

In [15]:
# Preview the modelling table: features first, then the two label columns and suppress_time.
df_train.head(3)

Unnamed: 0,wifi_id,date,total_dwell_time,num_area,num_unique_area,date_rel,revisit_intention,revisit_interval,suppress_time
0,4,17173,310,3,3,6,0,,174.071262
1,7,17323,1039,6,4,156,0,,24.073588
2,8,17201,4488,16,14,34,0,,146.288704


In [16]:
# Share of visits with revisit_intention == 0 in train and test (assuming 0/1 labels).
# This is the accuracy obtained by always predicting "no revisit".
1-sum(df_train.revisit_intention)/len(df_train.revisit_intention), 1-sum(df_test.revisit_intention)/len(df_test.revisit_intention)

(0.7957084589466217, 0.7960771331622941)

In [17]:
# Preview the evaluation target generated by generate_suppress_time_col.
df_test['suppress_time'].head(5)

0     11.268449
1    110.364028
2     18.660000
3     15.050000
4      4.890000
Name: suppress_time, dtype: float64

In [18]:
import lifelines

print('-----------   Experiments Begin   -------------')
print()

# Two single-step pipelines; the step name ('classification' / 'regression')
# is what the loop below uses to pick the evaluation branch.
clfs = [Pipeline([('classification', XGBClassifier(max_depth=5, learning_rate=0.1))]), 
                Pipeline([('regression', XGBRegressor(max_depth=5, learning_rate=0.1))]),
               ]

# 'no_date': the date and date_rel columns are dropped; 'date': all columns are kept.
options = ['no_date', 'date']

# rslt[option][pipeline][metric] -> list with one value per repetition.
# The pipeline objects themselves are used as dictionary keys.
rslt = {}
for opt in options:
    rslt[opt] = {}
    for clf in clfs:
        rslt[opt][clf] = {}
        rslt[opt][clf]['elapsed_time'] = []
        for metric in ['accuracy', 'precision_macro', 'recall_macro', 'fscore_macro', 'MSE', 'cindex']:
            rslt[opt][clf][metric] = []


## Refine this later (using the actual cut-off point)
# NOTE(review): these two values are not used anywhere else in this cell.
train_date_max = max(df_train.date)
test_date_max = max(df_test.date)
    
# NOTE(review): label_balancing currently returns its input unchanged, so both
# repetitions train and evaluate on exactly the same data.
for i in range(2):
    for opt in ['no_date', 'date']:
        ## Making downsampled dataset for measuring binary classification accuracy - baseline = 0.5
        if opt == 'no_date':
            whole_balanced_train = label_balancing(df_train, 'revisit_intention')[df_train.columns.drop(['date','date_rel'])]
            whole_balanced_test = label_balancing(df_test, 'revisit_intention')[df_train.columns.drop(['date','date_rel'])]   
            
        if opt == 'date':
            whole_balanced_train = label_balancing(df_train, 'revisit_intention') 
            whole_balanced_test = label_balancing(df_test, 'revisit_intention')
    
        # Report the class counts once, on the first pass only.
        if (i == 0) and (opt == 'no_date'):
            print('Class label distribution after downsampling - Train data: revisit_intention 0: {}, 1: {}'.format(
                whole_balanced_train.revisit_intention.value_counts()[0],
                whole_balanced_train.revisit_intention.value_counts()[1]))
            print('Class label distribution after downsampling - Test data: revisit_intention 0: {}, 1: {}'.format(
                whole_balanced_test.revisit_intention.value_counts()[0],
                whole_balanced_test.revisit_intention.value_counts()[1]))


        for (train_data, test_data, ref) in [(whole_balanced_train, whole_balanced_test, 'Downsampled')]:
            # Column layout relied on below: the last three columns are
            # revisit_intention (-3), revisit_interval (-2) and suppress_time (-1);
            # everything before them is used as a feature (this includes wifi_id).
            train_array = np.asarray(train_data)  
            test_array = np.asarray(test_data)  

            for clf in clfs:
                if clf.steps[0][0] == 'classification':

                    # Dividing features and labels
                    X_train, y_train = train_array[:, :-3], train_array[:, -3].astype(int)
                    X_test, y_test = test_array[:, :-3], test_array[:, -3].astype(int)

                    # Training
                    start = time.time()
                    clf = clf.fit(X_train, y_train)

                    # Prediction
                    y_pred = clf.predict(X_test)

                    acc = metrics.accuracy_score(y_test, y_pred)
                    pm,rm,fm,_ = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')

                    rslt[opt][clf]['accuracy'].append(acc)
                    rslt[opt][clf]['precision_macro'].append(pm)
                    rslt[opt][clf]['recall_macro'].append(rm)
                    rslt[opt][clf]['fscore_macro'].append(fm)

                else:
                    # Dividing features and labels
                    # The regression target is suppress_time; y_test_bin is the
                    # revisit_intention flag of the test rows.
                    X_train, y_train = train_array[:, :-3], train_array[:, -1]
                    X_test, y_test = test_array[:, :-3], test_array[:, -1]
                    y_test_bin = test_array[:,-3]
                    

                    # Training
                    start = time.time()
                    clf = clf.fit(X_train, y_train)

                    # Prediction
                    y_pred = clf.predict(X_test)
#                     mse = metrics.mean_squared_error(y_test, y_pred)

#                     y_test_max = max(y_test)
                    # MSE is computed only on rows whose revisit was observed (flag == 1)
                    mse = metrics.mean_squared_error(y_test[y_test_bin == 1], y_pred[y_test_bin == 1])
                    
                    # The concordance index uses all rows, with the flag marking observed events
                    cindex = lifelines.utils.concordance_index(y_test, y_pred, event_observed= y_test_bin == 1)

                    rslt[opt][clf]['MSE'].append(mse)
                    rslt[opt][clf]['cindex'].append(cindex)


                # Elapsed time runs from just before fit until after the metrics were computed
                done = time.time()
                elapsed = done-start
                rslt[opt][clf]['elapsed_time'].append(elapsed)
                
                # NOTE(review): these are overwritten for every pipeline, so after the
                # loops they hold the values of the last pipeline in clfs (the regressor).
                # A later cell reads them, together with y_test_bin, as leaked loop state.
                if opt == 'no_date':
                    no_date_pred = y_pred
                    no_date_test = y_test
                if opt == 'date':
                    date_pred = y_pred
                    date_test = y_test
                    
                
                

print()
print('-----------   Performance of our model   -------------')
print()

    
# Average every collected metric per option and pipeline; the label is the
# first 10 characters of the estimator's string representation.
for opt in options:
    print('Option: {}'.format(opt))
    for clf in clfs:
        for key in rslt[opt][clf].keys():
            if len(rslt[opt][clf][key]) > 0:
                print('Average {} (with {}): {:.4f}'.format(key, str(clf.steps[0][1])[:10], np.mean(rslt[opt][clf][key])))
    print()


-----------   Experiments Begin   -------------

Class label distribution after downsampling - Train data: revisit_intention 0: 31409, 1: 8064
Class label distribution after downsampling - Test data: revisit_intention 0: 19238, 1: 4928


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



-----------   Performance of our model   -------------

Option: no_date
Average elapsed_time (with XGBClassif): 1.9959
Average accuracy (with XGBClassif): 0.7960
Average precision_macro (with XGBClassif): 0.5647
Average recall_macro (with XGBClassif): 0.5001
Average fscore_macro (with XGBClassif): 0.4438
Average elapsed_time (with XGBRegress): 1.9219
Average MSE (with XGBRegress): 3139.9780
Average cindex (with XGBRegress): 0.6216

Option: date
Average elapsed_time (with XGBClassif): 2.1140
Average accuracy (with XGBClassif): 0.7961
Average precision_macro (with XGBClassif): 0.3980
Average recall_macro (with XGBClassif): 0.5000
Average fscore_macro (with XGBClassif): 0.4432
Average elapsed_time (with XGBRegress): 2.3320
Average MSE (with XGBRegress): 3741.0004
Average cindex (with XGBRegress): 0.5756



In [19]:
### The scores looked too low, so print the raw values to inspect them directly
# Columns: no_date prediction, date prediction, target (suppress_time), revisit flag.
# NOTE(review): no_date_pred, date_pred, date_test and y_test_bin are loop
# variables left over from the experiment cell above.
print(np.array([no_date_pred, date_pred, date_test, y_test_bin]).T[:10])
print()
### Concordance index of no_date_pred on a sample of the data
num_data_check = 5000 # len(y_test_bin)
print(lifelines.utils.concordance_index(date_test[:num_data_check], no_date_pred[:num_data_check], event_observed= y_test_bin[:num_data_check] == 1))
### Concordance index of date_pred on a sample of the data
print(lifelines.utils.concordance_index(date_test[:num_data_check], date_pred[:num_data_check], event_observed= y_test_bin[:num_data_check] == 1))

[[1.17709457e+02 1.27671843e+01 1.12684491e+01 0.00000000e+00]
 [1.23332581e+02 1.43144484e+01 1.10364028e+02 0.00000000e+00]
 [9.87283173e+01 1.20967216e+01 1.86600000e+01 1.00000000e+00]
 [9.11858521e+01 1.26165447e+01 1.50500000e+01 1.00000000e+00]
 [7.03873825e+01 1.05440927e+00 4.89000000e+00 1.00000000e+00]
 [9.86553802e+01 4.26080823e-03 9.82105671e+01 0.00000000e+00]
 [8.15761490e+01 3.48660111e+00 5.72840625e+01 0.00000000e+00]
 [7.03582916e+01 3.74392486e+00 1.19322917e+02 0.00000000e+00]
 [6.01868744e+01 2.83416104e+00 1.43226296e+02 0.00000000e+00]
 [6.01307602e+01 3.86750865e+00 1.11130810e+02 0.00000000e+00]]

0.6127244131277473
0.5872734374160369


In [60]:
# whole_balanced_train still holds the value from the last experiment-loop
# iteration (the 'date' option), so the date columns are present.
whole_balanced_train.tail(3)

Unnamed: 0,wifi_id,date,total_dwell_time,num_area,num_unique_area,date_rel,revisit_intention,revisit_interval,suppress_time
42685,99995,17190,5922,9,9,23,0,,157.257627
42686,99996,17236,4601,9,9,69,0,,111.211296
42687,99999,17250,1294,8,8,83,0,,97.223218


In [109]:
from lifelines import CoxPHFitter

# Using Cox Proportional Hazards model
cph = CoxPHFitter()
# `event_col` refers to whether the 'death' events was observed: 1 if observed, 0 else (censored)
# Duration column: suppress_time. Covariates: every remaining column except
# revisit_interval and wifi_id.
# NOTE(review): both `date` and `date_rel` are passed as covariates, but date_rel
# is date minus a constant, so the two are perfectly collinear. Consider dropping
# one of them before fitting.
cph.fit(whole_balanced_train[whole_balanced_train.columns.drop(['revisit_interval','wifi_id'])], 
        'suppress_time', event_col='revisit_intention', show_progress=False)
cph.print_summary()

n=42688, number of events=17937

                    coef  exp(coef)  se(coef)        z      p  lower 0.95  upper 0.95     
date              0.0469     1.0481 4214.4548   0.0000 1.0000  -8260.1327   8260.2266     
total_dwell_time -0.0000     1.0000    0.0000  -1.9407 0.0523     -0.0000      0.0000    .
num_area          0.0022     1.0022    0.0050   0.4425 0.6581     -0.0075      0.0119     
num_unique_area  -0.0604     0.9414    0.0056 -10.8235 0.0000     -0.0713     -0.0495  ***
date_rel         -0.0390     0.9617 4214.4548  -0.0000 1.0000  -8260.2187   8260.1406     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Concordance = 0.604
Likelihood ratio test = 4121.607 on 5 df, p=0.00000


In [110]:
# Score the fitted Cox model on the test split: predict from the covariate
# columns only, then compare column 0 of the prediction with suppress_time via
# the concordance index (revisit_intention marks observed events).
test_pred = cph.predict_expectation(whole_balanced_test[whole_balanced_test.columns.drop(['revisit_interval','revisit_intention','suppress_time','wifi_id'])])
score_test = lifelines.utils.concordance_index(whole_balanced_test['suppress_time'], test_pred[0], event_observed=whole_balanced_test['revisit_intention'])
score_test

0.5490959260786329

In [111]:
# Same evaluation as the previous cell, but using predict_median; the
# prediction is read from the column labelled 0.5.
test_pred = cph.predict_median(whole_balanced_test[whole_balanced_test.columns.drop(['revisit_interval','revisit_intention','suppress_time','wifi_id'])])
score_test = lifelines.utils.concordance_index(whole_balanced_test['suppress_time'], test_pred[0.5], event_observed=whole_balanced_test['revisit_intention'])
score_test

0.5490977443069198

In [112]:
# Side-by-side view: observed suppress_time, the prediction from the previous cell, and the event flag.
pd.concat([whole_balanced_test['suppress_time'], test_pred[0.5], whole_balanced_test['revisit_intention']],axis=1)

Unnamed: 0,suppress_time,0.5,revisit_intention
0,137.344502,34.21,0
1,8.020000,48.07,1
2,0.710000,13.07,1
3,137.344722,48.16,0
4,176.123993,58.06,0
5,24.048727,11.79,0
6,26.060000,8.73,1
7,10.920000,34.05,1
8,1.190000,71.84,1
9,108.090000,58.00,1


In [113]:
## C-index test

# Sanity check of concordance_index on a small hand-made example with censored entries.
lifelines.utils.concordance_index(np.array([1,2,3,4,5,6,7]), np.array([4,2,3,5,1,6,7]), event_observed=np.array([1,0,1,0,1,0,1]))

0.6666666666666666