In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

In [2]:
# Import the preprocessed dataframes
train_raw_df = pd.read_csv("preprocessed_train_v2(3).csv",
    dtype={'fullVisitorId': str}, nrows=None)
test_raw_df = pd.read_csv("preprocessed_test_v2(3).csv",
    dtype={'fullVisitorId': str}, nrows=None)
train_raw_df.shape, test_raw_df.shape

((1708337, 37), (401589, 37))

In [3]:
train_raw_df.head()

Unnamed: 0,channelGrouping,customDimensions,fullVisitorId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,...,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source
0,3,2,3162355547410993243,1,1508198450,6,0,False,5,106,...,1,True,0.0,4,13,False,11,3,58,9
1,5,3,8934116514970143966,6,1508176307,3,0,False,1,24,...,1,True,0.0,4,13,False,12,4,57,21
2,1,3,7992466427990357681,1,1508201613,3,1,True,0,106,...,1,True,0.0,4,13,True,12,5,58,0
3,3,2,9075655783635761930,1,1508169851,3,0,False,5,106,...,1,True,0.0,4,13,False,12,3,58,9
4,3,1,6960673291025684308,1,1508190552,3,0,False,5,106,...,1,True,0.0,4,13,False,12,3,58,9


### Convert visitStartTime to different data features

In [4]:
def expand_visit_start_time(df):
    df['full_date'] = pd.to_datetime(df['visitStartTime'], unit='s')
    df['date'] = df['full_date'].dt.date
    df['day_of_week'] = df['full_date'].dt.dayofweek
    df['hour_of_day'] = df['full_date'].dt.hour
    df['day_of_month'] = df['full_date'].dt.day
    df['week_of_year'] = df['full_date'].dt.weekofyear
    
    # Drop date column
    df.drop(['full_date'], axis=1, inplace=True)
    df.drop(['visitStartTime'], axis=1, inplace=True)
    return df

#### add here new features

In [5]:
def apply_feature_engineering(df):
    
    df = expand_visit_start_time(df)
     
    return df

### Checkpoint

In [6]:
train_df = apply_feature_engineering(train_raw_df)
test_df = apply_feature_engineering(test_raw_df)
train_df.shape, test_df.shape

((1708337, 41), (401589, 41))

In [7]:
train_df.head()

Unnamed: 0,channelGrouping,customDimensions,fullVisitorId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,...,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,date,day_of_week,hour_of_day,day_of_month,week_of_year
0,3,2,3162355547410993243,1,6,0,False,5,106,3,...,False,11,3,58,9,2017-10-17,1,0,17,42
1,5,3,8934116514970143966,6,3,0,False,1,24,1,...,False,12,4,57,21,2017-10-16,0,17,16,42
2,1,3,7992466427990357681,1,3,1,True,0,106,1,...,True,12,5,58,0,2017-10-17,1,0,17,42
3,3,2,9075655783635761930,1,3,0,False,5,106,2,...,False,12,3,58,9,2017-10-16,0,16,16,42
4,3,1,6960673291025684308,1,3,0,False,5,106,1,...,False,12,3,58,9,2017-10-16,0,21,16,42


In [None]:
"""test_start_date = min(test_df.date)
test_end_date = max(test_df.date)

test_label_start_date = pd.to_datetime("2018-12-01").date()
test_label_end_date = pd.to_datetime("2019-01-31").date()

missing_days = (test_label_start_date - test_end_date).days
prediction_span = (test_label_end_date - test_label_start_date).days + 1

print("There is a gap of %d days between the last day of our dataset and the prediction" % missing_days)
print("We have to predict %d days after this gap" % prediction_span)"""

In [8]:
train_start = pd.to_datetime("2017-05-01").date()
train_end = pd.to_datetime("2017-10-16").date()

train_label_start = pd.to_datetime("2017-12-01").date()
train_label_end = pd.to_datetime("2018-01-31").date()

In [9]:
labeled_train = train_df[(train_df['date'].values <= train_end) & (train_df['date'].values >= train_start)]
temp_train_label = train_df[(train_df['date'].values <= train_label_end) & (train_df['date'].values >= train_label_start)]
labeled_train.shape, temp_train_label.shape

((427826, 41), (180572, 41))

In [10]:
# Create a dataframe containing only couples of fullvisitorid and totaltransactionrevenue when the latter is not null
def zip_df_on_revenue(df):
    fullID = []
    not_null_transactions = []
    for row in df.itertuples():
        if(row.totals_transactionRevenue > 0):
            fullID.append(row.fullVisitorId)
            not_null_transactions.append(row.totals_transactionRevenue)
    temp_df = pd.DataFrame({'fullVisitorId': fullID})
    not_null_transactions_temp_df = pd.DataFrame({'totals_transactionRevenue': not_null_transactions})
    temp_df = temp_df.join(not_null_transactions_temp_df)
    return temp_df

In [11]:
zipped_label = zip_df_on_revenue(temp_train_label)
zipped_label.shape

(1405, 2)

In [12]:
%%time
label = []

for user in labeled_train['fullVisitorId'].values:
    temp = 0
    if(user in zipped_label.fullVisitorId.values):
        for record in zipped_label.itertuples():
            if (record.fullVisitorId == user):
                temp += record.totals_transactionRevenue
    label.append(temp)

Wall time: 21.3 s


In [None]:
"""%%time
# Working but slower version of the previous algorithm
label_old = []

for user in labeled_train['fullVisitorId']:
    temp = 0
    for record in zipped_label.itertuples():
        if (record.fullVisitorId == user):
            temp += record.totals_transactionRevenue
    label_old.append(temp)
    
if(label_old == label):
    print("The two algorithms are equivalent")"""

In [13]:
labeled_train = labeled_train.reset_index(drop=True) # Important!

temp_label = pd.DataFrame({'label': label})
temp_label = temp_label.reset_index(drop=True) # Important!

labeled_train = labeled_train.join(temp_label)

labeled_train.head()

Unnamed: 0,channelGrouping,customDimensions,fullVisitorId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,...,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,date,day_of_week,hour_of_day,day_of_month,week_of_year,label
0,5,3,8934116514970143966,6,3,0,False,1,24,1,...,12,4,57,21,2017-10-16,0,17,16,42,15990000.0
1,3,2,9075655783635761930,1,3,0,False,5,106,2,...,12,3,58,9,2017-10-16,0,16,16,42,0.0
2,3,1,6960673291025684308,1,3,0,False,5,106,1,...,12,3,58,9,2017-10-16,0,21,16,42,0.0
3,5,3,166277907528479249,1,3,0,False,3,86,1,...,12,5,21,0,2017-10-16,0,23,16,42,0.0
4,5,2,8349655975937271469,1,3,0,False,3,52,3,...,12,4,57,21,2017-10-16,0,11,16,42,0.0


In [14]:
labeled_train

Unnamed: 0,channelGrouping,customDimensions,fullVisitorId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,...,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,date,day_of_week,hour_of_day,day_of_month,week_of_year,label
0,5,3,8934116514970143966,6,3,0,False,1,24,1,...,12,4,57,21,2017-10-16,0,17,16,42,15990000.0
1,3,2,9075655783635761930,1,3,0,False,5,106,2,...,12,3,58,9,2017-10-16,0,16,16,42,0.0
2,3,1,6960673291025684308,1,3,0,False,5,106,1,...,12,3,58,9,2017-10-16,0,21,16,42,0.0
3,5,3,0166277907528479249,1,3,0,False,3,86,1,...,12,5,21,0,2017-10-16,0,23,16,42,0.0
4,5,2,8349655975937271469,1,3,0,False,3,52,3,...,12,4,57,21,2017-10-16,0,11,16,42,0.0
5,3,2,1259490915281096752,2,10,1,True,7,106,3,...,12,3,58,9,2017-10-16,0,14,16,42,0.0
6,3,2,7390444353235629134,1,3,0,False,1,106,3,...,12,3,58,9,2017-10-16,0,12,16,42,0.0
7,3,2,1297236773919696722,1,10,0,False,3,106,3,...,12,3,58,9,2017-10-16,0,13,16,42,0.0
8,3,3,5983987899933814948,1,10,1,True,7,106,1,...,12,3,58,9,2017-10-16,0,18,16,42,0.0
9,4,3,6135613929977117121,10,10,1,True,7,106,1,...,0,1,58,9,2017-10-16,0,15,16,42,0.0


In [19]:
for i in labeled_train.itertuples():
    if i.fullVisitorId == '8934116514970143966':
        print(i.label)

15990000.0
15990000.0
15990000.0
15990000.0
15990000.0


### Data preparation for catboost

In [None]:
labeled_train.columns

In [None]:
cont_features = ['totals_hits',
                'totals_timeOnSite',
                'totals_pageviews', 
                'visitNumber',
                'totals_totalTransactionRevenue',
                'totals_transactionRevenue',
                'totals_transactions',
                'totals_sessionQualityDim']

cat_features = ['fullVisitorId',
                'customDimensions',
                'visitId', #
                'channelGrouping', 
                'device_browser', 
                'device_deviceCategory',
                'device_operatingSystem', 
                'geoNetwork_continent',
                'geoNetwork_subContinent', 
                'trafficSource_medium', 
                'geoNetwork_country', 
                'geoNetwork_metro', 
                'geoNetwork_city', 
                'geoNetwork_region',
                'geoNetwork_networkDomain',
                'trafficSource_source', 
                'day_of_week', 
                'hour_of_day', 
                'day_of_month',
                'week_of_year', 
                'trafficSource_adContent', 
                'trafficSource_adwordsClickInfo.adNetworkType', 
                'trafficSource_adwordsClickInfo.gclId',
                'trafficSource_adwordsClickInfo.page', 
                'trafficSource_adwordsClickInfo.slot', 
                'trafficSource_keyword', 
                'trafficSource_referralPath',
                'trafficSource_campaign',
                #ex booleans
                'device_isMobile', #
                'trafficSource_isTrueDirect', #
                'trafficSource_adwordsClickInfo.isVideoAd', #
                'totals_bounces', #
                'totals_newVisits' #
               ]

In [None]:
train = labeled_train[cont_features + cat_features]
train_y = labeled_train['label'].values
log_y = np.log1p(train_y)

In [None]:
#check if we took all the features except totals_totalTransactionRevenue (yep!)
for c in labeled_train.columns:
    if c not in train.columns:
        print(c)

### Feature ranking with catboost

In [None]:
#generate categorical column indices dinamically
cat_positions = []
for i in cat_features:
    cat_positions.append(train.columns.get_loc(i))
# cat_positions

In [None]:
%%time
from catboost import CatBoostRegressor
from catboost import Pool

# necessary for feature importance ranking
pool = Pool(train, log_y, cat_features = cat_positions)


#fit catboost regressor
cat = CatBoostRegressor(
    random_seed = seed,
    loss_function = 'RMSE',
    eval_metric = 'RMSE',
    logging_level='Silent',
    task_type = 'GPU'
)


cat.fit(pool)

### Inspect features inside the model

In [None]:
# get features importance ranking
# the individual importance values for each of the input features
features = train.columns
importances = np.asarray(cat.get_feature_importance(pool, fstr_type='FeatureImportance'))
indices = np.argsort(importances)# top features
plt.figure(figsize=(24, 12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
import shap

# load JS visualization code to notebook
shap.initjs()
shap_values = np.asarray(cat.get_feature_importance(pool, fstr_type='ShapValues'))

In [None]:
# summarize the effects of all the features
shap.summary_plot(
    shap_values[:,:pool.shape[1]], 
    features = train, 
    feature_names = train.columns,
    max_display = 100
)

In [None]:
shap_values_lol = shap.TreeExplainer(cat).shap_values(pool)

In [None]:
if(shap_values == shap_values_lol):
    print("ciao")

In [None]:
shap.dependence_plot('fullVisitorId', shap_values_lol, train)

In [None]:
explainer = shap.TreeExplainer(cat)
print(explainer.expected_value)

In [None]:
shap_values.shape

In [None]:
train.shape

In [None]:
# visualize the first prediction's explanation
shap.force_plot(0, shap_values[0,:pool.shape[1]], train.iloc[0,:])

In [None]:
# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values[:,:pool.shape[1]], train)

### Export Dataframes

In [None]:
labeled_train.to_csv('featured_train_v2(4).csv', index = False)
test_df.to_csv('featured_test_v2(4).csv', index = False)