In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

In [2]:
# Import the preprocessed dataframes
train_raw_df = pd.read_csv("preprocessed_train_v2(3).csv",
    dtype={'fullVisitorId': str}, nrows=None)
test_raw_df = pd.read_csv("preprocessed_test_v2(3).csv",
    dtype={'fullVisitorId': str}, nrows=None)
train_raw_df.shape, test_raw_df.shape

((1708337, 38), (401589, 38))

In [None]:
train_raw_df.head()

### Convert visitStartTime to different data features

In [3]:
def expand_visit_start_time(df):
    df['full_date'] = pd.to_datetime(df['visitStartTime'], unit='s')
    df['date'] = df['full_date'].dt.date
    df['day_of_week'] = df['full_date'].dt.dayofweek
    df['hour_of_day'] = df['full_date'].dt.hour
    df['day_of_month'] = df['full_date'].dt.day
    df['week_of_year'] = df['full_date'].dt.weekofyear
    
    # Drop date column
    df.drop(['full_date'], axis=1, inplace=True)
    return df

##### add here new features

In [4]:
def apply_feature_engineering(df):
    
    df = expand_visit_start_time(df)
     
    return df

### Checkpoint

In [5]:
train_df = apply_feature_engineering(train_raw_df)
test_df = apply_feature_engineering(test_raw_df)
train_df.shape, test_df.shape

((1708337, 43), (401589, 43))

In [6]:
train_df.head()

Unnamed: 0,channelGrouping,customDimensions,fullVisitorId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,...,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,date,day_of_week,hour_of_day,day_of_month,week_of_year
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",3162355547410993243,1508198450,1,1508198450,Firefox,desktop,False,Windows,...,False,water bottle,organic,unknown,google,2017-10-17,1,0,17,42
1,Referral,"[{'index': '4', 'value': 'North America'}]",8934116514970143966,1508176307,6,1508176307,Chrome,desktop,False,Chrome OS,...,False,unknown,referral,/a/google.com/transportation/mtv-services/bike...,sites.google.com,2017-10-16,0,17,16,42
2,Direct,"[{'index': '4', 'value': 'North America'}]",7992466427990357681,1508201613,1,1508201613,Chrome,mobile,True,Android,...,True,unknown,unknown,unknown,(direct),2017-10-17,1,0,17,42
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",9075655783635761930,1508169851,1,1508169851,Chrome,desktop,False,Windows,...,False,unknown,organic,unknown,google,2017-10-16,0,16,16,42
4,Organic Search,"[{'index': '4', 'value': 'Central America'}]",6960673291025684308,1508190552,1,1508190552,Chrome,desktop,False,Windows,...,False,unknown,organic,unknown,google,2017-10-16,0,21,16,42


In [54]:
"""test_start_date = min(test_df.date)
test_end_date = max(test_df.date)

test_label_start_date = pd.to_datetime("2018-12-01").date()
test_label_end_date = pd.to_datetime("2019-01-31").date()

missing_days = (test_label_start_date - test_end_date).days
prediction_span = (test_label_end_date - test_label_start_date).days + 1

print("There is a gap of %d days between the last day of our dataset and the prediction" % missing_days)
print("We have to predict %d days after this gap" % prediction_span)"""

There is a gap of 46 days between the last day of our dataset and the prediction
We have to predict 62 days after this gap


In [8]:
train_start = pd.to_datetime("2017-05-01").date()
train_end = pd.to_datetime("2017-10-16").date()

train_label_start = pd.to_datetime("2017-12-01").date()
train_label_end = pd.to_datetime("2018-01-31").date()

In [13]:
temp_train = train_df[(train_df['date'].values <= train_end) & (train_df['date'].values >= train_start)]
temp_train_label = train_df[(train_df['date'].values <= train_label_end) & (train_df['date'].values >= train_label_start)]
temp_train.shape, temp_train_label.shape

((427826, 43), (180572, 43))

In [14]:
# Create a dataframe containing only couples of fullvisitorid and totaltransactionrevenue when the latter is not null
def zip_df_on_revenue(df):
    fullID = []
    not_null_transactions = []
    for row in df.itertuples():
        if(row.totals_totalTransactionRevenue > 0):
            fullID.append(row.fullVisitorId)
            not_null_transactions.append(row.totals_totalTransactionRevenue)
    temp_df = pd.DataFrame({'fullVisitorId': fullID})
    not_null_transactions_temp_df = pd.DataFrame({'totals_totalTransactionRevenue': not_null_transactions})
    temp_df = temp_df.join(not_null_transactions_temp_df)
    return temp_df

In [15]:
zipped_label = zip_df_on_revenue(temp_train_label)
zipped_label.shape

(1405, 2)

In [18]:
%%time
label = []

for user in temp_train['fullVisitorId']:
    temp = 0
    for record in zipped_label.itertuples():
        if (record.fullVisitorId == user):
            temp += record.totals_totalTransactionRevenue
    label.append(temp)
label

[22990000.0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 640000000.0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 22237700

In [20]:
len(label)

427826

In [21]:
temp_label = pd.DataFrame({'label': label})
temp_train = temp_train.join(temp_label)

In [22]:
temp_train.isnull().sum()

channelGrouping                                      0
customDimensions                                     0
fullVisitorId                                        0
visitId                                              0
visitNumber                                          0
visitStartTime                                       0
device_browser                                       0
device_deviceCategory                                0
device_isMobile                                      0
device_operatingSystem                               0
geoNetwork_city                                      0
geoNetwork_continent                                 0
geoNetwork_country                                   0
geoNetwork_metro                                     0
geoNetwork_networkDomain                             0
geoNetwork_region                                    0
geoNetwork_subContinent                              0
totals_bounces                                       0
totals_hit

### Data preparation for catboost

In [7]:
train_df.columns

Index(['channelGrouping', 'customDimensions', 'fullVisitorId', 'visitId',
       'visitNumber', 'visitStartTime', 'device_browser',
       'device_deviceCategory', 'device_isMobile', 'device_operatingSystem',
       'geoNetwork_city', 'geoNetwork_continent', 'geoNetwork_country',
       'geoNetwork_metro', 'geoNetwork_networkDomain', 'geoNetwork_region',
       'geoNetwork_subContinent', 'totals_bounces', 'totals_hits',
       'totals_newVisits', 'totals_pageviews', 'totals_sessionQualityDim',
       'totals_timeOnSite', 'totals_totalTransactionRevenue',
       'totals_transactionRevenue', 'totals_transactions',
       'trafficSource_adContent',
       'trafficSource_adwordsClickInfo.adNetworkType',
       'trafficSource_adwordsClickInfo.gclId',
       'trafficSource_adwordsClickInfo.isVideoAd',
       'trafficSource_adwordsClickInfo.page',
       'trafficSource_adwordsClickInfo.slot', 'trafficSource_campaign',
       'trafficSource_isTrueDirect', 'trafficSource_keyword',
       'traff

In [14]:
cont_features = ['totals_hits', 'totals_pageviews', 'visitNumber', 'visitStartTime']

bool_features = []

cat_features = ['fullVisitorId',
                'customDimensions',
                'visitId',
                'channelGrouping', 
                'device_browser', 
                'device_deviceCategory',
                'device_operatingSystem', 
                'geoNetwork_continent',
                'geoNetwork_subContinent', 
                'trafficSource_medium', 
                'geoNetwork_country', 
                'geoNetwork_metro', 
                'geoNetwork_city', 
                'geoNetwork_region',
                'geoNetwork_networkDomain',
                'trafficSource_source', 
                'day_of_week', 
                'hour_of_day', 
                'day_of_month',
                'week_of_year', 
                'trafficSource_adContent', 
                'trafficSource_adwordsClickInfo.adNetworkType', 
                'trafficSource_adwordsClickInfo.gclId',
                'trafficSource_adwordsClickInfo.page', 
                'trafficSource_adwordsClickInfo.slot', 
                'trafficSource_keyword', 
                'trafficSource_referralPath',
                'trafficSource_campaign',
                #ex booleans
                'device_isMobile', 
                'trafficSource_isTrueDirect', 
                'trafficSource_adwordsClickInfo.isVideoAd', 
                'totals_bounces', 
                'totals_newVisits',
                'totals_sessionQualityDim',
                'totals_timeOnSite',
                'totals_totalTransactionRevenue',
                'totals_transactionRevenue',
                'totals_transactions'

               ]

In [15]:
train = train_df[cont_features + bool_features + cat_features]
test = test_df[cont_features + bool_features + cat_features]

train_y = train_df['totals_transactionRevenue'].values
log_y = np.log1p(train_y)

In [16]:
#check if we took all the features except totals.transactionRevenue (yep!)
for c in train_df.columns:
    if c not in train.columns:
        print(c)

In [85]:
from sklearn.model_selection import train_test_split
train = train_df.drop('totals.transactionRevenue',axis=1)

X_train, X_val, y_train, y_val = train_test_split(train, log_y, train_size=0.85, random_state=seed)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((768105, 36), (135548, 36), (768105,), (135548,))

### Feature ranking with catboost

In [21]:
#generate categorical column indices dinamically
cat_positions = []
for i in cat_features:
    cat_positions.append(train.columns.get_loc(i))
cat_positions
cat_positions

[4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35]

In [14]:
cat_positions

<function list.sort(*, key=None, reverse=False)>

In [None]:
from catboost import CatBoostRegressor
from catboost import Pool 

#necessary for feature importance ranking
#pool = Pool(X_train, y_train)


#fit catboost regressor
cat = CatBoostRegressor(
    random_seed = seed,
    loss_function = 'RMSE',
    eval_metric = 'RMSE',
    logging_level='Silent',
    task_type = 'GPU'
)

cat.fit(train, log_y,
        cat_features = cat_positions
        #eval_set = (X_val, y_val),
        #plot = True
       )

In [13]:
from catboost import CatBoostRegressor

cat_positions =[]

#fit catboost regressor
cat = CatBoostRegressor(
    random_seed = seed,
    loss_function = 'RMSE',
    eval_metric = 'RMSE',
    logging_level='Silent',
    task_type = 'GPU'
)

cat.fit(X, y,
        cat_features = cat_positions
       )

TypeError: Cannot convert 'b'organic search'' to float

In [None]:
#get importance ranking
features = X.columns
importances = np.asarray(cat.get_feature_importance(pool, fstr_type='FeatureImportance'))
indices = np.argsort(importances)# top features
plt.figure(figsize=(24, 12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

### Export Dataframes

In [7]:
train_df.to_csv('featured_train(4).csv', index = False)
test_df.to_csv('featured_test(4).csv', index = False)