# Import Packages

In [6]:
import pandas as pd
import numpy as np
import math
import yaml


from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Load Data

In [3]:
%%time
# The data saved from Data_Preprocessing.ipynb
num = 600000
df = pd.read_csv('combi.csv', nrows = num)
df1 = pd.read_csv('combi.csv', nrows = num, skiprows = range(1, num+1))
df2 = pd.read_csv('combi.csv', nrows = num, skiprows = range(1, 2*num+1))
df3 = pd.read_csv('combi.csv', nrows = num, skiprows = range(1, 3*num+1))

CPU times: user 46.5 s, sys: 4.68 s, total: 51.2 s
Wall time: 54.9 s


In [4]:
%%time
train_cp = pd.read_csv('train_nohits.csv')
test_cp = pd.read_csv('test_nohits.csv')

CPU times: user 34.2 s, sys: 5.13 s, total: 39.3 s
Wall time: 42.7 s


In [5]:
combi = pd.concat([df, df1, df2, df3], ignore_index=True) 

### Define data summary function

In [6]:
def feature_summary(df_fa):
    print('DataFrame shape')
    print('rows:',df_fa.shape[0])
    print('cols:',df_fa.shape[1])
    col_list=['Null','Unique_Count','Data_type','Max/Min','Mean','Std','Skewness','Sample_values']
    df=pd.DataFrame(index=df_fa.columns,columns=col_list)
    df['Null']=list([len(df_fa[col][df_fa[col].isnull()]) for i,col in enumerate(df_fa.columns)])
    #df['%_Null']=list([len(df_fa[col][df_fa[col].isnull()])/df_fa.shape[0]*100 for i,col in enumerate(df_fa.columns)])
    df['Unique_Count']=list([len(df_fa[col].unique()) for i,col in enumerate(df_fa.columns)])
    df['Data_type']=list([df_fa[col].dtype for i,col in enumerate(df_fa.columns)])
    for i,col in enumerate(df_fa.columns):
        if 'float' in str(df_fa[col].dtype) or 'int' in str(df_fa[col].dtype):
            df.at[col,'Max/Min']=str(round(df_fa[col].max(),2))+'/'+str(round(df_fa[col].min(),2))
            df.at[col,'Mean']=df_fa[col].mean()
            df.at[col,'Std']=df_fa[col].std()
            df.at[col,'Skewness']=df_fa[col].skew()
        df.at[col,'Sample_values']=list(df_fa[col].unique())
           
    return(df.fillna('-'))

### Found one column still contains json

In [7]:
def str_json(x):
    return(yaml.load(x))

In [8]:
%%time
adwordsClickinfo = combi['ts_adwordsClickInfo'].apply(str_json)

CPU times: user 10min 38s, sys: 10.5 s, total: 10min 49s
Wall time: 11min 26s


In [9]:
adwordsClickinfo = pd.DataFrame(adwordsClickinfo.tolist())

In [10]:
adwords_colnames = []
for col in adwordsClickinfo.columns:
    adwords_colnames += ['ts_ad_' + col]
adwordsClickinfo.columns = adwords_colnames

In [11]:
adwordsClickinfo.columns

Index(['ts_ad_adNetworkType', 'ts_ad_criteriaParameters', 'ts_ad_gclId',
       'ts_ad_isVideoAd', 'ts_ad_page', 'ts_ad_slot',
       'ts_ad_targetingCriteria'],
      dtype='object')

In [12]:
combi.drop('ts_adwordsClickInfo', axis=1, inplace=True)

In [13]:
combi = pd.concat([combi, adwordsClickinfo], axis=1)

In [14]:
combi.drop('ts_ad_targetingCriteria', axis=1, inplace=True) # still containing dicts

In [15]:
%%time
# feature summary
feature_summary(combi)

DataFrame shape
rows: 2109926
cols: 60
CPU times: user 14.7 s, sys: 2.5 s, total: 17.2 s
Wall time: 17.7 s


Unnamed: 0,Null,Unique_Count,Data_type,Max/Min,Mean,Std,Skewness,Sample_values
channelGrouping,0,8,object,-,-,-,-,"[Organic Search, Referral, Direct, Paid Search..."
date,0,806,int64,20181015/20160801,2.01722e+07,7160.75,-0.224556,"[20171016, 20160902, 20171130, 20170126, 20170..."
fullVisitorId,0,1671210,object,-,-,-,-,"[3162355547410993243, 8934116514970143966, 799..."
socialEngagementType,0,1,object,-,-,-,-,[Not Socially Engaged]
visitId,0,2058981,int64,1539673166/1470034812,1.50478e+09,1.98273e+07,-0.0583215,"[1508198450, 1508176307, 1508201613, 150816985..."
visitNumber,0,519,int64,523/1,2.3639,9.62472,22.9651,"[1, 6, 2, 10, 3, 4, 14, 11, 7, 15, 17, 5, 13, ..."
visitStartTime,0,2061066,int64,1539673166/1470034812,1.50478e+09,1.98273e+07,-0.0583215,"[1508198450, 1508176307, 1508201613, 150816985..."
cD_index,393816,2,float64,4.0/4.0,4,0,0,"[4.0, nan]"
cD_value,393816,6,object,-,-,-,-,"[EMEA, North America, Central America, nan, AP..."
d_browser,0,161,object,-,-,-,-,"[Firefox, Chrome, Safari, UC Browser, Internet..."


# Remove features with high percentage of NA

In [56]:
# find null rate > threshold columns
def find_null_columns(pd_df, threshold):
    columns = pd_df.columns
    n = pd_df.shape[0]
    null_column = [c for c in columns if pd_df[c].isnull().sum() > threshold * n]
    return null_column

In [59]:
null_column = find_null_columns(combi, 0.90)

In [60]:
null_column

['ts_campaignCode',
 't_totalTransactionRevenue',
 't_transactionRevenue',
 't_transactions',
 'ts_ad_adNetworkType',
 'ts_ad_gclId',
 'ts_ad_isVideoAd',
 'ts_ad_page',
 'ts_ad_slot']

In [74]:
# nan to np.nan
combi.replace('nan', np.nan, inplace=True)

In [76]:
# with or without campaigncode
combi['ts_campaignCode'].replace({'11251kjhkvahf':1, np.nan:0}, inplace=True) 

In [78]:
# replace nan with other networktype
combi['ts_ad_adNetworkType'].replace({np.nan:'other_type'}, inplace=True)

In [81]:
# replace nan with True
combi['ts_ad_isVideoAd'].replace({np.nan:True}, inplace=True)

In [84]:
# replace nan with '0'
combi['ts_ad_page'].replace({np.nan:'0'}, inplace=True)
combi['ts_ad_page'] = combi['ts_ad_page'].apply(lambda x: int(x))

In [86]:
to_delete = ['ts_ad_gclId', 'ts_ad_slot']
combi.drop(to_delete, axis=1, inplace=True)

# Choose t_transactionRevenue as our respone

In [87]:
combi.drop('t_totalTransactionRevenue', axis=1, inplace=True) # double check
combi.drop('t_transactions', axis=1, inplace=True)

In [94]:
# t_transactionRevenue
combi['t_transactionRevenue'] = [0 if math.isnan(x) else x for x in combi['t_transactionRevenue']]

# Delete unique value columns

In [91]:
onevalue_column = []
for c in combi.columns:
    if len(combi[c].unique()) == 1:
        onevalue_column += [c]

In [92]:
onevalue_column

['socialEngagementType',
 'd_browserSize',
 'd_browserVersion',
 'd_flashVersion',
 'd_language',
 'd_mobileDeviceBranding',
 'd_mobileDeviceInfo',
 'd_mobileDeviceMarketingName',
 'd_mobileDeviceModel',
 'd_mobileInputSelector',
 'd_operatingSystemVersion',
 'd_screenColors',
 'd_screenResolution',
 'geo_cityId',
 'geo_latitude',
 'geo_longitude',
 'geo_networkLocation',
 't_visits',
 'ts_ad_criteriaParameters']

In [93]:
combi.drop(onevalue_column, axis=1, inplace=True)

In [97]:
combi_new2 = combi.copy()

# Impute Missing Data

In [98]:
combi_new2.isnull().sum()[combi_new2.isnull().sum() > 0]

cD_index                393816
cD_value                393816
ts_adContent           1643600
ts_isTrueDirect        1426999
ts_keyword             1093006
ts_referralPath        1142073
t_bounces              1055670
t_newVisits             516431
t_pageviews                340
t_sessionQualityDim     835274
t_timeOnSite           1057980
dtype: int64

In [101]:
# cD_index only have 4 and np.nan
combi_new2.drop('cD_index', axis=1, inplace=True)

In [103]:
# cD_value
combi_new2['cD_value'].replace({np.nan:'other'}, inplace=True)

In [105]:
# ts_adContent
combi_new2.drop('ts_adContent', axis=1, inplace=True)

In [107]:
# ts_isTrueDirect
combi_new2['ts_isTrueDirect'].fillna(False, inplace=True)

In [109]:
# ts_keyword
#combi_new2['ts_keyword'].unique()
combi_new2.drop('ts_keyword', axis=1, inplace=True)

In [136]:
# convert to with or without ts_referralPath
# don't know how to deal with
combi_new2.drop('ts_referralPath', axis=1, inplace=True)

In [147]:
# t_bounces
combi_new2.drop('t_bounces', axis=1, inplace=True)

In [161]:
# t_newVisits
combi_new2['t_newVisits'].replace({np.nan:0}, inplace=True)

In [157]:
# t_pageviews imputed with median
# ax = sns.boxplot(combi_new2['t_pageviews'])
combi_new2['t_pageviews'].fillna(combi_new2['t_pageviews'].median(),inplace=True)

In [159]:
# t_sessionQualityDim imputed with median
combi_new2['t_sessionQualityDim'].fillna(combi_new2['t_sessionQualityDim'].median(),inplace=True)

In [166]:
# t_timeOnSite imputed with median
combi_new2['t_timeOnSite'].fillna(combi_new2['t_timeOnSite'].median(), inplace=True)

# Feature Engineering

1 transaction status

In [169]:
combi_new2['revenue_status'] = combi_new2.t_transactionRevenue.apply(lambda x: 0 if x==0 else 1)

2 date

In [170]:
combi_new2['date'] = pd.to_datetime(combi_new2['date'], format='%Y%m%d')
combi_new2['month'] = combi_new2['date'].dt.month
combi_new2['weekday'] = combi_new2['date'].dt.dayofweek
combi_new2['day'] = combi_new2['date'].dt.day
# combi['hour'] = combi['date'].dt.hour # only got 1 value

In [182]:
part_month = []
for x in combi_new2['day']:
    if x < 11:
        part_month += ['b']
    elif (x >= 11) & (x < 21):
        part_month += ['m']
    else:
        part_month += ['e']

In [186]:
combi_new2['day'] = part_month

3 convert all strings to lowercase

In [187]:
#CONVERTING ALL THE STRINGS IN CATEGORICAL FEATURES TO LOWER CASE
for col in combi_new2.columns:
    if ((combi_new2[col].dtype == 'object') & (col != 'fullVisitorId')):
        combi_new2[col] = combi_new2[col].apply(lambda x: str(x).lower())

In [189]:
summary = feature_summary(combi_new2)

DataFrame shape
rows: 2109926
cols: 37


In [209]:
combi_new2.drop('test', axis=1, inplace=True) # drop log(y)

In [210]:
combi_new2.to_csv('Data_to_process.csv', index=False, header=True)

In [208]:
summary

Unnamed: 0,Null,Unique_Count,Data_type,Max/Min,Mean,Std,Skewness,Sample_values
channelGrouping,0,8,object,-,-,-,-,"[organic search, referral, direct, paid search..."
date,0,806,datetime64[ns],-,-,-,-,"[2017-10-16T00:00:00.000000000, 2016-09-02T00:..."
fullVisitorId,0,1671210,object,-,-,-,-,"[3162355547410993243, 8934116514970143966, 799..."
visitId,0,2058981,int64,1539673166/1470034812,1.50478e+09,1.98273e+07,-0.0583215,"[1508198450, 1508176307, 1508201613, 150816985..."
visitNumber,0,519,int64,523/1,2.3639,9.62472,22.9651,"[1, 6, 2, 10, 3, 4, 14, 11, 7, 15, 17, 5, 13, ..."
visitStartTime,0,2061066,int64,1539673166/1470034812,1.50478e+09,1.98273e+07,-0.0583215,"[1508198450, 1508176307, 1508201613, 150816985..."
cD_value,0,6,object,-,-,-,-,"[emea, north america, central america, other, ..."
d_browser,0,161,object,-,-,-,-,"[firefox, chrome, safari, uc browser, internet..."
d_deviceCategory,0,3,object,-,-,-,-,"[desktop, mobile, tablet]"
d_isMobile,0,2,bool,-,-,-,-,"[False, True]"


In [207]:
objects = summary.ix[summary['Data_type'] == 'object', 'Data_type'].index.tolist()
print(objects)

['channelGrouping', 'fullVisitorId', 'cD_value', 'd_browser', 'd_deviceCategory', 'd_operatingSystem', 'geo_city', 'geo_continent', 'geo_country', 'geo_metro', 'geo_networkDomain', 'geo_region', 'geo_subContinent', 'ts_campaign', 'ts_medium', 'ts_source', 'ts_ad_adNetworkType', 'day']


4 Create Dummy Variables

In [211]:
less_than_10 = ['channelGrouping', 'cD_value', 'd_deviceCategory', 'geo_continent', 'ts_medium', 
 'ts_ad_adNetworkType', 'day']

for col in less_than_10:
    combi_new2[col] = combi_new2[col].apply(lambda x: str(x).replace(" ","_")) # change name
    
dummy = pd.DataFrame()

for col in less_than_10:
    if col.find('_') != -1: # there's no '_' in the column name
        col_name = col.split('_')[1]
    else:
        col_name = col
    dummy = pd.concat([dummy, pd.get_dummies(combi_new2[col], prefix = col_name)], axis=1) # change name
    
print('Newly created dummy cols:', len(dummy.columns))

# drop and combine
combi_new2 = pd.concat([combi_new2, dummy], axis=1)
combi_new2.drop(less_than_10, axis=1, inplace=True)

Newly created dummy cols: 37


In [213]:
combi_new2.to_csv('Data_to_process.csv', index=False, header=True)

5 for objects with too much categories

In [214]:
[x for x in objects if x not in less_than_10]

['fullVisitorId',
 'd_browser',
 'd_operatingSystem',
 'geo_city',
 'geo_country',
 'geo_metro',
 'geo_networkDomain',
 'geo_region',
 'geo_subContinent',
 'ts_campaign',
 'ts_source']

- create column of whether used chrome or not

In [220]:
# convert to dummy variables 
chrome = ['chrome' if x == 'chrome' else 'other' for x in combi_new2['d_browser']]
combi_new2['chrome'] = chrome

In [224]:
combi_new2.drop('ts_campaign', axis=1, inplace=True) # too much not set

- create column of whether souces comes from direct, google or others

In [234]:
direct_index = ['(direct)' in x for x in combi_new2['ts_source']]
google_index = ['google' in x for x in combi_new2['ts_source']]

In [241]:
source = np.array(['other'] * len(combi_new2['ts_source']))
source[direct_index] = 'direct'
source[google_index] = 'google'

In [242]:
combi_new2['source'] = source

In [244]:
# convert source and chrome columns to dummies
l = ['source', 'chrome']

for col in l:
    combi_new2[col] = combi_new2[col].apply(lambda x: str(x).replace(" ","_")) # change name
    
dummy1 = pd.DataFrame()

for col in l:
    if col.find('_') != -1: # there's no '_' in the column name
        col_name = col.split('_')[1]
    else:
        col_name = col
    dummy1 = pd.concat([dummy1, pd.get_dummies(combi_new2[col], prefix = col_name)], axis=1) # change name
    
print('Newly created dummy1 cols:', len(dummy1.columns))

# drop and combine
combi_new2 = pd.concat([combi_new2, dummy1], axis=1)
combi_new2.drop(l, axis=1, inplace=True)

Newly created dummy1 cols: 5


In [245]:
combi_new2.to_csv('Data_to_process.csv', index=False, header=True)

In [246]:
combi_new3 = combi_new2.copy() 

6 generate ranks for rest of the object features

In [247]:
combi_new3['totals_transactionRevenue'] = combi_new3['t_transactionRevenue']

In [248]:
combi_new3.drop('t_transactionRevenue', axis=1, inplace=True)

In [250]:
%%time
#RANKS ARE GENERATED USING REVENUE PERCENTAGE
cols = [x for x in combi_new3.columns if x not in ['fullVisitorId','geo_networkDomain']]

for col in cols:
    if combi_new3[col].dtype == 'object':
        combi_new3[col].fillna('others', inplace=True)
        col_list = ['revenue_status', 'totals_transactionRevenue']
        col_list.append(col)
        print(col_list)
        df = combi_new3[col_list].groupby(col).aggregate({col:['count'],'revenue_status':['sum'],'totals_transactionRevenue':['sum']}).reset_index()
        df.columns = [col, col + "_count",'revenue_status_sum','totals_transactionRevenue_sum']
        df['revenue_perc'] = df['totals_transactionRevenue_sum'] / df[col + "_count"]
        df['rank'] = df['revenue_perc'].rank(ascending=1)
        
        replace_dict={}
        final_dict={}
        
        for k, col_val in enumerate(df[col].values):
            replace_dict[col_val] = df.iloc[k, 5]
        final_dict[col] = replace_dict
        combi_new3.replace(final_dict, inplace=True)
        del df, replace_dict, final_dict
        gc.collect()

['revenue_status', 'totals_transactionRevenue', 'd_browser']
['revenue_status', 'totals_transactionRevenue', 'd_operatingSystem']
['revenue_status', 'totals_transactionRevenue', 'geo_city']
['revenue_status', 'totals_transactionRevenue', 'geo_country']
['revenue_status', 'totals_transactionRevenue', 'geo_metro']
['revenue_status', 'totals_transactionRevenue', 'geo_region']
['revenue_status', 'totals_transactionRevenue', 'geo_subContinent']
['revenue_status', 'totals_transactionRevenue', 'ts_source']
CPU times: user 5min 10s, sys: 19.5 s, total: 5min 29s
Wall time: 5min 33s


In [255]:
combi_new3.to_csv('Data_to_process2.csv', index=False, header=True)

#### split back to train and test for tfidf and then modeling

In [253]:
#SPLITING COMBINED DATASET BACK TO TRAIN AND TEST SETS
train = combi_new3[:len(train_cp)]
test = combi_new3[len(train_cp):]

In [256]:
%%time
#REPLACING DOT WITH SPACE IN FEATURE geoNetwork_networkDomain
#THIS IS DONE TO TREAT IT AS TEXT AND WE WILL USE TfidfVectorizer TO EXTRACT FEATURES
train['geo_networkDomain'].fillna('unknown.unknown',inplace=True)
test['geo_networkDomain'].fillna('unknown.unknown',inplace=True)

train['geo_networkDomain'] = train.geo_networkDomain.apply(lambda x: x.replace('.',' '))
test['geo_networkDomain'] = test.geo_networkDomain.apply(lambda x: x.replace('.',' '))

CPU times: user 3.2 s, sys: 1.73 s, total: 4.93 s
Wall time: 5.81 s


In [257]:
%%time
#USING TfidfVectorizer TO EXTRACT FEATURES FROM geoNetwork_networkDomain
Tvect = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
vect = Tvect.fit(train['geo_networkDomain'])   
train_vect = vect.transform(train['geo_networkDomain'])
test_vect = vect.transform(test['geo_networkDomain'])

# DIMENSIONALITY REDUCTION ON EXTRACTED FEATURES
svd = TruncatedSVD(n_components=10)

#CREATING DATAFRAMES AFTER FEATURE EXTRACTION AND REDUCTION
vect_cols = ['vect' + str(x) for x in range(1,11)]
df_train_vect = pd.DataFrame(svd.fit_transform(train_vect), columns=vect_cols)
df_test_vect = pd.DataFrame(svd.fit_transform(test_vect), columns=vect_cols)

CPU times: user 48.8 s, sys: 6.08 s, total: 54.9 s
Wall time: 53.3 s


In [258]:
print(train_vect.shape, test_vect.shape)
display(df_train_vect.head())
display(df_test_vect.head())
print('Shape of vector dataframes:', df_train_vect.shape, df_test_vect.shape)

(1708337, 20000) (401589, 20000)


Unnamed: 0,vect1,vect2,vect3,vect4,vect5,vect6,vect7,vect8,vect9,vect10
0,1.0,-9.251834e-12,-1.093621e-10,-5.949542e-11,-1.930414e-10,1.251464e-10,-3.347966e-10,-1.622505e-10,1.83121e-11,-8.393805e-10
1,1.0,-3.961597e-12,-1.094051e-10,-5.950089e-11,-1.930611e-10,1.251518e-10,-3.347934e-10,-1.622701e-10,1.828259e-11,-8.394051e-10
2,6.41271e-12,6.250596e-07,0.07693836,0.0001277597,0.06395778,-0.003309907,0.04990474,-0.008255306,0.0004518628,0.02962271
3,1.952481e-12,1.0,-3.042682e-06,-6.194191e-05,-2.115704e-06,2.571898e-05,1.396248e-07,8.370089e-07,2.044592e-05,-3.287632e-08
4,-4.226161e-10,1.480254e-05,0.001565125,0.2194428,0.002344751,-0.16524,0.005308407,-0.003278433,-0.1831914,0.03027699


Unnamed: 0,vect1,vect2,vect3,vect4,vect5,vect6,vect7,vect8,vect9,vect10
0,5.735426e-13,1.0,-1.509412e-05,-6.138458e-05,-1.122842e-05,-6.68191e-06,2.601234e-05,-4.30513e-06,-4.658447e-06,-4.867325e-06
1,1.0,-2.506128e-13,4.384361e-15,-3.705962e-15,-1.436898e-15,-3.607805e-15,2.848238e-15,-2.022099e-15,8.250893e-16,-1.050958e-15
2,-9.682272e-13,6.567418e-05,0.0006403984,0.8968924,-0.001145965,0.003347299,-0.4161017,-0.002604648,-0.001591744,-0.0008931802
3,1.0,-2.168263e-13,-7.993588e-15,-2.569719e-15,2.312952e-13,3.439693e-12,-1.774038e-12,1.606563e-12,1.290556e-12,1.152634e-12
4,-9.682385e-13,6.567418e-05,0.0006403984,0.8968924,-0.001145965,0.003347299,-0.4161017,-0.002604648,-0.001591744,-0.0008931802


Shape of vector dataframes: (1708337, 10) (401589, 10)


# Prepare for Modeling

In [259]:
X = train.drop(['visitId','date','geo_networkDomain'],axis=1)
X_test = test.drop(['visitId','date','geo_networkDomain'],axis=1) 

In [260]:
X_test.reset_index(drop=True, inplace=True)
X_test.head()

Unnamed: 0,fullVisitorId,visitNumber,visitStartTime,d_browser,d_isMobile,d_operatingSystem,geo_city,geo_country,geo_metro,geo_region,...,ad_search_partners,day_b,day_e,day_m,source_direc,source_googl,source_other,chrome_chrome,chrome_other,totals_transactionRevenue
0,7460955084541987166,2,1526099341,160.0,True,21.0,968.0,172.0,76.0,428.0,...,0,0,0,1,0,1,0,1,0,0.0
1,460252456180441002,166,1526064483,160.0,False,25.0,1064.0,226.0,112.0,506.0,...,0,0,0,1,1,0,0,1,0,0.0
2,3461808543879602873,2,1526067157,160.0,False,26.0,1009.0,226.0,94.0,489.0,...,0,0,0,1,0,1,0,1,0,0.0
3,975129477712150630,4,1526107551,160.0,True,22.0,1018.0,226.0,98.0,504.0,...,0,0,0,1,1,0,0,1,0,0.0
4,8381672768065729990,1,1526060254,158.0,True,23.0,1068.0,226.0,118.0,506.0,...,0,0,0,1,0,1,0,0,1,0.0


In [261]:
# CONCATENATING WITH TEXT FEATURES 
X = pd.concat([X, df_train_vect], axis=1)
X_test = pd.concat([X_test, df_test_vect], axis=1)

In [266]:
#VIEW OF TRAIN AND TEST DATASET SHAPE
print('Before creating aggregated features')
print('Train shape:', X.shape,' Test shape:', X_test.shape)

Before creating aggregated features
Train shape: (1708337, 77)  Test shape: (401589, 77)


In [267]:
X.to_csv('final_train.csv', index=False, header=True)
X_test.to_csv('final_test.csv', index=False, header=True)

In [7]:
%%time
X = pd.read_csv('final_train.csv')
X_test = pd.read_csv('final_test.csv')

CPU times: user 16.5 s, sys: 2.16 s, total: 18.7 s
Wall time: 18.8 s


In [8]:
%%time
agg_func = {}
agg_col = ['fullVisitorId']
for col in [x for x in X.columns if x not in ['fullVisitorId']]:
    if col == 'totals_transactionRevenue':
        agg_func[col] = ['sum']
        agg_col.append(str(col) + '_sum')
    elif col == 'revenue_status':
        agg_func[col] = ['sum']
        agg_col.append(str(col) + '_sum')
    else:
        agg_func[col] = ['sum','max','min','mean']
        agg_col.append(str(col) + '_sum')
        agg_col.append(str(col) + '_max')
        agg_col.append(str(col) + '_min')
        agg_col.append(str(col) + '_mean')

    
X = X.groupby(X.fullVisitorId).aggregate(agg_func).reset_index()
X.columns = agg_col

X_test = X_test.groupby(X_test.fullVisitorId).aggregate(agg_func).reset_index()
X_test.columns = agg_col

CPU times: user 35.8 s, sys: 15.3 s, total: 51.1 s
Wall time: 51.1 s


In [9]:
print('After creating aggregated features')
print('Train shape:', X.shape, 'Test shape:', X_test.shape)

After creating aggregated features
Train shape: (1351930, 299) Test shape: (298451, 299)


In [14]:
%%time

#CREATING y_dummy FOR USING STRATIFIED KFOLD
y_dummy = X.revenue_status_sum.apply(lambda x: 0 if x==0 else 1)

#TARGET FEATURE CONVERTED TO NATURAL LOG
# y=pd.Series(X['totals_transactionRevenue_sum'])
y = X.totals_transactionRevenue_sum.apply(lambda x: np.log1p(x + 1))  ###### change !!!1

#PEPARING DATA FOR TRAINING LGBM MODEL
X = X.drop(['totals_transactionRevenue_sum','fullVisitorId','revenue_status_sum'], axis=1)

#FINAL DATAFRAME FOR SUBMISSION
col = ['fullVisitorId', 'totals_transactionRevenue_sum']
final = X_test[col] 
final.columns = ['fullVisitorId', 'PredictedLogRevenue']

#FINAL TEST FEATURES USED FOR PREDICTING SUBMISSION
X_test = X_test.drop(['fullVisitorId', 'totals_transactionRevenue_sum', 'revenue_status_sum'], axis=1)

CPU times: user 3.25 s, sys: 1.65 s, total: 4.9 s
Wall time: 4.9 s


In [15]:
print('After creating aggregated features')
print('Train shape:',X.shape,' Test shape:',X_test.shape)

After creating aggregated features
Train shape: (1351930, 296)  Test shape: (298451, 296)


# Modeling

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=1)

In [11]:
def rmse(y_true, y_pred):
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)

## LGBM

In [22]:
def run_lgb(X_train, y_train, X_val, y_val, X_test):
    
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgb_train_data = lgb.Dataset(X_train, label=y_train)
    lgb_val_data = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, lgb_train_data, 
                      num_boost_round=5000,
                      valid_sets=[lgb_train_data, lgb_val_data],
                      early_stopping_rounds=100,
                      verbose_eval=500)

    y_pred_train = model.predict(X_train, num_iteration=model.best_iteration)
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred_submit = model.predict(X_test, num_iteration=model.best_iteration)

    print(f"LGBM: RMSE val: {rmse(y_val, y_pred_val)}  - RMSE train: {rmse(y_train, y_pred_train)}")
    return y_pred_submit, model

In [23]:
%%time
# Train LGBM and generate predictions
lgb_preds, lgb_model = run_lgb(X_train, y_train, X_val, y_val, X_test)

Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 1.42971	valid_1's rmse: 1.43317
[1000]	training's rmse: 1.38804	valid_1's rmse: 1.4128
[1500]	training's rmse: 1.36425	valid_1's rmse: 1.40775
[2000]	training's rmse: 1.34426	valid_1's rmse: 1.40636
[2500]	training's rmse: 1.32702	valid_1's rmse: 1.40525
[3000]	training's rmse: 1.30918	valid_1's rmse: 1.404
[3500]	training's rmse: 1.29293	valid_1's rmse: 1.40308
[4000]	training's rmse: 1.27813	valid_1's rmse: 1.40239
Early stopping, best iteration is:
[4251]	training's rmse: 1.27123	valid_1's rmse: 1.40221
LGBM: RMSE val: 1.40221  - RMSE train: 1.27123
CPU times: user 2h 16min 30s, sys: 1min 39s, total: 2h 18min 9s
Wall time: 3min 22s


In [77]:
print("LightGBM features importance...")
gain = lgb_model.feature_importance('gain')
featureimp = pd.DataFrame({'feature': lgb_model.feature_name(), 
                   'split': lgb_model.feature_importance('split'), 
                   'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(featureimp[:10])

LightGBM features importance...
                    feature       gain  split
60          t_pageviews_sum  22.719096   5214
61          t_pageviews_max   8.461591   5841
52               t_hits_sum   8.193397   4685
65  t_sessionQualityDim_max   6.163437   3701
25          geo_country_max   5.560187   1064
53               t_hits_max   3.176678   6629
27         geo_country_mean   3.112183   1068
69         t_timeOnSite_max   2.645732   5347
68         t_timeOnSite_sum   2.249304   4182
63         t_pageviews_mean   1.932196   4561


## XGBOOST

In [17]:
def run_xgb(X_train, y_train, X_val, y_val, X_test):
    params = {'objective': 'reg:linear',
              'eval_metric': 'rmse',
              'eta': 0.001,
              'max_depth': 10,
              'subsample': 0.6,
              'colsample_bytree': 0.6,
              'alpha':0.001,
              'random_state': 42,
              'silent': True}

    xgb_train_data = xgb.DMatrix(X_train, y_train)
    xgb_val_data = xgb.DMatrix(X_val, y_val)
    xgb_submit_data = xgb.DMatrix(X_test)

    model = xgb.train(params, xgb_train_data, 
                      num_boost_round=2000, 
                      evals= [(xgb_train_data, 'train'), (xgb_val_data, 'valid')],
                      early_stopping_rounds=100, 
                      verbose_eval=500
                     )

    y_pred_train = model.predict(xgb_train_data, ntree_limit=model.best_ntree_limit)
    y_pred_val = model.predict(xgb_val_data, ntree_limit=model.best_ntree_limit)
    y_pred_submit = model.predict(xgb_submit_data, ntree_limit=model.best_ntree_limit)

    print(f"XGB : RMSE val: {rmse(y_val, y_pred_val)}  - RMSE train: {rmse(y_train, y_pred_train)}")
    return y_pred_submit, model

In [18]:
%%time
xgb_preds, xgb_model = run_xgb(X_train, y_train, X_val, y_val, X_test)

[0]	train-rmse:1.91787	valid-rmse:1.90294
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[500]	train-rmse:1.60746	valid-rmse:1.62829
[1000]	train-rmse:1.4481	valid-rmse:1.50608
[1500]	train-rmse:1.361	valid-rmse:1.45289
[1999]	train-rmse:1.30783	valid-rmse:1.42939
XGB : RMSE val: 1.42939  - RMSE train: 1.30783
CPU times: user 1d 6h 1min 20s, sys: 10min 25s, total: 1d 6h 11min 46s
Wall time: 29min 2s


## Catboost

In [19]:
def run_catboost(X_train, y_train, X_val, y_val, X_test):
    model = CatBoostRegressor(iterations=1000,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)
    model.fit(X_train, y_train,
              eval_set=(X_val, y_val),
              use_best_model=True,
              )
    
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    y_pred_submit = model.predict(X_test)

    print(f"CatB: RMSE val: {rmse(y_val, y_pred_val)}  - RMSE train: {rmse(y_train, y_pred_train)}")
    return y_pred_submit, model

In [20]:
%%time
# Train Catboost and generate predictions
cat_preds, cat_model = run_catboost(X_train, y_train, X_val, y_val,  X_test)



0:	learn: 2.0329233	test: 2.0183892	best: 2.0183892 (0)	total: 493ms	remaining: 8m 12s
50:	learn: 1.4423691	test: 1.4447359	best: 1.4447359 (50)	total: 22.1s	remaining: 6m 50s
100:	learn: 1.4031941	test: 1.4269778	best: 1.4269778 (100)	total: 43.6s	remaining: 6m 28s
150:	learn: 1.3827296	test: 1.4215333	best: 1.4215333 (150)	total: 1m 4s	remaining: 6m 5s
200:	learn: 1.3640552	test: 1.4180560	best: 1.4180560 (200)	total: 1m 26s	remaining: 5m 43s
250:	learn: 1.3476389	test: 1.4164553	best: 1.4164553 (250)	total: 1m 47s	remaining: 5m 21s
300:	learn: 1.3294905	test: 1.4145921	best: 1.4145921 (300)	total: 2m 8s	remaining: 4m 58s
350:	learn: 1.3130326	test: 1.4131493	best: 1.4131493 (350)	total: 2m 29s	remaining: 4m 36s
400:	learn: 1.2991994	test: 1.4124578	best: 1.4123342 (399)	total: 2m 50s	remaining: 4m 14s
450:	learn: 1.2848335	test: 1.4117972	best: 1.4116794 (446)	total: 3m 10s	remaining: 3m 52s
500:	learn: 1.2707556	test: 1.4109628	best: 1.4109628 (500)	total: 3m 31s	remaining: 3m 30s


## Ensemble

In [24]:
# Note: this is currently being reconstructed!
ensemble_preds_70_30_00 = 0.7 * lgb_preds + 0.3 * cat_preds + 0.0 * xgb_preds 
ensemble_preds_70_25_05 = 0.7 * lgb_preds + 0.25 * cat_preds + 0.05 * xgb_preds 