In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import gc
gc. enable()
%matplotlib inline
pd.options.display.max_columns = 999

In [2]:
train = pd.read_csv("input/flattened/extracted_fields_train.gz", 
                    dtype={'date': str, 'fullVisitorId': str, 'sessionId':str, 'visitId': np.int64})
test = pd.read_csv("input/flattened/extracted_fields_test.gz",
                    dtype={'date': str, 'fullVisitorId': str, 'sessionId':str, 'visitId': np.int64})


In [3]:
train.shape, test.shape

((903653, 30), (804684, 30))

In [4]:
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.transactionRevenue,trafficSource.adContent,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,1472830385,1.0,1472830000.0,Chrome,desktop,0.0,Windows,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1.0,1.0,1.0,1.0,,,(not set),,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,1472880147,1.0,1472880000.0,Firefox,desktop,0.0,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,1.0,1.0,1.0,1.0,,,(not set),,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,1472865386,1.0,1472865000.0,Chrome,desktop,0.0,Windows,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,1.0,1.0,1.0,1.0,,,(not set),,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,1472881213,1.0,1472881000.0,UC Browser,desktop,0.0,Linux,not available in demo dataset,Asia,Indonesia,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1.0,1.0,1.0,,,(not set),,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,1472822600,2.0,1472823000.0,Chrome,mobile,1.0,Android,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,unknown.unknown,not available in demo dataset,Northern Europe,1.0,1.0,,1.0,,,(not set),1.0,(not provided),organic,,google


In [5]:
train_store_1 = pd.read_csv('input/external_data/Train_external_data.csv', low_memory=False, skiprows=6, dtype={"Client Id":'str'})
train_store_2 = pd.read_csv('input/external_data/Train_external_data_2.csv', low_memory=False, skiprows=6, dtype={"Client Id":'str'})
test_store_1 = pd.read_csv('input/external_data/Test_external_data.csv', low_memory=False, skiprows=6, dtype={"Client Id":'str'})
test_store_2 = pd.read_csv('input/external_data/Test_external_data_2.csv', low_memory=False, skiprows=6, dtype={"Client Id":'str'})

In [6]:
for df in [train_store_1, train_store_2, test_store_1, test_store_2]:
    df["visitId"] = df["Client Id"].apply(lambda x: x.split('.', 1)[1]).astype(np.int64)

In [9]:
train = train.merge(pd.concat([train_store_1, train_store_2], sort=False), how="left", on="visitId")
test = test.merge(pd.concat([test_store_1, test_store_2], sort=False), how="left", on="visitId")

for df in [train, test]:
    df.drop("Client Id", 1, inplace=True)

In [10]:
for df in [train, test]:
    df["Revenue"].fillna('$', inplace=True)
    df["Revenue"] = df["Revenue"].apply(lambda x: x.replace('$', '').replace(',', ''))
    df["Revenue"] = pd.to_numeric(df["Revenue"], errors="coerce")
    df["Revenue"].fillna(0.0, inplace=True)

In [11]:
for df in [train_store_1, train_store_2, test_store_1, test_store_2]:
    del df
gc.collect()

49

In [None]:
train.to_csv("train_concated.csv", index=False)
test.to_csv("test_concated.csv", index=False)

In [None]:
train = pd.read_csv("input/concated/train_concated.csv", low_memory=False)
test = pd.read_csv("input/concated/test_concated.csv", low_memory=False)

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 903654 entries, 0 to 903653
Data columns (total 36 columns):
channelGrouping               903654 non-null object
date                          903654 non-null object
fullVisitorId                 903654 non-null object
sessionId                     903654 non-null object
visitId                       903654 non-null int64
visitNumber                   903654 non-null float64
visitStartTime                903654 non-null float64
device.browser                903654 non-null object
device.deviceCategory         903654 non-null object
device.isMobile               903654 non-null float64
device.operatingSystem        903654 non-null object
geoNetwork.city               903654 non-null object
geoNetwork.continent          903654 non-null object
geoNetwork.country            903654 non-null object
geoNetwork.metro              903654 non-null object
geoNetwork.networkDomain      903654 non-null object
geoNetwork.region             903654 no

In [13]:
train.visitStartTime = pd.to_datetime(train.visitStartTime, unit='s')
test.visitStartTime = pd.to_datetime(test.visitStartTime, unit='s')
train["date"] = train.visitStartTime
test["date"] = test.visitStartTime

In [14]:
train.set_index("visitStartTime", inplace=True)
test.set_index("visitStartTime", inplace=True)
train.sort_index(inplace=True)
test.sort_index(inplace=True)

In [15]:
def clearRare(columnname, limit = 1000):
    vc = test[columnname].value_counts()
    
    common = vc > limit
    common = set(common.index[common].values)
    print("Set", sum(vc <= limit), columnname, "categories to 'other';", end=" ")
    
    train.loc[train[columnname].map(lambda x: x not in common), columnname] = 'other'
    test.loc[test[columnname].map(lambda x: x not in common), columnname] = 'other'
    print("now there are", train[columnname].nunique(), "categories in train")

In [17]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [16]:
clearRare("device.browser")
clearRare("device.operatingSystem")
clearRare("geoNetwork.country")
clearRare("geoNetwork.city")
clearRare("geoNetwork.metro")
clearRare("geoNetwork.networkDomain")
clearRare("geoNetwork.region")
clearRare("geoNetwork.subContinent")
clearRare("trafficSource.adContent")
clearRare("trafficSource.campaign")
clearRare("trafficSource.keyword")
clearRare("trafficSource.medium")
clearRare("trafficSource.referralPath")
clearRare("trafficSource.source")

Set 98 device.browser categories to 'other'; now there are 11 categories in train
Set 15 device.operatingSystem categories to 'other'; now there are 8 categories in train
Set 160 geoNetwork.country categories to 'other'; now there are 60 categories in train
Set 656 geoNetwork.city categories to 'other'; now there are 77 categories in train
Set 86 geoNetwork.metro categories to 'other'; now there are 24 categories in train
Set 25689 geoNetwork.networkDomain categories to 'other'; now there are 62 categories in train
Set 314 geoNetwork.region categories to 'other'; now there are 62 categories in train
Set 5 geoNetwork.subContinent categories to 'other'; now there are 19 categories in train
Set 46 trafficSource.adContent categories to 'other'; now there are 3 categories in train
Set 23 trafficSource.campaign categories to 'other'; now there are 5 categories in train
Set 2409 trafficSource.keyword categories to 'other'; now there are 7 categories in train
Set 1 trafficSource.medium categor

In [18]:
for df in [train, test]:
    df["Avg. Session Duration"][df["Avg. Session Duration"] == 0] = "00:00:00"
    df["Avg. Session Duration"] = df["Avg. Session Duration"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))
    df["Bounce Rate"] = df["Bounce Rate"].astype(str).apply(lambda x: x.replace('%', '')).astype(float)
    df["Goal Conversion Rate"] = df["Goal Conversion Rate"].astype(str).apply(lambda x: x.replace('%', '')).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
for df in [train, test]:
    df["id_incoherence"] = pd.to_datetime(df.visitId, unit='s') != df.date
    df["visitId_dublicates"] = df.visitId.map(df.visitId.value_counts())
    df["session_dublicates"] = df.sessionId.map(df.sessionId.value_counts())

In [20]:
for df in [train, test]:
    df['weekday'] = df['date'].dt.dayofweek.astype(object)
    df['time'] = df['date'].dt.second + df['date'].dt.minute*60 + df['date'].dt.hour*3600
    df['day'] = df['date'].dt.date   

In [23]:
df = pd.concat([train, test])
df.sort_values(['fullVisitorId', 'date'], ascending=True, inplace=True)
df['prev_session'] = (df['date'] - df[['fullVisitorId', 'date']].groupby('fullVisitorId')['date'].shift(1)).astype(np.int64) // 1e9 // 60 // 60
df['next_session'] = (df['date'] - df[['fullVisitorId', 'date']].groupby('fullVisitorId')['date'].shift(-1)).astype(np.int64) // 1e9 // 60 // 60
df.sort_index(inplace=True)

train = df[:len(train)]
test = df[len(train):]

In [21]:
def browser_mapping(x):
    browsers = ['chrome','safari','firefox','internet explorer','edge','opera','coc coc','maxthon','iron']
    if x in browsers:
        return x.lower()
    elif  ('android' in x) or ('samsung' in x) or ('mini' in x) or ('iphone' in x) or ('in-app' in x) or ('playstation' in x):
        return 'mobile browser'
    elif  ('mozilla' in x) or ('chrome' in x) or ('blackberry' in x) or ('nokia' in x) or ('browser' in x) or ('amazon' in x):
        return 'mobile browser'
    elif  ('lunascape' in x) or ('netscape' in x) or ('blackberry' in x) or ('konqueror' in x) or ('puffin' in x) or ('amazon' in x):
        return 'mobile browser'
    elif '(not set)' in x:
        return x
    else:
        return 'others'
    
    
def adcontents_mapping(x):
    if  ('google' in x):
        return 'google'
    elif  ('placement' in x) | ('placememnt' in x):
        return 'placement'
    elif '(not set)' in x or 'nan' in x:
        return x
    elif 'ad' in x:
        return 'ad'
    else:
        return 'others'
    
def source_mapping(x):
    if  ('google' in x):
        return 'google'
    elif  ('youtube' in x):
        return 'youtube'
    elif '(not set)' in x or 'nan' in x:
        return x
    elif 'yahoo' in x:
        return 'yahoo'
    elif 'facebook' in x:
        return 'facebook'
    elif 'reddit' in x:
        return 'reddit'
    elif 'bing' in x:
        return 'bing'
    elif 'quora' in x:
        return 'quora'
    elif 'outlook' in x:
        return 'outlook'
    elif 'linkedin' in x:
        return 'linkedin'
    elif 'pinterest' in x:
        return 'pinterest'
    elif 'ask' in x:
        return 'ask'
    elif 'siliconvalley' in x:
        return 'siliconvalley'
    elif 'lunametrics' in x:
        return 'lunametrics'
    elif 'amazon' in x:
        return 'amazon'
    elif 'mysearch' in x:
        return 'mysearch'
    elif 'qiita' in x:
        return 'qiita'
    elif 'messenger' in x:
        return 'messenger'
    elif 'twitter' in x:
        return 'twitter'
    elif 't.co' in x:
        return 't.co'
    elif 'vk.com' in x:
        return 'vk.com'
    elif 'search' in x:
        return 'search'
    elif 'edu' in x:
        return 'edu'
    elif 'mail' in x:
        return 'mail'
    elif 'ad' in x:
        return 'ad'
    elif 'golang' in x:
        return 'golang'
    elif 'direct' in x:
        return 'direct'
    elif 'dealspotr' in x:
        return 'dealspotr'
    elif 'sashihara' in x:
        return 'sashihara'
    elif 'phandroid' in x:
        return 'phandroid'
    elif 'baidu' in x:
        return 'baidu'
    elif 'mdn' in x:
        return 'mdn'
    elif 'duckduckgo' in x:
        return 'duckduckgo'
    elif 'seroundtable' in x:
        return 'seroundtable'
    elif 'metrics' in x:
        return 'metrics'
    elif 'sogou' in x:
        return 'sogou'
    elif 'businessinsider' in x:
        return 'businessinsider'
    elif 'github' in x:
        return 'github'
    elif 'gophergala' in x:
        return 'gophergala'
    elif 'yandex' in x:
        return 'yandex'
    elif 'msn' in x:
        return 'msn'
    elif 'dfa' in x:
        return 'dfa'
    elif '(not set)' in x:
        return '(not set)'
    elif 'feedly' in x:
        return 'feedly'
    elif 'arstechnica' in x:
        return 'arstechnica'
    elif 'squishable' in x:
        return 'squishable'
    elif 'flipboard' in x:
        return 'flipboard'
    elif 't-online.de' in x:
        return 't-online.de'
    elif 'sm.cn' in x:
        return 'sm.cn'
    elif 'wow' in x:
        return 'wow'
    elif 'baidu' in x:
        return 'baidu'
    elif 'partners' in x:
        return 'partners'
    else:
        return 'others'

train['device.browser'] = train['device.browser'].map(lambda x:browser_mapping(str(x).lower())).astype('str')
train['trafficSource.adContent'] = train['trafficSource.adContent'].map(lambda x:adcontents_mapping(str(x).lower())).astype('str')
train['trafficSource.source'] = train['trafficSource.source'].map(lambda x:source_mapping(str(x).lower())).astype('str')

test['device.browser'] = test['device.browser'].map(lambda x:browser_mapping(str(x).lower())).astype('str')
test['trafficSource.adContent'] = test['trafficSource.adContent'].map(lambda x:adcontents_mapping(str(x).lower())).astype('str')
test['trafficSource.source'] = test['trafficSource.source'].map(lambda x:source_mapping(str(x).lower())).astype('str')

def process_device(data_df):
    print("process device ...")
    data_df['source.country'] = data_df['trafficSource.source'] + '_' + data_df['geoNetwork.country']
    data_df['campaign.medium'] = data_df['trafficSource.campaign'] + '_' + data_df['trafficSource.medium']
    data_df['browser.category'] = data_df['device.browser'] + '_' + data_df['device.deviceCategory']
    data_df['browser.os'] = data_df['device.browser'] + '_' + data_df['device.operatingSystem']
    return data_df

train = process_device(train)
test = process_device(test)

def custom(data):
    print('custom..')
    data['device_deviceCategory_channelGrouping'] = data['device.deviceCategory'] + "_" + data['channelGrouping']
    data['channelGrouping_browser'] = data['device.browser'] + "_" + data['channelGrouping']
    data['channelGrouping_OS'] = data['device.operatingSystem'] + "_" + data['channelGrouping']
    
    for i in ['geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country','geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region','geoNetwork.subContinent']:
        for j in ['device.browser','device.deviceCategory', 'device.operatingSystem', 'trafficSource.source']:
            data[i + "_" + j] = data[i] + "_" + data[j]
    
    data['content.source'] = data['trafficSource.adContent'] + "_" + data['source.country']
    data['medium.source'] = data['trafficSource.medium'] + "_" + data['source.country']
    return data

train = custom(train)
test = custom(test)

process device ...
process device ...
custom..
custom..


In [25]:
for feature in ["totals.hits", "totals.pageviews"]:
    info = pd.concat([train, test], sort=False).groupby("fullVisitorId")[feature].mean()
    train["usermean_" + feature] = train.fullVisitorId.map(info)
    test["usermean_" + feature] = test.fullVisitorId.map(info)
    
for feature in ["visitNumber"]:
    info = pd.concat([train, test], sort=False).groupby("fullVisitorId")[feature].max()
    train["usermax_" + feature] = train.fullVisitorId.map(info)
    test["usermax_" + feature] = test.fullVisitorId.map(info)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [24]:
train['hits_by_pageviews'] = train['totals.hits'].astype(float) / train['totals.pageviews'].astype(float)
test['hits_by_pageviews'] = test['totals.hits'].astype(float) / test['totals.pageviews'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 903654 entries, 2016-08-01 07:00:12 to 2017-08-02 06:59:53
Data columns (total 78 columns):
channelGrouping                                    903654 non-null object
date                                               903654 non-null datetime64[ns]
fullVisitorId                                      903654 non-null object
sessionId                                          903654 non-null object
visitId                                            903654 non-null int64
visitNumber                                        903654 non-null float64
device.browser                                     903654 non-null object
device.deviceCategory                              903654 non-null object
device.isMobile                                    903654 non-null float64
device.operatingSystem                             903654 non-null object
geoNetwork.city                                    903654 non-null object
geoNetwork.continent            

In [26]:
excluded = ['date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 'visitId', 'visitStartTime', 
            'month', 'day']

cat_cols = [f for f in train.columns if (train[f].dtype == 'object' and f not in excluded)]
real_cols = [f for f in train.columns if (not f in cat_cols and f not in excluded)]

In [27]:
from sklearn.preprocessing import LabelEncoder
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(train[col].values.astype('str')) + list(test[col].values.astype('str')))
    train[col] = lbl.transform(list(train[col].values.astype('str')))
    test[col] = lbl.transform(list(test[col].values.astype('str')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [30]:
for col in real_cols:
    train[col] = train[col].astype(float)
    test[col] = test[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
train[real_cols + cat_cols].head()

Unnamed: 0_level_0,visitNumber,device.isMobile,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,trafficSource.isTrueDirect,Sessions,Avg. Session Duration,Bounce Rate,Revenue,Transactions,Goal Conversion Rate,id_incoherence,visitId_dublicates,session_dublicates,time,prev_session,next_session,hits_by_pageviews,usermean_totals.hits,usermean_totals.pageviews,usermax_visitNumber,channelGrouping,device.browser,device.deviceCategory,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,trafficSource.adContent,trafficSource.campaign,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,weekday,source.country,campaign.medium,browser.category,browser.os,device_deviceCategory_channelGrouping,channelGrouping_browser,channelGrouping_OS,geoNetwork.city_device.browser,geoNetwork.city_device.deviceCategory,geoNetwork.city_device.operatingSystem,geoNetwork.city_trafficSource.source,geoNetwork.continent_device.browser,geoNetwork.continent_device.deviceCategory,geoNetwork.continent_device.operatingSystem,geoNetwork.continent_trafficSource.source,geoNetwork.country_device.browser,geoNetwork.country_device.deviceCategory,geoNetwork.country_device.operatingSystem,geoNetwork.country_trafficSource.source,geoNetwork.metro_device.browser,geoNetwork.metro_device.deviceCategory,geoNetwork.metro_device.operatingSystem,geoNetwork.metro_trafficSource.source,geoNetwork.networkDomain_device.browser,geoNetwork.networkDomain_device.deviceCategory,geoNetwork.networkDomain_device.operatingSystem,geoNetwork.networkDomain_trafficSource.source,geoNetwork.region_device.browser,geoNetwork.region_device.deviceCategory,geoNetwork.region_device.operatingSystem,geoNetwork.region_trafficSource.source,geoNetwork.subContinent_device.browser,geoNetwork.subContinent_device.deviceCategory,geoNetwork.subContinent_device.operatingSystem,geoNetwork.subContinent_trafficSource.source,content.source,medium.source
visitStartTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1
2016-08-01 07:00:12,3.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,False,1,1,25212,-2562048.0,-2562048.0,1.0,1.0,1.0,3.0,2,7,1,6,75,2,57,22,0,61,8,2,1,6,0,24,3,0,199,1,22,43,10,56,50,590,226,564,753,23,7,22,26,463,172,459,593,182,67,170,241,7,1,6,3,491,184,471,615,71,25,70,91,317,57
2016-08-01 07:04:26,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,False,1,1,25466,-2562048.0,-2562048.0,1.0,1.0,1.0,1.0,7,0,0,5,75,3,52,22,51,61,11,2,1,6,6,14,11,0,616,7,0,5,7,7,47,583,225,563,761,24,9,29,46,416,156,418,546,175,66,169,249,403,153,405,530,484,183,470,623,88,33,93,134,734,829
2016-08-01 07:04:41,1.0,1.0,0.0,5.0,1.0,5.0,0.0,0.0,0,0.0,0.0,0.0,0.0,False,1,1,25481,-2562048.0,-2562048.0,1.0,5.0,5.0,1.0,4,6,2,1,75,2,57,22,0,61,8,2,1,3,4,24,5,0,316,5,20,33,20,50,12,589,227,559,755,22,8,17,28,462,173,454,595,181,68,165,243,6,2,1,5,490,185,466,617,70,26,65,93,434,393
2016-08-01 07:06:01,1.0,0.0,0.0,9.0,1.0,7.0,0.0,0.0,0,0.0,0.0,0.0,0.0,False,1,1,25561,-2562048.0,-2562048.0,1.285714,9.0,7.0,1.0,4,0,0,5,75,2,8,22,8,61,8,2,1,3,4,24,5,0,267,5,0,5,4,4,44,583,225,563,755,16,6,21,28,64,24,69,87,175,66,169,243,64,24,69,88,484,183,470,617,64,24,69,93,385,344
2016-08-01 07:06:10,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,False,1,1,25570,-2562048.0,-2562048.0,1.0,1.0,1.0,1.0,2,0,0,5,76,3,36,0,29,62,11,2,1,6,0,24,3,0,178,1,0,5,2,2,42,591,228,571,765,24,9,29,38,288,108,290,375,0,0,5,3,230,87,233,302,492,186,478,627,88,33,93,126,296,36


In [28]:
for to_del in ["date", "sessionId", "visitId", "day"]:
    del train[to_del]
    del test[to_del]

In [33]:
excluded = ['date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 'visitId', 'visitStartTime', "month"]

cat_cols = [f for f in train.columns if (train[f].dtype == 'int64' and f not in excluded)]
real_cols = [f for f in train.columns if (not f in cat_cols and f not in excluded)]

In [32]:
from sklearn.metrics import mean_squared_error
def score(data, y):
    validation_res = pd.DataFrame(
    {"fullVisitorId": data["fullVisitorId"].values,
     "transactionRevenue": data["totals.transactionRevenue"].values,
     "predictedRevenue": np.expm1(y)})

    validation_res = validation_res.groupby("fullVisitorId")["transactionRevenue", "predictedRevenue"].sum().reset_index()
    return np.sqrt(mean_squared_error(np.log1p(validation_res["transactionRevenue"].values), 
                                     np.log1p(validation_res["predictedRevenue"].values)))

In [31]:
from sklearn.model_selection import GroupKFold

class KFoldValidation():
    def __init__(self, data, n_splits=5):
        unique_vis = np.array(sorted(data['fullVisitorId'].astype(str).unique()))
        folds = GroupKFold(n_splits)
        ids = np.arange(data.shape[0])
        
        self.fold_ids = []
        for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
            self.fold_ids.append([
                    ids[data['fullVisitorId'].astype(str).isin(unique_vis[trn_vis])],
                    ids[data['fullVisitorId'].astype(str).isin(unique_vis[val_vis])]
                ])
            
    def validate(self, train, test, features, model, name="", prepare_stacking=False, 
                 fit_params={"early_stopping_rounds": 50, "verbose": 100, "eval_metric": "rmse"}):
        model.FI = pd.DataFrame(index=features)
        full_score = 0
        
        if prepare_stacking:
            test[name] = 0
            train[name] = np.NaN
        
        for fold_id, (trn, val) in enumerate(self.fold_ids):
            devel = train[features].iloc[trn]
            y_devel = np.log1p(train["totals.transactionRevenue"].iloc[trn])
            valid = train[features].iloc[val]
            y_valid = np.log1p(train["totals.transactionRevenue"].iloc[val])
                       
            print("Fold ", fold_id, ":")
            model.fit(devel, y_devel, eval_set=[(valid, y_valid)], **fit_params)
            
            if len(model.feature_importances_) == len(features):  # some bugs in catboost?
                model.FI['fold' + str(fold_id)] = model.feature_importances_ / model.feature_importances_.sum()

            predictions = model.predict(valid)
            predictions[predictions < 0] = 0
            print("Fold ", fold_id, " error: ", mean_squared_error(y_valid, predictions)**0.5)
            
            fold_score = score(train.iloc[val], predictions)
            full_score += fold_score / len(self.fold_ids)
            print("Fold ", fold_id, " score: ", fold_score)
            
            if prepare_stacking:
                train[name].iloc[val] = predictions
                
                test_predictions = model.predict(test[features])
                test_predictions[test_predictions < 0] = 0
                test[name] += test_predictions / len(self.fold_ids)
                
        print("Final score: ", full_score)
        return full_score

In [35]:
Kfolder = KFoldValidation(train)

In [36]:
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, objective="regression", metric="rmse", num_leaves=31, min_child_samples=100,
                      learning_rate=0.03, bagging_fraction=0.7, feature_fraction=0.5, bagging_frequency=5, 
                      bagging_seed=2019, subsample=.9, colsample_bytree=.9, use_best_model=True)

In [37]:
Kfolder.validate(train, test, real_cols + cat_cols, lgbmodel, "lgbpred", prepare_stacking=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold  0 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.59824
[200]	valid_0's rmse: 1.57245
[300]	valid_0's rmse: 1.56688
[400]	valid_0's rmse: 1.56362
[500]	valid_0's rmse: 1.56296
[600]	valid_0's rmse: 1.56225
Early stopping, best iteration is:
[599]	valid_0's rmse: 1.56223
Fold  0  error:  1.5612291334966077
Fold  0  score:  1.5427372910573272


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold  1 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.57391
[200]	valid_0's rmse: 1.55275
[300]	valid_0's rmse: 1.54638
[400]	valid_0's rmse: 1.54373
[500]	valid_0's rmse: 1.54194
[600]	valid_0's rmse: 1.54169
Early stopping, best iteration is:
[562]	valid_0's rmse: 1.54146
Fold  1  error:  1.5406878739278502
Fold  1  score:  1.5442829895370724


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold  2 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.58613
[200]	valid_0's rmse: 1.56178
[300]	valid_0's rmse: 1.55494
[400]	valid_0's rmse: 1.55085
[500]	valid_0's rmse: 1.54949
[600]	valid_0's rmse: 1.54866
[700]	valid_0's rmse: 1.5478
Early stopping, best iteration is:
[709]	valid_0's rmse: 1.54749
Fold  2  error:  1.5463089321760883
Fold  2  score:  1.5356962603256186


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold  3 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.56907
[200]	valid_0's rmse: 1.54805
[300]	valid_0's rmse: 1.54233
[400]	valid_0's rmse: 1.53961
[500]	valid_0's rmse: 1.53822
[600]	valid_0's rmse: 1.53784
Early stopping, best iteration is:
[646]	valid_0's rmse: 1.5375
Fold  3  error:  1.5368247834215583
Fold  3  score:  1.5198222218819184


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold  4 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.60943
[200]	valid_0's rmse: 1.58822
[300]	valid_0's rmse: 1.58139
[400]	valid_0's rmse: 1.57857
[500]	valid_0's rmse: 1.57699
Early stopping, best iteration is:
[530]	valid_0's rmse: 1.57677
Fold  4  error:  1.5758534101467758
Fold  4  score:  1.5551039206370094


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Final score:  1.539528536687789


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1.539528536687789

In [38]:
def create_user_df(df):
    agg_data = df[real_cols + cat_cols + ['fullVisitorId']].groupby('fullVisitorId').mean()
    
    pred_list = df[['fullVisitorId', 'lgbpred']].groupby('fullVisitorId').apply(lambda visitor_df: list(visitor_df.lgbpred))\
        .apply(lambda x: {'pred_'+str(i): pred for i, pred in enumerate(x)})
    all_predictions = pd.DataFrame(list(pred_list.values), index=agg_data.index)
    feats = all_predictions.columns

    all_predictions['t_mean'] = all_predictions.mean(axis=1)
    all_predictions['t_median'] = all_predictions.median(axis=1)   
    all_predictions['t_sum_log'] = all_predictions.sum(axis=1)
    all_predictions['t_sum_act'] = all_predictions.fillna(0).sum(axis=1)
    all_predictions['t_nb_sess'] = all_predictions.isnull().sum(axis=1)

    full_data = pd.concat([agg_data, all_predictions], axis=1).astype(float)
    full_data['fullVisitorId'] = full_data.index
    del agg_data, all_predictions
    gc.collect()
    return full_data

In [39]:
user_train = create_user_df(train)
user_test = create_user_df(test)

In [40]:
features = list(user_train.columns)[:-1]  # don't include "fullVisitorId"
user_train["totals.transactionRevenue"] = train[['fullVisitorId', 'totals.transactionRevenue']].groupby('fullVisitorId').sum()

In [41]:
for f in features:
    if f not in user_test.columns:
        user_test[f] = np.nan

In [43]:
Kfolder = KFoldValidation(user_train)

In [42]:
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, objective="regression", metric="rmse", num_leaves=31, min_child_samples=100,
                      learning_rate=0.03, bagging_fraction=0.7, feature_fraction=0.5, bagging_frequency=5, 
                      bagging_seed=2019, subsample=.9, colsample_bytree=.9,
                            use_best_model=True)

In [46]:
Kfolder.validate(user_train, user_test, features, lgbmodel, name="lgbfinal", prepare_stacking=True)

Fold  0 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.48819
[200]	valid_0's rmse: 1.48064
Early stopping, best iteration is:
[174]	valid_0's rmse: 1.48042
Fold  0  error:  1.480310716426908
Fold  0  score:  1.480310716426908


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  1 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.50038
[200]	valid_0's rmse: 1.49667
Early stopping, best iteration is:
[163]	valid_0's rmse: 1.496
Fold  1  error:  1.4958713907486783
Fold  1  score:  1.4958713907486783


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  2 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.49529
[200]	valid_0's rmse: 1.48929
Early stopping, best iteration is:
[177]	valid_0's rmse: 1.48919
Fold  2  error:  1.4891261936372522
Fold  2  score:  1.4891261936372522


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  3 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.47502
[200]	valid_0's rmse: 1.46709
Early stopping, best iteration is:
[218]	valid_0's rmse: 1.46658
Fold  3  error:  1.4664990586658535
Fold  3  score:  1.4664990586658535


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  4 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.50449
[200]	valid_0's rmse: 1.49846
Early stopping, best iteration is:
[194]	valid_0's rmse: 1.49834
Fold  4  error:  1.4982036156738903
Fold  4  score:  1.4982036156738903


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Final score:  1.4860021950305162


1.4860021950305162

In [47]:
xgbmodel = xgb.XGBRegressor(max_depth=22, learning_rate=0.02, n_estimators=1000, 
                                         objective='reg:linear', gamma=1.45, seed=2019, silent=False,
                                        subsample=0.67, colsample_bytree=0.054, colsample_bylevel=0.50)

In [48]:
Kfolder.validate(user_train, user_test, features, xgbmodel, name="xgbfinal", prepare_stacking=True)

Fold  0 :
[0]	validation_0-rmse:2.10535
Will train until validation_0-rmse hasn't improved in 50 rounds.
[100]	validation_0-rmse:1.63463
[200]	validation_0-rmse:1.53977
[300]	validation_0-rmse:1.52037
[400]	validation_0-rmse:1.51347
[500]	validation_0-rmse:1.51045
Stopping. Best iteration:
[490]	validation_0-rmse:1.51039

Fold  0  error:  1.5101680021704056
Fold  0  score:  1.5101680021765678


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  1 :
[0]	validation_0-rmse:2.09722
Will train until validation_0-rmse hasn't improved in 50 rounds.
[100]	validation_0-rmse:1.6337
[200]	validation_0-rmse:1.5435
[300]	validation_0-rmse:1.52263
[400]	validation_0-rmse:1.5159
[500]	validation_0-rmse:1.51543
Stopping. Best iteration:
[483]	validation_0-rmse:1.51513

Fold  1  error:  1.5151103898340765
Fold  1  score:  1.5151103898336606


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  2 :
[0]	validation_0-rmse:2.09992
Will train until validation_0-rmse hasn't improved in 50 rounds.
[100]	validation_0-rmse:1.64067
[200]	validation_0-rmse:1.55025
[300]	validation_0-rmse:1.52394
[400]	validation_0-rmse:1.51702
[500]	validation_0-rmse:1.51514
[600]	validation_0-rmse:1.51473
Stopping. Best iteration:
[581]	validation_0-rmse:1.51452

Fold  2  error:  1.5142506676232634
Fold  2  score:  1.5142506676182685


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  3 :
[0]	validation_0-rmse:2.08557
Will train until validation_0-rmse hasn't improved in 50 rounds.
[100]	validation_0-rmse:1.61664
[200]	validation_0-rmse:1.52899
[300]	validation_0-rmse:1.50112
[400]	validation_0-rmse:1.4942
[500]	validation_0-rmse:1.4922
Stopping. Best iteration:
[526]	validation_0-rmse:1.49153

Fold  3  error:  1.491862465218526
Fold  3  score:  1.4918624652160057


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  4 :
[0]	validation_0-rmse:2.10398
Will train until validation_0-rmse hasn't improved in 50 rounds.
[100]	validation_0-rmse:1.64835
[200]	validation_0-rmse:1.5443
[300]	validation_0-rmse:1.52585
[400]	validation_0-rmse:1.5218
[500]	validation_0-rmse:1.52182
Stopping. Best iteration:
[452]	validation_0-rmse:1.52125

Fold  4  error:  1.521489171013235
Fold  4  score:  1.5214891710149276


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Final score:  1.510576139171886


1.510576139171886

In [49]:
catmodel = cat.CatBoostRegressor(iterations=500, learning_rate=0.2, depth=5, random_seed=2019)

In [50]:
Kfolder.validate(user_train, user_test, features, catmodel, name="catfinal", prepare_stacking=True,
                fit_params={"use_best_model": True, "verbose": 100})

Fold  0 :
0:	learn: 1.9234026	test: 1.9240888	best: 1.9240888 (0)	total: 474ms	remaining: 3m 56s
100:	learn: 1.4530950	test: 1.4840698	best: 1.4840216 (65)	total: 32.4s	remaining: 2m 8s
200:	learn: 1.4238170	test: 1.4843915	best: 1.4830691 (140)	total: 1m 6s	remaining: 1m 39s
300:	learn: 1.4001687	test: 1.4845804	best: 1.4830691 (140)	total: 1m 43s	remaining: 1m 8s
400:	learn: 1.3785922	test: 1.4858635	best: 1.4830691 (140)	total: 2m 18s	remaining: 34.2s
499:	learn: 1.3582505	test: 1.4891388	best: 1.4830691 (140)	total: 2m 51s	remaining: 0us

bestTest = 1.483069089
bestIteration = 140

Shrink model to first 141 iterations.
Fold  0  error:  1.4830135938670719
Fold  0  score:  1.4830135938670719


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  1 :
0:	learn: 1.9227131	test: 1.9291017	best: 1.9291017 (0)	total: 327ms	remaining: 2m 43s
100:	learn: 1.4522075	test: 1.5002223	best: 1.4993716 (50)	total: 32.1s	remaining: 2m 6s
200:	learn: 1.4220141	test: 1.5037917	best: 1.4993716 (50)	total: 1m 5s	remaining: 1m 37s
300:	learn: 1.3962026	test: 1.5050084	best: 1.4993716 (50)	total: 1m 40s	remaining: 1m 6s
400:	learn: 1.3745959	test: 1.5080885	best: 1.4993716 (50)	total: 2m 15s	remaining: 33.4s
499:	learn: 1.3533751	test: 1.5106947	best: 1.4993716 (50)	total: 2m 49s	remaining: 0us

bestTest = 1.49937164
bestIteration = 50

Shrink model to first 51 iterations.
Fold  1  error:  1.4993489399793964
Fold  1  score:  1.4993489399793964


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  2 :
0:	learn: 1.9224173	test: 1.9273421	best: 1.9273421 (0)	total: 306ms	remaining: 2m 32s
100:	learn: 1.4533871	test: 1.4932324	best: 1.4930465 (77)	total: 33.3s	remaining: 2m 11s
200:	learn: 1.4230808	test: 1.4933615	best: 1.4916843 (140)	total: 1m 7s	remaining: 1m 40s
300:	learn: 1.3970827	test: 1.4944373	best: 1.4916843 (140)	total: 1m 41s	remaining: 1m 7s
400:	learn: 1.3757515	test: 1.4973363	best: 1.4916843 (140)	total: 2m 16s	remaining: 33.8s
499:	learn: 1.3549165	test: 1.4994177	best: 1.4916843 (140)	total: 2m 50s	remaining: 0us

bestTest = 1.491684288
bestIteration = 140

Shrink model to first 141 iterations.
Fold  2  error:  1.491614108510809
Fold  2  score:  1.491614108510809


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  3 :
0:	learn: 1.9270994	test: 1.9105590	best: 1.9105590 (0)	total: 304ms	remaining: 2m 31s
100:	learn: 1.4571177	test: 1.4682382	best: 1.4679803 (88)	total: 32.4s	remaining: 2m 7s
200:	learn: 1.4276607	test: 1.4712191	best: 1.4679803 (88)	total: 1m 6s	remaining: 1m 38s
300:	learn: 1.4014522	test: 1.4736624	best: 1.4679803 (88)	total: 1m 39s	remaining: 1m 5s
400:	learn: 1.3798644	test: 1.4753156	best: 1.4679803 (88)	total: 2m 14s	remaining: 33.2s
499:	learn: 1.3591382	test: 1.4784413	best: 1.4679803 (88)	total: 2m 48s	remaining: 0us

bestTest = 1.467980325
bestIteration = 88

Shrink model to first 89 iterations.
Fold  3  error:  1.4679458433462738
Fold  3  score:  1.4679458433462738


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Fold  4 :
0:	learn: 1.9213933	test: 1.9309091	best: 1.9309091 (0)	total: 302ms	remaining: 2m 30s
100:	learn: 1.4511080	test: 1.5008525	best: 1.4993232 (77)	total: 32.4s	remaining: 2m 8s
200:	learn: 1.4234752	test: 1.5045519	best: 1.4993232 (77)	total: 1m 5s	remaining: 1m 37s
300:	learn: 1.3996346	test: 1.5054898	best: 1.4993232 (77)	total: 1m 39s	remaining: 1m 5s
400:	learn: 1.3782761	test: 1.5082235	best: 1.4993232 (77)	total: 2m 16s	remaining: 33.7s
499:	learn: 1.3575206	test: 1.5114967	best: 1.4993232 (77)	total: 2m 49s	remaining: 0us

bestTest = 1.499323211
bestIteration = 77

Shrink model to first 78 iterations.
Fold  4  error:  1.4992882578623832
Fold  4  score:  1.4992882578623832


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Final score:  1.4882421487131867


1.4882421487131867

In [51]:
user_train['PredictedLogRevenue'] = 0.4 * user_train["lgbfinal"] + \
                                    0.2 * user_train["xgbfinal"] + \
                                    0.4 * user_train["catfinal"]
score(user_train, user_train.PredictedLogRevenue)

Defaulting to column, but this will raise an ambiguity error in a future version
  


1.4846617492696768

In [52]:
user_test['PredictedLogRevenue'] = 0.4 * user_test["lgbfinal"] +  0.4 * user_test["catfinal"] + 0.2 * user_test["xgbfinal"]
user_test[['PredictedLogRevenue']].to_csv('submission.csv', index=True)

In [None]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

train.visitStartTime = pd.to_datetime(train.visitStartTime, unit='s')
test.visitStartTime = pd.to_datetime(test.visitStartTime, unit='s')
train["date"] = train.visitStartTime
test["date"] = test.visitStartTime

train['hits_by_pageviews'] = train['totals.hits'].astype(float) / train['totals.pageviews'].astype(float)
test['hits_by_pageviews'] = test['totals.hits'].astype(float) / test['totals.pageviews'].astype(float)

for df in [train, test]:
    df['weekday'] = df['date'].dt.dayofweek.astype(object)
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.date
    
train["totals.transactionRevenue"].fillna(0, inplace=True)

df = pd.concat([train, test])
df.sort_values(['fullVisitorId', 'date'], ascending=True, inplace=True)
df['prev_session'] = (df['date'] - df[['fullVisitorId', 'date']].groupby('fullVisitorId')['date'].shift(1)).astype(np.int64) 
df['next_session'] = (df['date'] - df[['fullVisitorId', 'date']].groupby('fullVisitorId')['date'].shift(-1)).astype(np.int64) 
df.sort_index(inplace=True)

train = df[:len(train)]
test = df[len(train):]

for df in [train, test]:
    df['source.country'] = df['trafficSource.source'] + '_' + df['geoNetwork.country']
    df['campaign.medium'] = df['trafficSource.campaign'] + '_' + df['trafficSource.medium']
    df['browser.category'] = df['device.browser'] + '_' + df['device.deviceCategory']
    df['browser.os'] = df['device.browser'] + '_' + df['device.operatingSystem']
    
for df in [train, test]:
    df['device_deviceCategory_channelGrouping'] = df['device.deviceCategory'] + "_" + df['channelGrouping']
    df['channelGrouping_browser'] = df['device.browser'] + "_" + df['channelGrouping']
    df['channelGrouping_OS'] = df['device.operatingSystem'] + "_" + df['channelGrouping']
    
    for i in ['geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country','geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region','geoNetwork.subContinent']:
        for j in ['device.browser','device.deviceCategory', 'device.operatingSystem', 'trafficSource.source']:
            df[i + "_" + j] = df[i] + "_" + df[j]
    
    df['content.source'] = df['trafficSource.adContent'].astype(str) + "_" + df['source.country']
    df['medium.source'] = df['trafficSource.medium'] + "_" + df['source.country']

In [None]:
excluded = ['date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 'visitId', 'visitStartTime', 'month', 'day', 'hits_by_pageviews']

cat_cols = [f for f in train.columns if (train[f].dtype == 'object' and f not in excluded)]
real_cols = [f for f in train.columns if (not f in cat_cols and f not in excluded)]

from sklearn.preprocessing import LabelEncoder
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(train[col].values.astype('str')) + list(test[col].values.astype('str')))
    train[col] = lbl.transform(list(train[col].values.astype('str')))
    test[col] = lbl.transform(list(test[col].values.astype('str')))
    
for col in real_cols:
    train[col] = train[col].astype(float)
    test[col] = test[col].astype(float)

In [None]:
for to_del in ["date", "sessionId", "visitId", "day"]:
    del train[to_del]
    del test[to_del]

In [None]:
excluded = ['date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 'visitId', 'visitStartTime', 'month', 'hits_by_pageviews']

cat_cols = [f for f in train.columns if (train[f].dtype == 'int64' and f not in excluded)]
real_cols = [f for f in train.columns if (not f in cat_cols and f not in excluded)]

In [None]:
from sklearn.metrics import mean_squared_error
def score(data, y):
    validation_res = pd.DataFrame(
    {"fullVisitorId": data["fullVisitorId"].values,
     "transactionRevenue": data["totals.transactionRevenue"].values,
     "predictedRevenue": np.expm1(y)})

    validation_res = validation_res.groupby("fullVisitorId")["transactionRevenue", "predictedRevenue"].sum().reset_index()
    return np.sqrt(mean_squared_error(np.log1p(validation_res["transactionRevenue"].values), 
                                     np.log1p(validation_res["predictedRevenue"].values)))

In [None]:
from sklearn.model_selection import GroupKFold

class KFoldValidation():
    def __init__(self, data, n_splits=5):
        unique_vis = np.array(sorted(data['fullVisitorId'].astype(str).unique()))
        folds = GroupKFold(n_splits)
        ids = np.arange(data.shape[0])
        
        self.fold_ids = []
        for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
            self.fold_ids.append([
                    ids[data['fullVisitorId'].astype(str).isin(unique_vis[trn_vis])],
                    ids[data['fullVisitorId'].astype(str).isin(unique_vis[val_vis])]
                ])
            
    def validate(self, train, test, features, model, name="", prepare_stacking=False, 
                 fit_params={"early_stopping_rounds": 50, "verbose": 100, "eval_metric": "rmse"}):
        model.FI = pd.DataFrame(index=features)
        full_score = 0
        
        if prepare_stacking:
            test[name] = 0
            train[name] = np.NaN
        
        for fold_id, (trn, val) in enumerate(self.fold_ids):
            devel = train[features].iloc[trn]
            y_devel = np.log1p(train["totals.transactionRevenue"].iloc[trn])
            valid = train[features].iloc[val]
            y_valid = np.log1p(train["totals.transactionRevenue"].iloc[val])
                       
            print("Fold ", fold_id, ":")
            model.fit(devel, y_devel, eval_set=[(valid, y_valid)], **fit_params)
            
            if len(model.feature_importances_) == len(features):  # some bugs in catboost?
                model.FI['fold' + str(fold_id)] = model.feature_importances_ / model.feature_importances_.sum()

            predictions = model.predict(valid)
            predictions[predictions < 0] = 0
            print("Fold ", fold_id, " error: ", mean_squared_error(y_valid, predictions)**0.5)
            
            fold_score = score(train.iloc[val], predictions)
            full_score += fold_score / len(self.fold_ids)
            print("Fold ", fold_id, " score: ", fold_score)
            
            if prepare_stacking:
                train[name].iloc[val] = predictions
                
                test_predictions = model.predict(test[features])
                test_predictions[test_predictions < 0] = 0
                test[name] += test_predictions / len(self.fold_ids)
                
        print("Final score: ", full_score)
        return full_score

In [None]:
Kfolder = KFoldValidation(train)

In [None]:
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, objective="regression", metric="rmse", num_leaves=31, min_child_samples=100,
                      learning_rate=0.03, bagging_fraction=0.7, feature_fraction=0.5, bagging_frequency=5, 
                      bagging_seed=2019, subsample=.9, colsample_bytree=.9, use_best_model=True)

In [None]:
Kfolder.validate(train, test, real_cols + cat_cols, lgbmodel, "lgbpred", prepare_stacking=True)

In [None]:
def create_user_df(df):
    agg_data = df[real_cols + cat_cols + ['fullVisitorId']].groupby('fullVisitorId').mean()
    
    pred_list = df[['fullVisitorId', 'lgbpred']].groupby('fullVisitorId').apply(lambda visitor_df: list(visitor_df.lgbpred))\
        .apply(lambda x: {'pred_'+str(i): pred for i, pred in enumerate(x)})
    all_predictions = pd.DataFrame(list(pred_list.values), index=agg_data.index)
    feats = all_predictions.columns

    all_predictions['t_mean'] = all_predictions.mean(axis=1)
    all_predictions['t_median'] = all_predictions.median(axis=1)  
    all_predictions['t_sum_log'] = all_predictions.sum(axis=1)
    all_predictions['t_sum_act'] = all_predictions.fillna(0).sum(axis=1)
    all_predictions['t_nb_sess'] = all_predictions.isnull().sum(axis=1)

    full_data = pd.concat([agg_data, all_predictions], axis=1).astype(float)
    full_data['fullVisitorId'] = full_data.index
    del agg_data, all_predictions
    gc.collect()
    return full_data

In [None]:
user_train = create_user_df(train)
user_test = create_user_df(test)

In [None]:
features = list(user_train.columns)[:-1]  # don't include "fullVisitorId"
user_train["totals.transactionRevenue"] = train[['fullVisitorId', 'totals.transactionRevenue']].groupby('fullVisitorId').sum()

In [None]:
for f in features:
    if f not in user_test.columns:
        user_test[f] = np.nan

In [None]:
Kfolder = KFoldValidation(user_train)

In [None]:
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, objective="regression", metric="rmse", num_leaves=31, min_child_samples=100,
                      learning_rate=0.03, bagging_fraction=0.7, feature_fraction=0.5, bagging_frequency=5, 
                      bagging_seed=2019, subsample=.9, colsample_bytree=.9,
                            use_best_model=True)

In [None]:
Kfolder.validate(user_train, user_test, features, lgbmodel, name="lgbfinal", prepare_stacking=True)

In [None]:
xgbmodel = xgb.XGBRegressor(max_depth=22, learning_rate=0.02, n_estimators=1000, 
                                         objective='reg:linear', gamma=1.45, seed=2019, silent=False,
                                        subsample=0.67, colsample_bytree=0.054, colsample_bylevel=0.50)

In [None]:
Kfolder.validate(user_train, user_test, features, xgbmodel, name="xgbfinal", prepare_stacking=True)

In [None]:
catmodel = cat.CatBoostRegressor(iterations=500, learning_rate=0.2, depth=5, random_seed=2019)

In [None]:
Kfolder.validate(user_train, user_test, features, catmodel, name="catfinal", prepare_stacking=True,
                fit_params={"use_best_model": True, "verbose": 100})

In [None]:
user_train['PredictedLogRevenue'] = 0.4 * user_train["lgbfinal"] + \
                                    0.2 * user_train["xgbfinal"] + \
                                    0.4 * user_train["catfinal"]
score(user_train, user_train.PredictedLogRevenue)

In [None]:
user_test['PredictedLogRevenue'] = 0.4 * user_test["lgbfinal"] +  0.4 * user_test["catfinal"] + 0.2 * user_test["xgbfinal"]

In [None]:
user_test[['PredictedLogRevenue']].to_csv('leaky submission.csv', index=True)

In [None]:
user_test.shape