In [1]:
import numpy as np
import pandas as pd
import json
from pandas.io.json import json_normalize
import time
from datetime import datetime
import gc
import psutil
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools

# Notebook display configuration.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Render up to 500 rows / columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
# Load the flattened train and test session tables (same reader options for both).
train, test = [
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("input/flattened/train.csv", "input/flattened/test.csv")
]

In [3]:
def feature_summary(df_fa):
    """Print the shape of ``df_fa`` and return a per-column overview.

    The returned DataFrame is indexed by the column names of ``df_fa`` and has
    four columns:

    * ``Null`` - number of null entries in the column,
    * ``Unique_Count`` - number of distinct values as returned by
      ``Series.unique`` (a null counts as one value),
    * ``Data_type`` - the column dtype,
    * ``Sample_values`` - the list of distinct values.

    Any cell of the overview that is still missing is replaced by ``'-'``.
    """
    n_rows, n_cols = df_fa.shape[0], df_fa.shape[1]
    print('DataFrame shape')
    print('rows:', n_rows)
    print('cols:', n_cols)

    summary = pd.DataFrame(
        index=df_fa.columns,
        columns=['Null', 'Unique_Count', 'Data_type', 'Sample_values'],
    )
    # Distinct values are collected once per column and reused for both the
    # count and the sample list.
    distinct = {name: list(df_fa[name].unique()) for name in df_fa.columns}

    summary['Null'] = [int(df_fa[name].isnull().sum()) for name in df_fa.columns]
    summary['Unique_Count'] = [len(distinct[name]) for name in df_fa.columns]
    summary['Data_type'] = [df_fa[name].dtype for name in df_fa.columns]
    for name in df_fa.columns:
        # .at writes the whole list into a single cell.
        summary.at[name, 'Sample_values'] = distinct[name]

    return summary.fillna('-')

In [4]:
# Columns removed from both frames.
to_drop = [
    "socialEngagementType",
    "device.browserSize",
    "device.browserVersion",
    "device.flashVersion",
    "device.language",
    "device.mobileDeviceBranding",
    "device.mobileDeviceInfo",
    "device.mobileDeviceMarketingName",
    "device.mobileDeviceModel",
    "device.mobileInputSelector",
    "device.operatingSystemVersion",
    "device.screenColors",
    "device.screenResolution",
    "geoNetwork.cityId",
    "geoNetwork.latitude",
    "geoNetwork.longitude",
    "geoNetwork.networkLocation",
    "totals.visits",
    "trafficSource.adwordsClickInfo.criteriaParameters",
]
# "trafficSource.campaignCode" is additionally dropped from train only.
for frame, extra_cols in ((train, ["trafficSource.campaignCode"]), (test, [])):
    frame.drop(to_drop + extra_cols, axis=1, inplace=True)

In [5]:
# Interpret visitStartTime as seconds since the epoch and keep a copy of the
# parsed timestamps in a separate "date" column.
for frame in (train, test):
    frame["visitStartTime"] = pd.to_datetime(frame["visitStartTime"], unit='s')
    frame["date"] = frame["visitStartTime"]

In [6]:
# Use the visit start time as the index of both frames and order rows by it.
for frame in (train, test):
    frame.set_index("visitStartTime", inplace=True)
    frame.sort_index(inplace=True)

In [7]:
# Impute missing values in train and test.
#
# Every column is re-assigned (frame[col] = frame[col].fillna(...)) instead of
# calling fillna(inplace=True) on a column selection: the chained in-place form
# operates on a temporary object under pandas copy-on-write and can silently
# leave the frame unchanged.
unknown_fill_cols = [
    'trafficSource.keyword',
    'trafficSource.referralPath',
    'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.adContent',
]
# Counter columns: value used when missing; the column is then cast to int.
count_fill = [
    ('totals.pageviews', 1),
    ('totals.newVisits', 0),
    ('totals.bounces', 0),
]

for frame in (train, test):
    for col in unknown_fill_cols:
        frame[col] = frame[col].fillna('unknown')
    for col, default in count_fill:
        frame[col] = frame[col].fillna(default).astype(int)
    # NOTE(review): the fill value is the *string* 'True', as in the original
    # cell - confirm whether a boolean was intended.
    frame['trafficSource.adwordsClickInfo.isVideoAd'] = (
        frame['trafficSource.adwordsClickInfo.isVideoAd'].fillna('True')
    )
    frame['trafficSource.isTrueDirect'] = frame['trafficSource.isTrueDirect'].fillna(False)

# The revenue target is filled in train only in this cell.
train["totals.transactionRevenue"] = train["totals.transactionRevenue"].fillna(0.0)

In [8]:
# Display the per-column overview (null count, distinct count, dtype, sample
# values) of the training frame after imputation.
feature_summary(train)

DataFrame shape
rows: 903653
cols: 34


Unnamed: 0,Null,Unique_Count,Data_type,Sample_values
channelGrouping,0,8,object,"[Direct, Social, Organic Search, Referral, Dis..."
date,0,887159,datetime64[ns],"[2016-08-01T07:00:12.000000000, 2016-08-01T07:..."
fullVisitorId,0,714167,object,"[0423043652415339154, 8294721032567046680, 771..."
sessionId,0,902755,object,"[0423043652415339154_1470034812, 8294721032567..."
visitId,0,886303,int64,"[1470034812, 1470035066, 1470035081, 147003516..."
visitNumber,0,384,int64,"[3, 1, 5, 2, 7, 6, 96, 16, 12, 4, 9, 22, 10, 2..."
device.browser,0,54,object,"[Safari, Chrome, Amazon Silk, Firefox, Interne..."
device.deviceCategory,0,3,object,"[mobile, desktop, tablet]"
device.isMobile,0,2,bool,"[True, False]"
device.operatingSystem,0,20,object,"[iOS, Windows, Android, Macintosh, Linux, Chro..."


In [9]:
# Collapse browser variants into a parent family.
# One shared mapping is applied to both frames so that train and test are
# transformed identically (previously 'Samsung Internet' was remapped in test
# only, leaving the two frames inconsistent).
browser_aliases = {
    'Edge': 'Internet Explorer',
    'Safari (in-app)': 'Safari',
    'Android Browser': 'Android Webview',
    'Samsung Internet': 'Android Webview',
    'Opera Mini': 'Opera',
}
for frame in (train, test):
    frame['device.browser'] = frame['device.browser'].replace(browser_aliases)

In [10]:
# Browsers that keep their own category; any other value becomes 'Other'.
col = ['Chrome', 'Safari', 'Firefox', 'Android Webview', 'Internet Explorer', 'Opera', 'UC Browser', 'YaBrowser', 'Amazon Silk', 'Coc Coc']

for frame in (train, test):
    not_listed = ~frame['device.browser'].isin(col)
    frame.loc[not_listed, 'device.browser'] = 'Other'

In [11]:
# Operating systems that keep their own category; any other value becomes 'Other'.
col = ['Windows', 'Macintosh', 'Chrome OS', 'Android', 'iOS', 'Linux', 'Samsung', 'Windows Phone', 'Tizen', 'BlackBerry']

for frame in (train, test):
    not_listed = ~frame['device.operatingSystem'].isin(col)
    frame.loc[not_listed, 'device.operatingSystem'] = 'Other'

In [12]:
# Cities seen more than 1000 times across train + test keep their name;
# every other city is relabelled 'Other'.
vc = pd.concat([train['geoNetwork.city'], test['geoNetwork.city']], sort=False).value_counts()
common = set(vc.index[vc > 1000].values)
for frame in (train, test):
    frame.loc[~frame['geoNetwork.city'].isin(common), 'geoNetwork.city'] = 'Other'

In [13]:
def clearRare(columnname, limit = 1000):
    """Relabel infrequent categories of ``columnname`` as ``'Other'``.

    Frequencies are counted over the module-level ``train`` and ``test``
    frames together. A value is kept only if it occurs more than ``limit``
    times; all other values are overwritten in place in both frames. The
    number of relabelled categories and the resulting number of categories in
    ``train`` are printed.
    """
    counts = pd.concat([train[columnname], test[columnname]], sort=False).value_counts()
    frequent = set(counts.index[counts > limit].values)
    n_rare = (counts <= limit).sum()
    print("Set", n_rare, columnname, "categories to 'other';", end=" ")

    for frame in (train, test):
        frame.loc[~frame[columnname].isin(frequent), columnname] = 'Other'
    print("now there are", train[columnname].nunique(), "categories in train")

In [14]:
# Apply the rare-category grouping (default limit) to each of these columns,
# in this order.
for rare_col in [
    "geoNetwork.country",
    "geoNetwork.metro",
    "geoNetwork.networkDomain",
    "geoNetwork.region",
    "geoNetwork.subContinent",
    "trafficSource.adContent",
    "trafficSource.campaign",
    "trafficSource.keyword",
    "trafficSource.medium",
    "trafficSource.referralPath",
    "trafficSource.source",
]:
    clearRare(rare_col)

Set 147 geoNetwork.country categories to 'other'; now there are 82 categories in train
Set 97 geoNetwork.metro categories to 'other'; now there are 27 categories in train
Set 41850 geoNetwork.networkDomain categories to 'other'; now there are 133 categories in train
Set 391 geoNetwork.region categories to 'other'; now there are 92 categories in train
Set 4 geoNetwork.subContinent categories to 'other'; now there are 20 categories in train
Set 70 trafficSource.adContent categories to 'other'; now there are 5 categories in train
Set 27 trafficSource.campaign categories to 'other'; now there are 5 categories in train
Set 5382 trafficSource.keyword categories to 'other'; now there are 12 categories in train
Set 1 trafficSource.medium categories to 'other'; now there are 7 categories in train
Set 3144 trafficSource.referralPath categories to 'other'; now there are 45 categories in train
Set 473 trafficSource.source categories to 'other'; now there are 28 categories in train


In [15]:
# Pairwise categorical interactions: new column = "<left>_<right>".
interaction_specs = [
    ('source_country', 'trafficSource.source', 'geoNetwork.country'),
    ('campaign_medium', 'trafficSource.campaign', 'trafficSource.medium'),
    ('browser_category', 'device.browser', 'device.deviceCategory'),
    ('browser_os', 'device.browser', 'device.operatingSystem'),
    ('region_category', 'geoNetwork.region', 'device.deviceCategory'),
    ('metro_category', 'geoNetwork.metro', 'device.deviceCategory'),
    ('country_category', 'geoNetwork.country', 'device.deviceCategory'),
    ('subContinent_operatingSystem', 'geoNetwork.subContinent', 'device.operatingSystem'),
]
for df in [train, test]:
    for new_col, left, right in interaction_specs:
        df[new_col] = df[left] + '_' + df[right]

In [16]:
# For each geographic level, attach the number of distinct visitors that share
# the row's value at that level (computed separately within each frame).
geo_count_specs = [
    ('continent_unique_user_count', 'geoNetwork.continent'),
    ('country_unique_user_count', 'geoNetwork.country'),
    ('subcontinent_unique_user_count', 'geoNetwork.subContinent'),
]
for df in [train, test]:
    for new_col, geo_col in geo_count_specs:
        df[new_col] = df.groupby(geo_col)['fullVisitorId'].transform('nunique')

In [17]:
# Per-visitor aggregates computed over train + test together and mapped back
# onto every session row: (column prefix, source column, aggregation).
user_aggregates = [
    ("usermean_", "totals.hits", "mean"),
    ("usermean_", "totals.pageviews", "mean"),
    ("usermax_", "visitNumber", "max"),
]
for prefix, feature, how in user_aggregates:
    info = pd.concat([train, test], sort=False).groupby("fullVisitorId")[feature].agg(how)
    for frame in (train, test):
        frame[prefix + feature] = frame["fullVisitorId"].map(info)

In [18]:
# Flag rows whose visitId (read as epoch seconds) differs from the session
# timestamp, and count how often each visitId / sessionId occurs in the frame.
occurrence_specs = (("visitId_dublicates", "visitId"), ("session_dublicates", "sessionId"))
for df in [train, test]:
    visit_stamp = pd.to_datetime(df["visitId"], unit='s')
    df["id_incoherence"] = visit_stamp != df["date"]
    for new_col, key_col in occurrence_specs:
        df[new_col] = df[key_col].map(df[key_col].value_counts())

In [19]:
# Day of week (stored as object dtype) and second-of-day of each session.
for df in [train, test]:
    stamp = df['date'].dt
    df['weekday'] = stamp.dayofweek.astype(object)
    df['time'] = stamp.second + stamp.minute * 60 + stamp.hour * 3600

In [20]:
# Hours between each session and the same visitor's previous / next session,
# computed over train and test together.
n_train = len(train)
# sort=False is passed explicitly (as in the other concat calls of this
# notebook) so the column order does not depend on the pandas default.
df = pd.concat([train, test], sort=False)
# Remember which rows came from train. The frames are split back by this flag
# rather than by position, which would only be correct if every train session
# preceded every test session after the time sort below.
df['_is_train'] = np.arange(len(df)) < n_train

df.sort_values(['fullVisitorId', 'date'], ascending=True, inplace=True)
dates_by_visitor = df.groupby('fullVisitorId')['date']
prev_date = dates_by_visitor.shift(1)
next_date = dates_by_visitor.shift(-1)
# Whole hours, floored. A visitor's first (last) session has no previous
# (next) session: the difference is NaT and the feature is NaN, instead of the
# huge negative number obtained by reinterpreting NaT as an int64.
df['prev_session'] = (df['date'] - prev_date).dt.total_seconds() // 3600
df['next_session'] = (df['date'] - next_date).dt.total_seconds() // 3600
# Stable sort so rows with equal timestamps keep a deterministic order.
df.sort_index(kind='mergesort', inplace=True)

# drop() returns new frames, so later column assignments do not write into
# slices of df.
train = df[df['_is_train']].drop('_is_train', axis=1)
test = df[~df['_is_train']].drop('_is_train', axis=1)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [28]:
# Preview the first ten rows of some of the engineered interaction and
# unique-visitor-count columns.
train[['source_country', 'campaign_medium', 'browser_category', 'browser_os', 'region_category', 'metro_category', 'continent_unique_user_count', 'country_unique_user_count']].head(10)

Unnamed: 0_level_0,source_country,campaign_medium,browser_category,browser_os,region_category,metro_category,continent_unique_user_count,country_unique_user_count
visitStartTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-08-01 07:00:12,(direct)_United States,(not set)_(none),Safari_mobile,Safari_iOS,not available in demo dataset_mobile,not available in demo dataset_mobile,323208,251830
2016-08-01 07:04:26,youtube.com_Thailand,(not set)_referral,Chrome_desktop,Chrome_Windows,not available in demo dataset_desktop,not available in demo dataset_desktop,196416,18792
2016-08-01 07:04:41,google_United States,(not set)_organic,Amazon Silk_tablet,Amazon Silk_Android,not available in demo dataset_tablet,not available in demo dataset_tablet,323208,251830
2016-08-01 07:06:01,google_Canada,(not set)_organic,Chrome_desktop,Chrome_Windows,not available in demo dataset_desktop,not available in demo dataset_desktop,323208,19356
2016-08-01 07:06:10,(direct)_Philippines,(not set)_(none),Chrome_desktop,Chrome_Windows,Metro Manila_desktop,(not set)_desktop,196416,8153
2016-08-01 07:08:12,(direct)_United States,(not set)_(none),Safari_mobile,Safari_iOS,California_mobile,San Francisco-Oakland-San Jose CA_mobile,323208,251830
2016-08-01 07:10:29,(direct)_United States,(not set)_(none),Safari_tablet,Safari_iOS,California_tablet,San Francisco-Oakland-San Jose CA_tablet,323208,251830
2016-08-01 07:10:57,google_United States,(not set)_organic,Chrome_mobile,Chrome_Android,California_mobile,San Francisco-Oakland-San Jose CA_mobile,323208,251830
2016-08-01 07:11:41,youtube.com_United States,(not set)_referral,Chrome_desktop,Chrome_Windows,not available in demo dataset_desktop,not available in demo dataset_desktop,323208,251830
2016-08-01 07:12:01,(direct)_United Kingdom,(not set)_(none),Safari_mobile,Safari_iOS,not available in demo dataset_mobile,not available in demo dataset_mobile,167966,31653


In [253]:
# Fill the revenue column of test with 0.0. The column is re-assigned because
# fillna(inplace=True) on a column selection acts on a temporary object under
# pandas copy-on-write and may leave `test` unchanged.
test["totals.transactionRevenue"] = test["totals.transactionRevenue"].fillna(0.0)

In [255]:
# Identifier, timestamp and target columns that are never used as features.
excluded = ['date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 'visitId', 'visitStartTime']

# Remaining columns: object dtype -> categorical, everything else -> real.
cat_cols, real_cols = [], []
for f in train.columns:
    if f in excluded:
        continue
    if train[f].dtype == 'object':
        cat_cols.append(f)
    else:
        real_cols.append(f)

In [256]:
# Label-encode every categorical column with an encoder fitted on the union of
# the train and test values (both cast to str), so the two frames share codes.
total = len(cat_cols)
cur = 0
for col in cat_cols:
    train_as_str = train[col].values.astype('str')
    test_as_str = test[col].values.astype('str')
    lbl = LabelEncoder()
    lbl.fit(np.concatenate([train_as_str, test_as_str]))
    train[col] = lbl.transform(train_as_str)
    test[col] = lbl.transform(test_as_str)
    cur += 1
    print("total/cur:%d/%d" % (total,cur))
print('done')

LabelEncoder()

total/cur:27/1


LabelEncoder()

total/cur:27/2


LabelEncoder()

total/cur:27/3


LabelEncoder()

total/cur:27/4


LabelEncoder()

total/cur:27/5


LabelEncoder()

total/cur:27/6


LabelEncoder()

total/cur:27/7


LabelEncoder()

total/cur:27/8


LabelEncoder()

total/cur:27/9


LabelEncoder()

total/cur:27/10


LabelEncoder()

total/cur:27/11


LabelEncoder()

total/cur:27/12


LabelEncoder()

total/cur:27/13


LabelEncoder()

total/cur:27/14


LabelEncoder()

total/cur:27/15


LabelEncoder()

total/cur:27/16


LabelEncoder()

total/cur:27/17


LabelEncoder()

total/cur:27/18


LabelEncoder()

total/cur:27/19


LabelEncoder()

total/cur:27/20


LabelEncoder()

total/cur:27/21


LabelEncoder()

total/cur:27/22


LabelEncoder()

total/cur:27/23


LabelEncoder()

total/cur:27/24


LabelEncoder()

total/cur:27/25


LabelEncoder()

total/cur:27/26


LabelEncoder()

total/cur:27/27
done


In [257]:
# Cast every non-categorical feature column to float in both frames.
for frame in (train, test):
    for col in real_cols:
        frame[col] = frame[col].astype(float)

In [258]:
# Remove the session / visit identifier columns from both frames.
for frame in (train, test):
    for to_del in ("sessionId", "visitId"):
        del frame[to_del]

In [260]:
# Recompute the feature lists on the current dtypes: int64 columns are
# treated as categorical, every other non-excluded column as real.
excluded = ['date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 'visitId', 'visitStartTime']

candidate_cols = [f for f in train.columns if f not in excluded]
cat_cols = [f for f in candidate_cols if train[f].dtype == 'int64']
real_cols = [f for f in candidate_cols if f not in cat_cols]

In [261]:
from sklearn.metrics import mean_squared_error
def score(data, y):
    """Return the visitor-level RMSE between log1p of actual and predicted revenue.

    Parameters
    ----------
    data : DataFrame
        Must contain ``fullVisitorId`` and ``totals.transactionRevenue``.
    y : array-like
        One prediction per row of ``data`` on the log1p scale; ``np.expm1`` is
        applied before the per-visitor sum.

    Returns
    -------
    float
        Root mean squared error of ``log1p(sum of revenue per visitor)``
        versus ``log1p(sum of predicted revenue per visitor)``.
    """
    validation_res = pd.DataFrame(
    {"fullVisitorId": data["fullVisitorId"].values,
     "transactionRevenue": data["totals.transactionRevenue"].values,
     # np.asarray: a Series passed as `y` must be matched to the rows by
     # position, not by its own index.
     "predictedRevenue": np.expm1(np.asarray(y))})

    # Columns are selected with a list; selecting several columns of a groupby
    # with a bare tuple is no longer supported by pandas 2.x.
    per_visitor = validation_res.groupby("fullVisitorId")[["transactionRevenue", "predictedRevenue"]].sum().reset_index()
    log_error = (np.log1p(per_visitor["transactionRevenue"].values)
                 - np.log1p(per_visitor["predictedRevenue"].values))
    return np.sqrt(np.mean(np.square(log_error)))

In [262]:
from sklearn.model_selection import GroupKFold

class KFoldValidation():
    """K-fold cross-validation in which all rows of a visitor share a fold.

    The folds are built once in ``__init__`` and stored in ``self.fold_ids``
    as ``[train_positions, validation_positions]`` pairs of positional row
    indices into the frame given to the constructor.
    """

    def __init__(self, data, n_splits=5):
        """Build ``n_splits`` folds over the distinct ``fullVisitorId`` values of ``data``."""
        # The string view of the id column is computed once and reused.
        visitor_ids = data['fullVisitorId'].astype(str)
        unique_vis = np.array(sorted(visitor_ids.unique()))
        folds = GroupKFold(n_splits)
        ids = np.arange(data.shape[0])

        self.fold_ids = []
        # Each distinct visitor is its own group, so the split is over visitors.
        for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
            self.fold_ids.append([
                    ids[visitor_ids.isin(unique_vis[trn_vis]).values],
                    ids[visitor_ids.isin(unique_vis[val_vis]).values]
                ])

    def validate(self, train, test, features, model, name="", prepare_stacking=False,
                 fit_params=None):
        """Fit ``model`` on every fold and return the mean fold score.

        Parameters
        ----------
        train, test : DataFrame
            ``train`` must contain ``features``, ``fullVisitorId`` and
            ``totals.transactionRevenue``; ``test`` must contain ``features``.
            With ``prepare_stacking`` both frames are modified in place.
        features : list
            Feature column names passed to the model.
        model : estimator
            Must provide ``fit(X, y, eval_set=..., **fit_params)``,
            ``predict`` and ``feature_importances_``. A ``FI`` DataFrame with
            normalised per-fold importances is attached to it.
        name : str
            Column that receives the predictions when ``prepare_stacking``.
        prepare_stacking : bool
            If true, write out-of-fold predictions to ``train[name]`` and the
            fold-averaged test predictions to ``test[name]``.
        fit_params : dict, optional
            Extra keyword arguments for ``model.fit``. Defaults to early
            stopping after 50 rounds, verbosity 100 and the rmse metric.

        Returns
        -------
        float
            Average of the per-fold values returned by ``score``.
        """
        if fit_params is None:
            # Built per call so no mutable default is shared between calls.
            fit_params = {"early_stopping_rounds": 50, "verbose": 100, "eval_metric": "rmse"}

        model.FI = pd.DataFrame(index=features)
        full_score = 0

        if prepare_stacking:
            test[name] = 0.0
            train[name] = np.nan
            name_pos = train.columns.get_loc(name)

        for fold_id, (trn, val) in enumerate(self.fold_ids):
            devel = train[features].iloc[trn]
            y_devel = np.log1p(train["totals.transactionRevenue"].iloc[trn])
            valid = train[features].iloc[val]
            y_valid = np.log1p(train["totals.transactionRevenue"].iloc[val])

            print("Fold ", fold_id, ":")
            model.fit(devel, y_devel, eval_set=[(valid, y_valid)], **fit_params)

            if len(model.feature_importances_) == len(features):  # some bugs in catboost?
                model.FI['fold' + str(fold_id)] = model.feature_importances_ / model.feature_importances_.sum()

            predictions = model.predict(valid)
            # The target is log1p of a non-negative revenue, so clip at zero.
            predictions[predictions < 0] = 0
            print("Fold ", fold_id, " error: ", mean_squared_error(y_valid, predictions)**0.5)

            fold_score = score(train.iloc[val], predictions)
            full_score += fold_score / len(self.fold_ids)
            print("Fold ", fold_id, " score: ", fold_score)

            if prepare_stacking:
                # Single positional assignment; the chained form
                # train[name].iloc[val] = ... can write to a temporary copy.
                train.iloc[val, name_pos] = predictions

                test_predictions = model.predict(test[features])
                test_predictions[test_predictions < 0] = 0
                test[name] += test_predictions / len(self.fold_ids)

        print("Final score: ", full_score)
        return full_score

In [263]:
# Build the visitor-grouped folds (default 5 splits) over the session-level
# training frame.
Kfolder = KFoldValidation(train)

In [None]:
# Session-level LightGBM regressor.
# NOTE(review): `lgb` is not imported in the visible cells - presumably
# `import lightgbm as lgb`; confirm the import exists before running.
# `bagging_freq` replaces the unrecognised keyword `bagging_frequency`, which
# left bagging disabled.
# NOTE(review): subsample / colsample_bytree look like aliases of
# bagging_fraction / feature_fraction with different values - confirm which
# values are intended.
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, objective="regression", metric="rmse", num_leaves=31, min_child_samples=100,
                      learning_rate=0.03, bagging_fraction=0.7, feature_fraction=0.5, bagging_freq=5,
                      bagging_seed=2019, subsample=.66, colsample_bytree=.66)

In [None]:
# Cross-validate the session-level model; with prepare_stacking=True the
# out-of-fold / fold-averaged predictions are stored in the "lgbpred" column of
# train / test.
Kfolder.validate(train, test, real_cols + cat_cols, lgbmodel, "lgbpred", prepare_stacking=True)

In [None]:
def create_user_df(df):
    """Aggregate a session-level frame into one row per ``fullVisitorId``.

    The result contains:

    * the per-visitor mean of every column in the module-level ``real_cols``
      and ``cat_cols`` lists,
    * ``pred_0 .. pred_k``: the visitor's session predictions (``lgbpred``) in
      row order, NaN where the visitor has fewer sessions,
    * ``t_mean``, ``t_median``, ``t_sum_log``, ``t_sum_act``: statistics over
      the ``pred_*`` columns only,
    * ``t_nb_sess``: the number of missing ``pred_*`` entries of the visitor,
    * ``fullVisitorId``: a copy of the index, added as the last column.

    Every column except ``fullVisitorId`` is cast to float.
    """
    agg_data = df[real_cols + cat_cols + ['fullVisitorId']].groupby('fullVisitorId').mean()

    # One dict per visitor: {'pred_0': first prediction, 'pred_1': second, ...}
    pred_lists = df.groupby('fullVisitorId')['lgbpred'].apply(list)
    pred_dicts = [{'pred_' + str(i): pred for i, pred in enumerate(preds)}
                  for preds in pred_lists]
    all_predictions = pd.DataFrame(pred_dicts, index=agg_data.index)
    feats = list(all_predictions.columns)

    # Statistics are taken over the raw pred_* columns only. Computing them on
    # the growing frame would fold each new t_* column into the next statistic.
    session_preds = all_predictions[feats]
    all_predictions['t_mean'] = session_preds.mean(axis=1)
    all_predictions['t_median'] = session_preds.median(axis=1)
    all_predictions['t_sum_log'] = session_preds.sum(axis=1)
    # NOTE(review): as written this equals t_sum_log - confirm whether a sum on
    # the un-logged scale was intended.
    all_predictions['t_sum_act'] = session_preds.fillna(0).sum(axis=1)
    all_predictions['t_nb_sess'] = session_preds.isnull().sum(axis=1)

    full_data = pd.concat([agg_data, all_predictions], axis=1).astype(float)
    full_data['fullVisitorId'] = full_data.index
    del agg_data, all_predictions
    gc.collect()
    return full_data

In [None]:
# Collapse the session-level frames to one row per visitor (train first, then test).
user_train, user_test = [create_user_df(session_frame) for session_frame in (train, test)]

In [None]:
# Every column of user_train except the last one is a feature.
features = user_train.columns[:-1].tolist()
# Visitor-level target: total revenue per visitor, aligned on the
# fullVisitorId index of user_train.
visitor_revenue = train.groupby('fullVisitorId')['totals.transactionRevenue'].sum()
user_train["totals.transactionRevenue"] = visitor_revenue

In [None]:
# Give user_test every feature column of user_train; absent ones are all-NaN.
for missing_feature in [f for f in features if f not in user_test.columns]:
    user_test[missing_feature] = np.nan

In [None]:
# Visitor-level stacking: four regressors are cross-validated on the same
# folds; each writes its out-of-fold / test predictions to its own column.
# NOTE(review): `lgb`, `xgb` and `cat` are not imported in the visible cells -
# presumably lightgbm, xgboost and catboost; confirm the imports exist.
Kfolder = KFoldValidation(user_train)

# `bagging_freq` replaces the unrecognised keyword `bagging_frequency`, which
# left bagging disabled.
lgbmodel = lgb.LGBMRegressor(n_estimators=1000, objective="regression", metric="rmse", num_leaves=31, min_child_samples=100,
                      learning_rate=0.03, bagging_fraction=0.7, feature_fraction=0.5, bagging_freq=5,
                      bagging_seed=2019, subsample=.9, colsample_bytree=.9,
                             )
Kfolder.validate(user_train, user_test, features, lgbmodel, name="lgbfinal", prepare_stacking=True)

xgbmodel = xgb.XGBRegressor(max_depth=10, learning_rate=0.05, n_estimators=600,
                                         objective='reg:linear', gamma=1.45, seed=2019, silent=False,
                                        subsample=0.67, colsample_bytree=0.54, colsample_bylevel=0.50)
Kfolder.validate(user_train, user_test, features, xgbmodel, name="xgbfinal", prepare_stacking=True)

catmodel = cat.CatBoostRegressor(iterations=500, learning_rate=0.2, depth=5, random_seed=2019)
Kfolder.validate(user_train, user_test, features, catmodel, name="catfinal", prepare_stacking=True,
                fit_params={"use_best_model": True, "verbose": 100})

catmodel2 = cat.CatBoostRegressor(iterations=600, learning_rate=0.04, depth=7, random_seed=2019)
# Validate catmodel2 itself; previously `catmodel` was passed again, so
# "catfinal2" was produced by the first CatBoost configuration.
Kfolder.validate(user_train, user_test, features, catmodel2, name="catfinal2", prepare_stacking=True,
                fit_params={"use_best_model": True, "verbose": 100})

gc.collect()

In [None]:
# Weighted blend of the four out-of-fold prediction columns, followed by the
# visitor-level score of the blend on the training visitors.
train_blend_weights = [("lgbfinal", 0.3), ("xgbfinal", 0.3), ("catfinal", 0.2), ("catfinal2", 0.2)]
user_train['PredictedLogRevenue'] = sum(
    weight * user_train[pred_col] for pred_col, weight in train_blend_weights
)
score(user_train, user_train.PredictedLogRevenue)

In [None]:
# Blend the test predictions (terms added in the same order as before) and
# write the result, indexed by visitor, to the submission file.
test_blend_weights = [("lgbfinal", 0.3), ("catfinal", 0.2), ("catfinal2", 0.2), ("xgbfinal", 0.3)]
user_test['PredictedLogRevenue'] = sum(
    weight * user_test[pred_col] for pred_col, weight in test_blend_weights
)
user_test[['PredictedLogRevenue']].to_csv('submission.csv', index=True)