In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import random
from random import randrange
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

In [2]:


def column_type(dataFrame):
    """Group the column names of ``dataFrame`` by dtype.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    dict
        One key per distinct dtype in ``dataFrame.dtypes``; each value is the
        list of column names with that dtype, in column order.
    """
    columns_by_dtype = {}
    for dtype in np.array(dataFrame.dtypes.unique()):
        columns_by_dtype[dtype] = []

    for name in dataFrame.columns:
        columns_by_dtype[dataFrame[name].dtypes].append(name)

    return columns_by_dtype


def column_counts(dataFrame, threshold):
    """Report low-cardinality columns, grouped by dtype.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : int
        A column is reported when its number of distinct values (as returned
        by ``Series.unique``) is less than or equal to this value.

    Returns
    -------
    dict
        ``{dtype: {column_name: n_distinct}}`` with one outer key per distinct
        dtype in the frame; dtypes with no qualifying column map to ``{}``.
    """
    low_cardinality = {}
    for dtype in np.array(dataFrame.dtypes.unique()):
        low_cardinality[dtype] = {}

    for name in dataFrame.columns:
        series = dataFrame[name]
        n_distinct = len(series.unique())
        if n_distinct > threshold:
            continue
        low_cardinality[series.dtype][name] = n_distinct

    return low_cardinality


def value_count_preprocess(dataFrame, column, remove=False, reference=None):
    """Count-encode ``column`` by adding a ``<column>_counts`` column.

    Each row receives the number of times its ``column`` value occurs in the
    frame the counts are taken from. The input frame is not modified, and the
    returned frame keeps the input's index and row order.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to encode.
    column : str
        Name of the column to encode.
    remove : bool, default False
        When true, drop ``column`` from the returned frame.
    reference : pandas.DataFrame, optional
        Frame whose ``column`` supplies the counts. Defaults to ``dataFrame``
        itself. Pass the training frame when encoding validation/test data so
        the same value maps to the same count everywhere.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``<column>_counts`` appended. Values that do not
        appear in the count source (including missing values) get NaN.
    """
    source = dataFrame if reference is None else reference
    counts = source[column].value_counts()

    # map() keeps the caller's index; the earlier merge-based version returned
    # a frame with a fresh RangeIndex.
    encoded = dataFrame.assign(**{column + '_counts': dataFrame[column].map(counts)})

    if remove:
        encoded = encoded.drop(columns=[column])
    return encoded


def get_dummies_preprocess(dataFrame, column, remove = False):
    """Append indicator columns for ``column`` built with ``pd.get_dummies``.

    The first level is dropped (``drop_first=True``) and the new columns are
    named after the raw values of ``column`` (no prefix is added).

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to extend; it is not modified.
    column : label
        Column to expand.
    remove : bool, default False
        When true, drop ``column`` from the returned frame.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by the indicator columns.
    """
    indicator_columns = pd.get_dummies(dataFrame[column], drop_first=True)
    widened = pd.concat([dataFrame, indicator_columns], axis=1)
    return widened.drop(columns=[column]) if remove else widened


def date_time_conversion(dataFrame, column, year = 0, month = 0, day = 0, weekday = 0, unit = None ,errors = None , format = None):
    """Parse ``column`` as datetimes in place and optionally add calendar parts.

    The frame passed in is mutated (callers in this notebook rely on that) and
    is also returned.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to modify.
    column : str
        Column to convert with ``pd.to_datetime``.
    year, month, day, weekday : truthy/falsy, default 0
        When truthy, add ``<column>_year`` / ``_month`` / ``_day`` /
        ``_weekday`` (Monday == 0) columns.
    unit : str, optional
        Passed to ``pd.to_datetime`` (e.g. ``'s'`` for epoch seconds).
    errors : {'raise', 'coerce', 'ignore'}, optional
        ``None`` is treated as ``'raise'``, the ``pd.to_datetime`` default.
        ``'ignore'`` leaves the column unchanged when parsing fails; it is
        emulated here because recent pandas no longer accepts it.
    format : str, optional
        ``strftime`` pattern passed to ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataFrame`` object.

    Raises
    ------
    ValueError
        When parsing fails and ``errors`` is ``'raise'`` (or ``None``).
    AttributeError
        When a calendar part is requested but the column did not end up with
        a datetime dtype (possible with ``errors='ignore'``).
    """
    mode = 'raise' if errors is None else errors

    if mode == 'ignore':
        # Emulate the deprecated errors='ignore': keep the input on failure.
        try:
            dataFrame[column] = pd.to_datetime(dataFrame[column], format=format, errors='raise', unit=unit)
        except (ValueError, TypeError):
            pass
    else:
        dataFrame[column] = pd.to_datetime(dataFrame[column], format=format, errors=mode, unit=unit)

    if year:
        dataFrame[column + '_year'] = dataFrame[column].dt.year

    if month:
        dataFrame[column + '_month'] = dataFrame[column].dt.month

    if day:
        dataFrame[column + '_day'] = dataFrame[column].dt.day

    if weekday:
        dataFrame[column + '_weekday'] = dataFrame[column].dt.weekday

    return dataFrame



def random_under_sample(dataFrame, column, sample_size, random_state=None):
    """Keep every ``column == 1`` row plus a random subset of ``column == 0`` rows.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to under-sample; it is not modified.
    column : label
        Binary indicator column; rows equal to 0 are sampled, rows equal to 1
        are all kept.
    sample_size : int
        Number of ``column == 0`` rows to keep. Capped at the number of such
        rows available.
    random_state : int, optional
        Seed for a private ``random.Random``. When omitted, the module-level
        ``random`` generator is used, so ``random.seed`` still controls the
        draw.

    Returns
    -------
    pandas.DataFrame
        The ``== 1`` rows followed by the sampled ``== 0`` rows.
    """
    majority = dataFrame[dataFrame[column] == 0]
    minority = dataFrame[dataFrame[column] == 1]

    n_keep = int(min(sample_size, len(majority)))
    rng = random if random_state is None else random.Random(random_state)

    # Sample row positions rather than index labels, so duplicate labels in
    # the index cannot pull in extra rows.
    positions = rng.sample(range(len(majority)), n_keep)

    return pd.concat([minority, majority.iloc[positions]])




def five_fold_generator(array_length, seed=None):
    """Return a shuffled array of fold labels 1..5 of length ``array_length``.

    Fold sizes differ by at most one; when ``array_length`` is not a multiple
    of five the leftover rows are spread over folds 1, 2, ... rather than all
    being put in fold 1.

    Parameters
    ----------
    array_length : int
        Number of labels to generate.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState``. When omitted, the
        global NumPy random state is used.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``array_length`` with values in 1..5.
    """
    labels = np.arange(array_length) % 5 + 1

    if seed is None:
        return np.random.permutation(labels)
    return np.random.RandomState(seed).permutation(labels)










def prediction_df(dataFrame,column_id,scores,predicted_scores):
    """Aggregate true and predicted values per id and add their ``log1p``.

    ``predicted_scores`` are mapped back with ``expm1`` before summing, then
    both sums are passed through ``log1p`` again.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame holding the ``column_id`` and ``scores`` columns.
    column_id : str
        Column to group by.
    scores : str
        Column with the true values to sum per id.
    predicted_scores : array-like
        One prediction per row of ``dataFrame``, in row order.

    Returns
    -------
    pandas.DataFrame
        Columns: ``column_id``, ``scores``, ``'prediction'``,
        ``'log_' + scores`` and ``'log_prediction'``; one row per id.
    """
    # Take raw values so rows are matched by position, whatever index the
    # caller's frame (or prediction Series) carries.
    df_pred = pd.DataFrame({
        column_id: dataFrame[column_id].values,
        scores: dataFrame[scores].values,
        'prediction': np.expm1(np.asarray(predicted_scores)),
    })

    # A list selects several columns on a groupby; tuple selection is no
    # longer supported by pandas.
    df_pred = df_pred.groupby(column_id)[[scores, 'prediction']].sum().reset_index()

    df_pred['log_' + scores] = np.log1p(df_pred[scores].values)
    df_pred['log_prediction'] = np.log1p(df_pred['prediction'].values)

    return df_pred
        
    
    

### Functions for out of sample model stacking

In [147]:
def out_of_sample_pred_classifier(dataFrame,model,predictor,sample_size=None):
    """Produce out-of-fold classifier scores over folds 1..5.

    For each fold the model is fitted on all other folds (optionally
    under-sampled on ``'is_transaction'``) and ``predict_proba`` is evaluated
    on the held-out fold. Only the first ``predict_proba`` column is kept.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain a ``'folds'`` column with labels 1..5 and the
        ``predictor`` column.
    model : estimator
        Object with ``fit(X, y)`` and ``predict_proba(X)``.
    predictor : label
        Target column name.
    sample_size : int, optional
        When truthy, each training split is passed through
        ``random_under_sample(..., 'is_transaction', sample_size)``.

    Returns
    -------
    (numpy.ndarray, estimator)
        The scores concatenated fold by fold (all fold-1 rows, then fold-2,
        ...), which is not the row order of ``dataFrame``; and the same
        ``model`` object after its last ``fit`` call (fold 5 held out).
    """
    excluded = ('is_transaction', 'totals.transactionRevenue',
                'transactionRevenue_log', 'folds')
    features = [name for name in dataFrame.columns if name not in excluded]

    first_column_scores = []

    for fold in range(1, 6):
        held_out = dataFrame['folds'] == fold
        fit_rows = dataFrame[~held_out]
        score_rows = dataFrame[held_out]

        if sample_size:
            fit_rows = random_under_sample(fit_rows, 'is_transaction', sample_size)

        model.fit(fit_rows[features], fit_rows[predictor])

        for row in model.predict_proba(score_rows[features]):
            first_column_scores.append(row[0])

    return np.array(first_column_scores), model

In [148]:
def out_of_sample_pred_regressor(dataFrame,model,predictor,sample_size=None):
    """Produce out-of-fold regression predictions over folds 1..5.

    For each fold the model is fitted on all other folds (optionally
    under-sampled on ``'is_transaction'``) and ``predict`` is evaluated on the
    held-out fold.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain a ``'folds'`` column with labels 1..5 and the
        ``predictor`` column.
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``.
    predictor : label
        Target column name.
    sample_size : int, optional
        When truthy, each training split is passed through
        ``random_under_sample(..., 'is_transaction', sample_size)``.

    Returns
    -------
    (numpy.ndarray, estimator)
        Predictions concatenated fold by fold (fold 1 first), which is not the
        row order of ``dataFrame``; and the same ``model`` object after its
        last ``fit`` call (fold 5 held out).
    """
    not_inputs = ('is_transaction', 'totals.transactionRevenue',
                  'transactionRevenue_log', 'folds')
    features = [name for name in dataFrame.columns if name not in not_inputs]

    collected = []

    for fold in range(1, 6):
        is_held_out = dataFrame['folds'] == fold
        training_rows = dataFrame[~is_held_out]
        held_out_rows = dataFrame[is_held_out]

        if sample_size:
            training_rows = random_under_sample(training_rows, 'is_transaction', sample_size)

        model.fit(training_rows[features], training_rows[predictor])
        collected.extend(model.predict(held_out_rows[features]))

    return np.array(collected), model

In [157]:
def get_predict_proba(dataFrame,model):
    """Return the first column of ``model.predict_proba(dataFrame)``.

    Parameters
    ----------
    dataFrame : array-like
        Inputs passed unchanged to ``model.predict_proba``.
    model : estimator
        Object exposing ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        One value per input row: element 0 of each probability row.
        NOTE(review): for a binary classifier with classes ordered (0, 1) this
        would be P(class 0), not P(class 1) — confirm that is intended.
    """
    class_probabilities = model.predict_proba(dataFrame)

    first_column = []
    for row in class_probabilities:
        first_column.append(row[0])

    return np.array(first_column)
    
    

In [32]:
def true_predictor_values(dataFrame,predictor):
    """Return the ``predictor`` values ordered fold by fold (1..5).

    The ordering matches what the out-of-sample prediction helpers in this
    notebook produce, so the result lines up with their output row for row.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain a ``'folds'`` column with labels 1..5 and ``predictor``.
    predictor : label
        Column whose values are returned.

    Returns
    -------
    numpy.ndarray
        Values of fold 1 (in frame order), then fold 2, ... through fold 5.
    """
    # Read from the argument: the previous body used the notebook-global
    # `train` and silently ignored `dataFrame`.
    per_fold = [
        dataFrame.loc[dataFrame['folds'] == fold, predictor].values
        for fold in (1, 2, 3, 4, 5)
    ]
    return np.concatenate(per_fold)

### Functions for testing purposes only

In [144]:
def out_of_sample_val_test(dataFrame,predictor):
    """Collect the ``predictor`` values of ``dataFrame`` fold by fold (1..5).

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain a ``'folds'`` column with labels 1..5 and ``predictor``.
    predictor : label
        Column whose values are returned.

    Returns
    -------
    numpy.ndarray
        Values of fold 1 (in frame order), followed by folds 2 to 5.
    """
    vals = []

    for fold in [1,2,3,4,5]:
        # Use the frame that was passed in; the previous body read the
        # notebook-global `train` regardless of the argument.
        held_out = dataFrame[dataFrame['folds'] == fold]
        vals.extend(held_out[predictor])

    return np.array(vals)
        
        
        
        

In [5]:
def out_of_sample_pred_test(dataFrame,model,predictor,sample_size=None):
    """Verbose out-of-fold loop returning predictions and matching true values.

    For each fold 1..5 the model is fitted on the other folds (optionally
    under-sampled on ``'is_transaction'``) and ``predict`` is evaluated on the
    held-out fold. Progress messages are printed along the way.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain a ``'folds'`` column with labels 1..5 and ``predictor``.
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``.
    predictor : label
        Target column name.
    sample_size : int, optional
        When truthy, each training split is passed through
        ``random_under_sample(..., 'is_transaction', sample_size)``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Predictions and true ``predictor`` values, both concatenated fold by
        fold (fold 1 first).
    """
    # 'folds' is bookkeeping, not a model input; exclude it just as the
    # classifier/regressor out-of-sample helpers do.
    non_features = ('is_transaction', 'totals.transactionRevenue',
                    'transactionRevenue_log', 'folds')
    features = [column for column in dataFrame.columns if column not in non_features]

    predictions = []
    scores = []

    for fold in [1,2,3,4,5]:

        print('fold no '+ str(fold))

        in_test = dataFrame['folds'] == fold
        train_fold = dataFrame[~in_test]
        test_fold = dataFrame[in_test]

        print('train and test created')

        if sample_size:
            train_fold = random_under_sample(train_fold,'is_transaction',sample_size)

        X_fold = train_fold[features]
        y_fold = train_fold[predictor]

        print('arrays created')

        X_test_fold = test_fold[features]

        print('model fitting')

        model.fit(X_fold,y_fold)

        print('finished model fit')

        predictions.extend(model.predict(X_test_fold))
        scores.extend(test_fold[predictor])

        print('------------------')

    return np.array(predictions),np.array(scores)

### required 

In [6]:
# Read fullVisitorId as text: the ids are long digit strings, and numeric
# parsing drops any leading zero.
# NOTE(review): user-specific absolute path; consider a configurable data directory.
train_master = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/train_flattened.csv',
                           dtype={'fullVisitorId': str})

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Read fullVisitorId as text so the id is never reinterpreted as a number
# (the sessionId prefix in this file shows ids that start with 0).
# NOTE(review): user-specific absolute path; consider a configurable data directory.
val_master = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/val.csv',
                         dtype={'fullVisitorId': str})

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Read fullVisitorId as text: numeric parsing drops leading zeros (the
# sessionId prefix shows ids such as 0643697640977915618).
# NOTE(review): user-specific absolute path; consider a configurable data directory.
test_master = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/test_flattened.csv',
                          dtype={'fullVisitorId': str})

  interactivity=interactivity, compiler=compiler, result=result)


### analysis

In [10]:
train_master.head()

Unnamed: 0.1,Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,...,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,...,,,,(not set),,,(not provided),organic,,google
1,1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,...,,,,(not set),,,(not provided),organic,,google
2,2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,...,,,,(not set),,,(not provided),organic,,google
3,3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,...,,,,(not set),,,google + online,organic,,google
4,4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,...,,,,(not set),,True,(not provided),organic,,google


In [11]:
test_master.head()

Unnamed: 0.1,Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,0,Organic Search,20171016,6167871330617112363,6167871330617112363_1508151024,Not Socially Engaged,1508151024,2,1508151024,Chrome,...,,,,,(not set),True,(not provided),organic,,google
1,1,Organic Search,20171016,643697640977915618,0643697640977915618_1508175522,Not Socially Engaged,1508175522,1,1508175522,Chrome,...,,,,,(not set),,(not provided),organic,,google
2,2,Organic Search,20171016,6059383810968229466,6059383810968229466_1508143220,Not Socially Engaged,1508143220,1,1508143220,Chrome,...,,,,,(not set),,(not provided),organic,,google
3,3,Organic Search,20171016,2376720078563423631,2376720078563423631_1508193530,Not Socially Engaged,1508193530,1,1508193530,Safari,...,,,,,(not set),,(not provided),organic,,google
4,4,Organic Search,20171016,2314544520795440038,2314544520795440038_1508217442,Not Socially Engaged,1508217442,1,1508217442,Safari,...,,,,,(not set),,(not provided),organic,,google


In [12]:
val_master.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,...,trafficSource.referralPath,trafficSource.source,date_year,date_month,date_day,date_weekday,visitStartTime_year,visitStartTime_month,visitStartTime_day,visitStartTime_weekday
0,317493,317493,Referral,2017-03-31 00:00:00,7734592676774791491,7734592676774791491_1490975679,Not Socially Engaged,1490975679,2,2017-03-31 15:54:39,...,/a/google.com/googletopia/discounts-deals-and-...,sites.google.com,2017,3,31,4,2017,3,31,4
1,317492,317492,Direct,2017-03-31 00:00:00,3805349545303466877,3805349545303466877_1491024939,Not Socially Engaged,1491024939,1,2017-04-01 05:35:39,...,,(direct),2017,3,31,4,2017,4,1,5
2,317491,317491,Direct,2017-03-31 00:00:00,4762257881620848154,4762257881620848154_1490987424,Not Socially Engaged,1490987424,1,2017-03-31 19:10:24,...,,(direct),2017,3,31,4,2017,3,31,4
3,317490,317490,Referral,2017-03-31 00:00:00,5110272178647833680,5110272178647833680_1490983609,Not Socially Engaged,1490983609,1,2017-03-31 18:06:49,...,/analytics/web/,analytics.google.com,2017,3,31,4,2017,3,31,4
4,317489,317489,Referral,2017-03-31 00:00:00,900160777584512213,0900160777584512213_1491023872,Not Socially Engaged,1491023872,1,2017-04-01 05:17:52,...,/permissions/using-the-logo.html,google.co.jp,2017,3,31,4,2017,4,1,5


## Preprocessing date and time features

### required

In [9]:
# Replace missing revenue with 0. Assign the result back: calling
# fillna(inplace=True) on a column selection does not reliably update the
# parent frame.
train_master["totals.transactionRevenue"] = train_master["totals.transactionRevenue"].fillna(0)
val_master["totals.transactionRevenue"] = val_master["totals.transactionRevenue"].fillna(0)

In [10]:
# Parse the date columns of the training frame and add year/month/day/weekday parts.
# errors='raise' replaces the deprecated errors='ignore': a column left unparsed
# would fail in the calendar-part extraction anyway, so this only makes the
# failure explicit.
train_master = date_time_conversion(train_master, column='date', year=1, month=1, day=1, weekday=1, unit=None, errors='raise', format='%Y%m%d')
train_master = date_time_conversion(train_master, column='visitStartTime', year=1, month=1, day=1, weekday=1, unit='s', errors='raise')
train_master.head(10)

Unnamed: 0.1,Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,...,trafficSource.referralPath,trafficSource.source,date_year,date_month,date_day,date_weekday,visitStartTime_year,visitStartTime_month,visitStartTime_day,visitStartTime_weekday
0,0,Organic Search,2016-09-02,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,2016-09-02 15:33:05,Chrome,...,,google,2016,9,2,4,2016,9,2,4
1,1,Organic Search,2016-09-02,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,2016-09-03 05:22:27,Firefox,...,,google,2016,9,2,4,2016,9,3,5
2,2,Organic Search,2016-09-02,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,2016-09-03 01:16:26,Chrome,...,,google,2016,9,2,4,2016,9,3,5
3,3,Organic Search,2016-09-02,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,2016-09-03 05:40:13,UC Browser,...,,google,2016,9,2,4,2016,9,3,5
4,4,Organic Search,2016-09-02,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,2016-09-02 13:23:20,Chrome,...,,google,2016,9,2,4,2016,9,2,4
5,5,Organic Search,2016-09-02,2938943183656635653,2938943183656635653_1472807194,Not Socially Engaged,1472807194,1,2016-09-02 09:06:34,Chrome,...,,google,2016,9,2,4,2016,9,2,4
6,6,Organic Search,2016-09-02,1905672039242460897,1905672039242460897_1472817241,Not Socially Engaged,1472817241,1,2016-09-02 11:54:01,Chrome,...,,google,2016,9,2,4,2016,9,2,4
7,7,Organic Search,2016-09-02,537222803633850821,537222803633850821_1472812602,Not Socially Engaged,1472812602,1,2016-09-02 10:36:42,Chrome,...,,google,2016,9,2,4,2016,9,2,4
8,8,Organic Search,2016-09-02,4445454811831400414,4445454811831400414_1472805784,Not Socially Engaged,1472805784,1,2016-09-02 08:43:04,Internet Explorer,...,,google,2016,9,2,4,2016,9,2,4
9,9,Organic Search,2016-09-02,9499785259412240342,9499785259412240342_1472812272,Not Socially Engaged,1472812272,1,2016-09-02 10:31:12,Firefox,...,,google,2016,9,2,4,2016,9,2,4


In [11]:
# Parse the date columns of the test frame and add year/month/day/weekday parts.
# errors='raise' replaces the deprecated errors='ignore': a column left unparsed
# would fail in the calendar-part extraction anyway, so this only makes the
# failure explicit.
test_master = date_time_conversion(test_master, column='date', year=1, month=1, day=1, weekday=1, unit=None, errors='raise', format='%Y%m%d')
test_master = date_time_conversion(test_master, column='visitStartTime', year=1, month=1, day=1, weekday=1, unit='s', errors='raise')
test_master.head(10)

Unnamed: 0.1,Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,...,trafficSource.referralPath,trafficSource.source,date_year,date_month,date_day,date_weekday,visitStartTime_year,visitStartTime_month,visitStartTime_day,visitStartTime_weekday
0,0,Organic Search,2017-10-16,6167871330617112363,6167871330617112363_1508151024,Not Socially Engaged,1508151024,2,2017-10-16 10:50:24,Chrome,...,,google,2017,10,16,0,2017,10,16,0
1,1,Organic Search,2017-10-16,0643697640977915618,0643697640977915618_1508175522,Not Socially Engaged,1508175522,1,2017-10-16 17:38:42,Chrome,...,,google,2017,10,16,0,2017,10,16,0
2,2,Organic Search,2017-10-16,6059383810968229466,6059383810968229466_1508143220,Not Socially Engaged,1508143220,1,2017-10-16 08:40:20,Chrome,...,,google,2017,10,16,0,2017,10,16,0
3,3,Organic Search,2017-10-16,2376720078563423631,2376720078563423631_1508193530,Not Socially Engaged,1508193530,1,2017-10-16 22:38:50,Safari,...,,google,2017,10,16,0,2017,10,16,0
4,4,Organic Search,2017-10-16,2314544520795440038,2314544520795440038_1508217442,Not Socially Engaged,1508217442,1,2017-10-17 05:17:22,Safari,...,,google,2017,10,16,0,2017,10,17,1
5,5,Organic Search,2017-10-16,4133039884103392367,4133039884103392367_1508186358,Not Socially Engaged,1508186358,1,2017-10-16 20:39:18,Chrome,...,,google,2017,10,16,0,2017,10,16,0
6,6,Organic Search,2017-10-16,4320478850207397557,4320478850207397557_1508203650,Not Socially Engaged,1508203650,1,2017-10-17 01:27:30,Chrome,...,,google,2017,10,16,0,2017,10,17,1
7,7,Organic Search,2017-10-16,5876438247590157131,5876438247590157131_1508184397,Not Socially Engaged,1508184397,1,2017-10-16 20:06:37,Chrome,...,,google,2017,10,16,0,2017,10,16,0
8,8,Organic Search,2017-10-16,0514591268737702944,0514591268737702944_1508189652,Not Socially Engaged,1508189652,6,2017-10-16 21:34:12,Chrome,...,,google,2017,10,16,0,2017,10,16,0
9,9,Organic Search,2017-10-16,6430567031531677212,6430567031531677212_1508175502,Not Socially Engaged,1508175502,1,2017-10-16 17:38:22,Chrome,...,,google,2017,10,16,0,2017,10,16,0


## Target variable preprocessing

### required

In [12]:
# Convert null transaction values to 0. Assign the result back: calling
# fillna(inplace=True) on a column selection does not reliably update the
# parent frame.
train_master["totals.transactionRevenue"] = train_master["totals.transactionRevenue"].fillna(0)
val_master["totals.transactionRevenue"] = val_master["totals.transactionRevenue"].fillna(0)

In [13]:
# Make sure the revenue column is floating point in both frames.
train_master['totals.transactionRevenue'] = train_master['totals.transactionRevenue'].astype(float)
val_master['totals.transactionRevenue'] = val_master['totals.transactionRevenue'].astype(float)

In [14]:
# Binary indicator: 1 when the revenue is strictly positive, otherwise 0.
transaction_series = train_master['totals.transactionRevenue']
train_master['is_transaction'] = (transaction_series > 0).astype('int64')


val_transaction_series = val_master['totals.transactionRevenue']
val_master['is_transaction'] = (val_transaction_series > 0).astype('int64')

In [15]:
# Regression target: log1p of the revenue, added to both master frames.
for master in (train_master, val_master):
    master['transactionRevenue_log'] = np.log1p(master['totals.transactionRevenue'].values)

### analysis

In [None]:
train_master.head()

In [None]:
val_master.head()

In [None]:
test_master.head()

## Restricting Features to analyse for train, val, test

### required

In [16]:
# Columns selected from the master frames as model inputs. The date_* and
# visitStartTime_* entries are the calendar parts added by date_time_conversion.
features = ['device.browser',
 'device.operatingSystem',
 'geoNetwork.continent',
 'geoNetwork.subContinent',
 'totals.hits',
 'totals.pageviews',
 'trafficSource.medium',
 'channelGrouping',
 'visitNumber',
 'device.isMobile',
 'totals.bounces',
 'totals.newVisits',
'device.deviceCategory',
'date_month',
'date_day',
'date_weekday',
'visitStartTime_month',
'visitStartTime_day',
'visitStartTime_weekday']



# Target-related columns kept alongside the training inputs.
predictors = ['totals.transactionRevenue','is_transaction','transactionRevenue_log']

# Target-related columns kept for validation.
# NOTE(review): 'is_transaction' is computed on val_master above but is not
# listed here, so `val` has no binary target column.
predictors_val = ['totals.transactionRevenue','transactionRevenue_log']

In [17]:
# Working frames: the selected inputs, plus the target columns where they exist.
train = train_master[[*features, *predictors]]
test = test_master[[*features]]
val = val_master[[*features, *predictors_val]]

### analysis

In [None]:
train.head()

In [None]:
val.head()

In [None]:
test.head()

## Encoding variables

### required 

In [18]:
# Columns that are replaced by their value frequency (value_count_preprocess).
encoded_objects = ['device.browser','device.operatingSystem','geoNetwork.continent','geoNetwork.subContinent',
                   'trafficSource.medium','channelGrouping']

# Columns that are expanded into indicator columns (get_dummies_preprocess).
one_hot_encoded_objects = ['device.deviceCategory','device.isMobile']

In [19]:
# Count-encode each listed column in all three frames and drop the original.
# NOTE(review): each call derives the counts from the frame it is given, so
# train, val and test get their own frequencies for the same category value.
# The encoded feature is therefore not on a common scale across the frames;
# consider applying train-derived counts to val and test.
for column in encoded_objects:
    train = value_count_preprocess(train,column,remove = True)
    val = value_count_preprocess(val,column,remove = True)
    test = value_count_preprocess(test,column,remove=True)

In [20]:
# Expand each listed column into indicator columns in all three frames and
# drop the original.
# NOTE(review): get_dummies runs independently per frame with drop_first=True,
# so the resulting columns depend on the values present in each frame. The new
# columns are named after the raw values (e.g. 'mobile', 'tablet', True), with
# no prefix identifying the source column.
for column in one_hot_encoded_objects:
    train = get_dummies_preprocess(train,column,remove=True)
    val = get_dummies_preprocess(val,column,remove=True)
    test = get_dummies_preprocess(test,column,remove=True)
    
    

### analysis

In [None]:
### before encoding
train.dtypes

In [None]:
test.dtypes

In [None]:
val.dtypes

In [None]:
### after encoding
train.dtypes

In [None]:
test.dtypes

In [None]:
val.dtypes

## Encoding for model stacking 

### required

In [21]:
# Tag every training row with a random fold label in 1..5; the out-of-sample
# helpers split on this column.
array_length = len(train)
folds = five_fold_generator(array_length)
train['folds'] = folds

In [None]:
# Scratch cell: builds a two-column frame from `y` and displays it.
# NOTE(review): `y` is not defined anywhere in the visible part of this
# notebook, so this cell would raise NameError on a fresh run — confirm
# whether it can be removed.
data = {'model_1':y,'model_2': y}
df = pd.DataFrame.from_dict(data)
df

### analysis

In [34]:
train.head(10)

Unnamed: 0,totals.hits,totals.pageviews,visitNumber,totals.bounces,totals.newVisits,date_month,date_day,date_weekday,visitStartTime_month,visitStartTime_day,...,device.browser_counts,device.operatingSystem_counts,geoNetwork.continent_counts,geoNetwork.subContinent_counts,trafficSource.medium_counts,channelGrouping_counts,mobile,tablet,True,folds
0,1,1.0,1,1.0,1.0,9,2,4,9,2,...,620364,350072,223698,38443,381561,381561,0,0,0,2
1,1,1.0,1,1.0,1.0,9,2,4,9,3,...,37069,253938,15054,14893,381561,381561,0,0,0,3
2,1,1.0,1,1.0,1.0,9,2,4,9,3,...,620364,350072,198311,35780,381561,381561,0,0,0,4
3,1,1.0,1,1.0,1.0,9,2,4,9,3,...,2427,35034,223698,77800,381561,381561,0,0,0,1
4,1,1.0,2,1.0,,9,2,4,9,2,...,620364,123892,198311,58168,381561,381561,1,0,1,3
5,1,1.0,1,1.0,1.0,9,2,4,9,2,...,620364,350072,198311,35780,381561,381561,0,0,0,4
6,1,1.0,1,1.0,1.0,9,2,4,9,2,...,620364,350072,223698,59321,381561,381561,0,0,0,5
7,1,1.0,1,1.0,1.0,9,2,4,9,2,...,620364,350072,15054,14893,381561,381561,0,0,0,3
8,1,1.0,1,1.0,1.0,9,2,4,9,2,...,19375,350072,198311,59114,381561,381561,0,0,0,2
9,1,1.0,1,1.0,1.0,9,2,4,9,2,...,37069,350072,198311,59114,381561,381561,0,0,0,4


# Level 1 Base model learning 

## Building classifiers: is_transaction prediction

In [200]:
# One default-configured XGBClassifier per under-sampling size; the suffix is
# the sample_size later passed to out_of_sample_pred_classifier.
model_rus_2000 = XGBClassifier()
model_rus_5000 = XGBClassifier()
model_rus_11000 = XGBClassifier()
model_rus_25000 = XGBClassifier()
model_rus_60000 = XGBClassifier()
model_rus_200000 = XGBClassifier()

### Random under sampling 2000

In [201]:
# Out-of-fold classifier scores; each training split keeps 2000 sampled is_transaction == 0 rows.
y_rus_2000,model_rus_2000 = out_of_sample_pred_classifier(train,model_rus_2000,'is_transaction',2000)

### Random under sampling 5000

In [202]:
# Out-of-fold classifier scores; each training split keeps 5000 sampled is_transaction == 0 rows.
y_rus_5000,model_rus_5000 = out_of_sample_pred_classifier(train,model_rus_5000,'is_transaction',5000)

### Random under sampling 11000

In [203]:
# Out-of-fold classifier scores; each training split keeps 11000 sampled is_transaction == 0 rows.
y_rus_11000,model_rus_11000 = out_of_sample_pred_classifier(train,model_rus_11000,'is_transaction',11000)

### Random under sampling 25000

In [204]:
# Out-of-fold classifier scores; each training split keeps 25000 sampled is_transaction == 0 rows.
y_rus_25000,model_rus_25000 =  out_of_sample_pred_classifier(train,model_rus_25000,'is_transaction',25000)

### Random under sampling 60000

In [205]:
# Out-of-fold classifier scores; each training split keeps 60000 sampled is_transaction == 0 rows.
y_rus_60000,model_rus_60000 = out_of_sample_pred_classifier(train,model_rus_60000,'is_transaction',60000)

### Random under sampling 200000

In [206]:
# Out-of-fold classifier scores; each training split keeps 200000 sampled is_transaction == 0 rows.
y_rus_200000,model_rus_200000 = out_of_sample_pred_classifier(train,model_rus_200000,'is_transaction',200000)

## Building Regressors: transactionRevenue_log prediction 

In [207]:
# One default-configured XGBRegressor per under-sampling size; the number in
# the name is the sample_size later passed to out_of_sample_pred_regressor.
model_rus_2000_R = XGBRegressor()
model_rus_5000_R = XGBRegressor()
model_rus_11000_R = XGBRegressor()
model_rus_25000_R = XGBRegressor()
model_rus_60000_R = XGBRegressor()
model_rus_200000_R = XGBRegressor()

### Random under sampling 2000

In [208]:
# Out-of-fold predictions of transactionRevenue_log; each training split keeps 2000 sampled is_transaction == 0 rows.
y_rus_2000_R, model_rus_2000_R = out_of_sample_pred_regressor(train,model_rus_2000_R,'transactionRevenue_log',2000)

### Random under sampling 5000

In [209]:
# Out-of-fold predictions of transactionRevenue_log; each training split keeps 5000 sampled is_transaction == 0 rows.
y_rus_5000_R, model_rus_5000_R = out_of_sample_pred_regressor(train,model_rus_5000_R,'transactionRevenue_log',5000)

### Random under sampling 11000

In [210]:
# Out-of-fold predictions of transactionRevenue_log; each training split keeps 11000 sampled is_transaction == 0 rows.
y_rus_11000_R, model_rus_11000_R = out_of_sample_pred_regressor(train,model_rus_11000_R,'transactionRevenue_log',11000)

### Random under sampling 25000

In [211]:
# Out-of-fold predictions of transactionRevenue_log; each training split keeps 25000 sampled is_transaction == 0 rows.
y_rus_25000_R, model_rus_25000_R = out_of_sample_pred_regressor(train,model_rus_25000_R,'transactionRevenue_log',25000)

### Random under sampling 60000

In [212]:
# Out-of-fold predictions of transactionRevenue_log; each training split keeps 60000 sampled is_transaction == 0 rows.
y_rus_60000_R, model_rus_60000_R = out_of_sample_pred_regressor(train,model_rus_60000_R,'transactionRevenue_log',60000)

### Random under sampling 200000

In [213]:
# Out-of-fold predictions of transactionRevenue_log; each training split keeps 200000 sampled is_transaction == 0 rows.
y_rus_200000_R, model_rus_200000_R = out_of_sample_pred_regressor(train,model_rus_200000_R,'transactionRevenue_log',200000)

# Level 2 Building a Meta-learner 

### constructing the dataset

### required 

In [214]:
# Level-2 inputs: one column per base model. 'rus_C_*' are the classifier
# scores and 'rus_R_*' the regressor predictions. All arrays come from the
# out-of-sample helpers, so they share the same fold-by-fold row order.
meta_data = {'rus_C_2000':y_rus_2000, 'rus_C_5000':y_rus_5000, 'rus_C_11000': y_rus_11000, 'rus_C_25000':y_rus_25000,

             'rus_C_60000':y_rus_60000, 'rus_C_200000': y_rus_200000,

             'rus_R_2000':y_rus_2000_R, 'rus_R_5000':y_rus_5000_R, 'rus_R_11000':y_rus_11000_R, 

             'rus_R_25000':y_rus_25000_R, 'rus_R_60000':y_rus_60000_R, 'rus_R_200000':y_rus_200000_R


            }


In [215]:
# One row per training row (in fold order), one column per base model.
meta_data_df = pd.DataFrame(meta_data)

In [216]:
meta_data_df.head()

Unnamed: 0,rus_C_11000,rus_C_2000,rus_C_200000,rus_C_25000,rus_C_5000,rus_C_60000,rus_R_11000,rus_R_2000,rus_R_200000,rus_R_25000,rus_R_5000,rus_R_60000
0,0.99953,0.999136,0.999879,0.999684,0.999069,0.999778,-0.315633,-0.285802,-0.007039,-0.058256,-0.397565,-0.030023
1,0.999721,0.999262,0.999929,0.999831,0.999573,0.999868,-0.265827,-0.335796,-0.020634,-0.076748,-0.411805,-0.032236
2,0.999706,0.999153,0.999924,0.999816,0.99956,0.999867,-0.279865,-0.188584,-0.007583,-0.034536,-0.288609,-0.041321
3,0.999789,0.999314,0.999932,0.999831,0.999661,0.999904,-0.265827,-0.460753,-0.020634,-0.076748,-0.411805,-0.032236
4,0.999785,0.999355,0.999931,0.999834,0.999597,0.999889,-0.265904,-0.204725,-0.005467,-0.003232,-0.22773,0.018926


In [217]:
# True transactionRevenue_log values collected fold by fold (1..5), the same
# order in which the out-of-sample helpers concatenate their predictions.
predictions = true_predictor_values(train,'transactionRevenue_log')

In [218]:
# Despite its name, the 'predictions' column holds the true target values
# used to train the meta-learner.
meta_data_df['predictions'] = predictions

In [219]:
# Preview only: requesting 110000 rows asks the notebook to render a huge frame.
meta_data_df.head(10)

Unnamed: 0,rus_C_11000,rus_C_2000,rus_C_200000,rus_C_25000,rus_C_5000,rus_C_60000,rus_R_11000,rus_R_2000,rus_R_200000,rus_R_25000,rus_R_5000,rus_R_60000,predictions
0,0.999530,0.999136,0.999879,0.999684,0.999069,0.999778,-0.315633,-0.285802,-0.007039,-0.058256,-0.397565,-0.030023,0.000000
1,0.999721,0.999262,0.999929,0.999831,0.999573,0.999868,-0.265827,-0.335796,-0.020634,-0.076748,-0.411805,-0.032236,0.000000
2,0.999706,0.999153,0.999924,0.999816,0.999560,0.999867,-0.279865,-0.188584,-0.007583,-0.034536,-0.288609,-0.041321,0.000000
3,0.999789,0.999314,0.999932,0.999831,0.999661,0.999904,-0.265827,-0.460753,-0.020634,-0.076748,-0.411805,-0.032236,0.000000
4,0.999785,0.999355,0.999931,0.999834,0.999597,0.999889,-0.265904,-0.204725,-0.005467,-0.003232,-0.227730,0.018926,0.000000
5,0.999785,0.999355,0.999931,0.999834,0.999597,0.999889,-0.265904,-0.204725,-0.005467,-0.003232,-0.227730,0.018926,0.000000
6,0.999789,0.999263,0.999932,0.999831,0.999620,0.999904,-0.265827,-0.562954,-0.020634,-0.076748,-0.411805,-0.032236,0.000000
7,0.999795,0.999313,0.999932,0.999837,0.999571,0.999892,-0.299145,-0.363759,-0.007583,-0.032152,-0.306465,-0.041321,0.000000
8,0.999795,0.999313,0.999932,0.999837,0.999571,0.999892,-0.299145,-0.363759,-0.007583,-0.032152,-0.306465,-0.041321,0.000000
9,0.999795,0.999313,0.999932,0.999837,0.999571,0.999892,-0.299145,-0.363759,-0.007583,-0.032152,-0.306465,-0.041321,0.000000


### Building the model

In [113]:
model_meta = XGBRegressor()

In [119]:
# Meta-learner inputs: every meta column except the target.
# NOTE(review): this rebinds the global `features`, which earlier held the raw
# input column names; later cells that index `train` with `features` will see
# the meta column names instead.
features = [column for column in meta_data_df.columns if column != 'predictions']

In [120]:
features

['rus_C_11000',
 'rus_C_2000',
 'rus_C_200000',
 'rus_C_25000',
 'rus_C_5000',
 'rus_C_60000',
 'rus_R_11000',
 'rus_R_2000',
 'rus_R_200000',
 'rus_R_25000',
 'rus_R_5000',
 'rus_R_60000']

In [121]:
# Design matrix (base-model outputs) and target (true log revenue) for the meta-learner.
X_meta_data = meta_data_df[features]
y_meta_data = meta_data_df['predictions']

In [122]:
model_meta.fit(X_meta_data,y_meta_data)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

## predicting on validation set

In [220]:
# Base-model inputs for validation: every column of `val` except the
# target-related ones.
features_val = [name for name in val.columns
                if name not in ('is_transaction', 'totals.transactionRevenue',
                                'transactionRevenue_log')]

In [221]:
# Validation inputs for the base models and the true log-revenue target.
X_val = val[features_val]
y_val = val['transactionRevenue_log']

### Predict classifiers

In [222]:
# Score the validation rows with each classifier; get_predict_proba keeps only
# the first predict_proba column. Each model_rus_* is the estimator returned by
# out_of_sample_pred_classifier, i.e. as left by its last per-fold fit.
y_rus_2000_val = get_predict_proba(X_val, model_rus_2000)
y_rus_5000_val = get_predict_proba(X_val, model_rus_5000)
y_rus_11000_val = get_predict_proba(X_val, model_rus_11000)
y_rus_25000_val = get_predict_proba(X_val, model_rus_25000)
y_rus_60000_val = get_predict_proba(X_val, model_rus_60000)
y_rus_200000_val = get_predict_proba(X_val, model_rus_200000)

In [223]:
# Predict validation log revenue with each regressor. Each model_rus_*_R is
# the estimator returned by out_of_sample_pred_regressor, i.e. as left by its
# last per-fold fit.
y_rus_2000_R_val = model_rus_2000_R.predict(X_val)
y_rus_5000_R_val = model_rus_5000_R.predict(X_val)
y_rus_11000_R_val = model_rus_11000_R.predict(X_val)
y_rus_25000_R_val = model_rus_25000_R.predict(X_val)
y_rus_60000_R_val = model_rus_60000_R.predict(X_val)
y_rus_200000_R_val = model_rus_200000_R.predict(X_val)

In [227]:
# Level-2 inputs for validation, using the same column names as the training
# meta data so the meta-learner sees matching features.
meta_data_val = {'rus_C_2000':y_rus_2000_val, 'rus_C_5000':y_rus_5000_val, 'rus_C_11000': y_rus_11000_val,
                 'rus_C_25000':y_rus_25000_val,

             'rus_C_60000':y_rus_60000_val, 'rus_C_200000': y_rus_200000_val,

             'rus_R_2000':y_rus_2000_R_val, 'rus_R_5000':y_rus_5000_R_val, 'rus_R_11000':y_rus_11000_R_val, 

             'rus_R_25000':y_rus_25000_R_val, 'rus_R_60000':y_rus_60000_R_val, 'rus_R_200000':y_rus_200000_R_val


            }

In [228]:
# One row per validation row, one column per base model.
meta_data_val_df = pd.DataFrame(meta_data_val)

In [232]:
# Attach the true validation target by position. y_val carries val's index,
# while meta_data_val_df has a fresh RangeIndex, so an index-aligned
# assignment would only be correct when the two happen to match.
meta_data_val_df['predictions'] = y_val.values

In [238]:
# Preview only: requesting 10000 rows asks the notebook to render a huge frame.
meta_data_val_df.head(10)

Unnamed: 0,rus_C_11000,rus_C_2000,rus_C_200000,rus_C_25000,rus_C_5000,rus_C_60000,rus_R_11000,rus_R_2000,rus_R_200000,rus_R_25000,rus_R_5000,rus_R_60000,predictions
0,0.998219,0.986077,0.999831,0.998959,0.997620,0.999573,-0.092026,0.546617,0.046943,0.129029,0.150878,0.019389,0.000000
1,0.999016,0.996497,0.999883,0.999495,0.998962,0.999734,-0.306519,-0.293146,0.006483,-0.031730,-0.276419,-0.037231,0.000000
2,0.999125,0.997085,0.999887,0.999557,0.998525,0.999752,-0.304971,-0.455099,0.006483,-0.044092,-0.290853,-0.051360,0.000000
3,0.998937,0.992867,0.999883,0.999393,0.998467,0.999698,-0.240338,-0.187617,0.008384,0.043107,-0.173559,-0.030635,0.000000
4,0.999055,0.993919,0.999899,0.999444,0.998812,0.999707,-0.236674,-0.390601,0.008384,-0.034468,-0.159125,-0.016505,0.000000
5,0.998806,0.994675,0.999897,0.999511,0.999241,0.999723,-0.229400,-0.162733,0.008384,-0.003654,-0.283722,-0.030635,0.000000
6,0.998590,0.993246,0.999891,0.999362,0.997872,0.999646,0.132305,0.140528,0.008050,0.063014,0.054276,0.045739,0.000000
7,0.998937,0.992867,0.999883,0.999393,0.998467,0.999698,-0.240338,-0.187617,0.008384,0.043107,-0.173559,-0.030635,0.000000
8,0.998590,0.992867,0.999878,0.999362,0.997872,0.999646,0.132305,0.246515,0.008050,0.063014,0.054276,0.045739,0.000000
9,0.999055,0.993753,0.999887,0.999444,0.998467,0.999712,-0.237732,-0.317093,0.008384,0.034722,-0.173559,-0.030635,0.000000


In [235]:
# Meta-learner input columns, taken from the training meta frame
# (meta_data_df) so the validation frame is indexed with the training columns.
# NOTE(review): this rebinds `features_val`, which earlier held the raw
# validation input column names.
features_val = [column for column in meta_data_df.columns if column != 'predictions']

In [236]:
# Score the validation meta features with model_meta (defined outside this section).
y_meta_val_test = model_meta.predict(meta_data_val_df[features_val])

In [239]:
# Overwrite negative outputs with 0, in place, through a boolean mask.
y_meta_val_test[y_meta_val_test<0]=0

In [240]:
# RMSE of the meta-model output against the stored target column.
# Arguments follow scikit-learn's (y_true, y_pred) order; the value is the same
# as before because squared error is symmetric in its two arguments.
mean_squared_error(meta_data_val_df['predictions'], y_meta_val_test)**0.5

2.1175512299952914

### Random under sampling 800,000

In [None]:
# Classifier for this under-sampling experiment; no hyper-parameters are overridden.
model_rus = XGBClassifier()

In [None]:
# random_under_sample is defined outside this section; presumably the last
# argument (800000) is the number of rows kept — TODO confirm against its definition.
train_rus = random_under_sample(train,'is_transaction',800000)

In [None]:
# Inputs and binary target taken from the under-sampled frame.
X_rus = train_rus[features]
y_rus = train_rus['is_transaction']

In [None]:
model_rus.fit(X_rus,y_rus)

In [None]:
# NOTE(review): X_val / y_val are not defined in this section, and the regression
# section further down rebinds both names to regression features / target.
# Make sure the classification versions are live when these two cells run.
y_rus_val = model_rus.predict(X_val)

In [None]:
# Arguments are (true labels, predicted labels).
confusion_matrix(y_val,y_rus_val)

### Random under sampling 5,000

In [None]:
# Same experiment as the other under-sampling sections, with 5000 passed to the sampler.
model_rus_0 = XGBClassifier()

In [None]:
# random_under_sample is defined outside this section; presumably 5000 is the
# number of rows kept — TODO confirm against its definition.
train_rus_0 = random_under_sample(train,'is_transaction',5000)

In [None]:
# Inputs and binary target taken from the under-sampled frame.
X_rus_0 = train_rus_0[features]
y_rus_0 = train_rus_0['is_transaction']

In [None]:
model_rus_0.fit(X_rus_0,y_rus_0)

In [None]:
# NOTE(review): relies on X_val / y_val from outside this section; the regression
# section further down rebinds both names, so execution order matters here.
y_rus_val_0 = model_rus_0.predict(X_val)

In [None]:
# Arguments are (true labels, predicted labels).
confusion_matrix(y_val,y_rus_val_0)

### Random under sampling 20,000   

In [None]:
# Same experiment as the other under-sampling sections, with 20000 passed to the sampler.
model_rus_1 = XGBClassifier()

In [None]:
# random_under_sample is defined outside this section; presumably 20000 is the
# number of rows kept — TODO confirm against its definition.
train_rus_1 = random_under_sample(train,'is_transaction',20000)



In [None]:
# Inputs and binary target taken from the under-sampled frame.
X_rus_1 = train_rus_1[features]
y_rus_1 = train_rus_1['is_transaction']

In [None]:
model_rus_1.fit(X_rus_1,y_rus_1)

In [None]:
# NOTE(review): relies on X_val / y_val from outside this section; the regression
# section further down rebinds both names, so execution order matters here.
y_rus_val_1 = model_rus_1.predict(X_val)

In [None]:
# Arguments are (true labels, predicted labels).
confusion_matrix(y_val,y_rus_val_1)

### Random under sampling 11,000

In [None]:
# Same experiment as the other under-sampling sections, with 11000 passed to the sampler.
model_rus_2 = XGBClassifier()

In [None]:
# random_under_sample is defined outside this section; presumably 11000 is the
# number of rows kept — TODO confirm against its definition.
train_rus_2 = random_under_sample(train,'is_transaction',11000)

In [None]:
# Inputs and binary target taken from the under-sampled frame.
X_rus_2 = train_rus_2[features]
y_rus_2 = train_rus_2['is_transaction']

In [None]:
model_rus_2.fit(X_rus_2,y_rus_2)

In [None]:
# NOTE(review): relies on X_val / y_val from outside this section; the regression
# section further down rebinds both names, so execution order matters here.
y_rus_val_2 = model_rus_2.predict(X_val)

In [None]:
# Arguments are (true labels, predicted labels).
confusion_matrix(y_val,y_rus_val_2)

### Random under sampling 100,000

In [None]:
# Same experiment as the other under-sampling sections, with 100000 passed to the sampler.
model_rus_3 = XGBClassifier()

In [None]:
# random_under_sample is defined outside this section; presumably 100000 is the
# number of rows kept — TODO confirm against its definition.
train_rus_3 = random_under_sample(train,'is_transaction',100000)

In [None]:
# Inputs and binary target taken from the under-sampled frame.
X_rus_3 = train_rus_3[features]
y_rus_3 = train_rus_3['is_transaction']

In [None]:
model_rus_3.fit(X_rus_3,y_rus_3)

In [None]:
# NOTE(review): relies on X_val / y_val from outside this section; the regression
# section further down rebinds both names, so execution order matters here.
y_rus_val_3 = model_rus_3.predict(X_val)

In [None]:
# Arguments are (true labels, predicted labels).
confusion_matrix(y_val,y_rus_val_3)

### No sampling

In [None]:
# Reference classifier fitted on the full training frame (no call to random_under_sample).
model_1 = XGBClassifier()

In [None]:
# NOTE(review): the generic names X / y are rebound by the regression section below.
X= train[features]
y= train['is_transaction']

In [None]:
model_1.fit(X,y)

In [None]:
# NOTE(review): relies on X_val / y_val from outside this section; the regression
# section further down rebinds both names, so execution order matters here.
y_val_test = model_1.predict(X_val)

In [None]:
# Arguments are (true labels, predicted labels).
confusion_matrix(y_val,y_val_test)

## Regression Problem: transactionRevenue_log prediction

In [241]:
# Regressor used as the baseline for this section; no hyper-parameters are overridden.
model_reg = XGBRegressor()

In [243]:
# Regression inputs: every column of train except the classification label,
# the raw and log revenue columns, and 'folds' (presumably a CV fold id — confirm).
features_1 = [
    name for name in train.columns
    if name not in ('is_transaction', 'totals.transactionRevenue',
                    'transactionRevenue_log', 'folds')
]

In [245]:
# Regression design matrix and target, taken from the full training frame.
# NOTE(review): X and y are the same names the classification "no sampling"
# cells assign; they are rebound here.
X = train[features_1]
y = train['transactionRevenue_log']

In [246]:
# Fit the regressor; the repr printed below is the fitted estimator's parameter listing.
model_reg.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [247]:
# Validation inputs and target for the regression task.
# NOTE(review): this rebinds X_val and y_val, which the classification cells above
# also read (predict(X_val), confusion_matrix(y_val, ...)). Running those cells
# after this one would feed them regression features and a continuous target.
X_val = val[features_1]
y_val = val['transactionRevenue_log']

In [248]:
y_val_pred = model_reg.predict(X_val)

In [249]:
# Set negative predictions to 0
# (done in place through a boolean mask on the prediction array).
y_val_pred[y_val_pred<0]=0

In [250]:
# prediction_df is defined outside this section. Judging by the output below it
# returns one row per fullVisitorId with actual and predicted revenue plus their
# log_* counterparts — presumably aggregated per visitor; confirm in its definition.
pred_baseline = prediction_df(val_master,'fullVisitorId','totals.transactionRevenue',y_val_pred)
# Preview the first 10 rows only instead of requesting 1,000 rows of cell output.
pred_baseline.head(10)

Unnamed: 0,fullVisitorId,totals.transactionRevenue,prediction,log_totals.transactionRevenue,log_prediction
0,26722803385797,0.0,0.002138,0.000000,0.002136
1,48421062322244,0.0,0.000659,0.000000,0.000659
2,62267706107999,0.0,0.000659,0.000000,0.000659
3,62349695125717,0.0,0.000659,0.000000,0.000659
4,64767209884626,0.0,0.004009,0.000000,0.004001
5,68403966359845,0.0,0.000905,0.000000,0.000905
6,85059828173212,0.0,0.000000,0.000000,0.000000
7,90085033332104,0.0,0.000905,0.000000,0.000905
8,95855025602688,0.0,0.002138,0.000000,0.002136
9,143747325766784,0.0,0.000000,0.000000,0.000000


In [252]:
# Baseline RMSE between the log-revenue column (true values, first argument) and
# the log-prediction column of pred_baseline; **0.5 takes the square root of the MSE.
mean_squared_error(pred_baseline['log_totals.transactionRevenue'],pred_baseline['log_prediction'])**0.5

2.172517812263597

In [None]:
# Scratch check of a conditional assignment: x starts at 10 and is replaced
# by 100 whenever n is truthy; the trailing bare name displays the result.
n = 15
x = 10
if n:
    x = 100
x