In [64]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import random
from random import randrange
from sklearn.metrics import confusion_matrix

In [25]:


def column_type(dataFrame):
    
    type_keys = np.array(dataFrame.dtypes.unique())
    
    Dict = {object_type:[] for object_type in type_keys }
    
    for column in dataFrame.columns:
        Dict[dataFrame[column].dtypes].append(column)
    
    return Dict


def column_counts(dataFrame, threshold):
    
    type_keys = np.array(dataFrame.dtypes.unique())
    
    Dict = {object_type:{} for object_type in type_keys}
    
    for column in dataFrame.columns:
        
        if len(dataFrame[column].unique()) <= threshold:
            
            Dict[dataFrame[column].dtype][column] = len(dataFrame[column].unique())
            
    return Dict


def value_count_preprocess(dataFrame,column, remove = False):
    
    value_count_df = dataFrame[column].value_counts()
    value_count_df_reshaped = value_count_df.rename_axis(column).reset_index(name = column+'_counts')
    dataFrame = pd.merge(dataFrame, value_count_df_reshaped, on = column, how = 'left')
    
    if remove:
        
        
        dataFrame.drop(columns=[column],inplace=True)
    return dataFrame


def get_dummies_preprocess(dataFrame, column, remove = False):
    dataFrame_encoded = pd.get_dummies(dataFrame[column],drop_first = True)
    dataFrame = pd.concat([dataFrame,dataFrame_encoded],axis = 1)
    if remove:
        dataFrame.drop(columns = [column],inplace=True)
    return dataFrame


def date_time_conversion(dataFrame, column, year = 0, month = 0, day = 0, weekday = 0, unit = None ,errors = None , format = None):
    
    dataFrame[column] = pd.to_datetime(dataFrame[column], format = format, errors = errors, unit = unit)
    
    if year: 
        dataFrame[column+'_year'] = dataFrame[column].apply(lambda x: x.year)
    
    if month:
        dataFrame[column+'_month'] = dataFrame[column].apply(lambda x: x.month)
        
    if day:
        dataFrame[column+'_day'] = dataFrame[column].apply(lambda x: x.day)
        
    if weekday:
        dataFrame[column+'_weekday'] = dataFrame[column].apply(lambda x: x.weekday())
    
    return dataFrame



def random_under_sample(dataFrame, column, sample_size):
    
    dataFrame_0 = dataFrame[dataFrame[column]==0]
    
    dataFrame_1 = dataFrame[dataFrame[column]==1]
    
    dataFrame_0_index_list = list(dataFrame_0.index.values)
    
    dataFrame_0_rand_index = random.sample(dataFrame_0_index_list,sample_size)
    
    dataFrame_0_rand_index = list(set(dataFrame_0_rand_index))
    
    dataFrame_0_rus = dataFrame_0.loc[dataFrame_0_rand_index,:]
    
    frames = [dataFrame_1,dataFrame_0_rus]
    
    return pd.concat(frames)
        
    
    

### required 

In [4]:
train = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/train_flattened.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
val = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/val.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
test = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/test_flattened.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### analysis

In [None]:
train.head()

In [None]:
test.head()

In [None]:
val.head()

## Preprocessing date and time features

### required

In [7]:
train["totals.transactionRevenue"].fillna(0, inplace=True)
val["totals.transactionRevenue"].fillna(0, inplace = True)

In [8]:
date_time_conversion(train, column = 'date', year = 1, month = 1, day = 1, weekday = 1, unit = None, errors = 'ignore', format = '%Y%m%d')
date_time_conversion(train,column='visitStartTime', year = 1, month = 1, day = 1, weekday = 1,unit='s',errors='ignore')

Unnamed: 0.1,Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,...,trafficSource.referralPath,trafficSource.source,date_year,date_month,date_day,date_weekday,visitStartTime_year,visitStartTime_month,visitStartTime_day,visitStartTime_weekday
0,0,Organic Search,2016-09-02,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,2016-09-02 15:33:05,Chrome,...,,google,2016,9,2,4,2016,9,2,4
1,1,Organic Search,2016-09-02,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,2016-09-03 05:22:27,Firefox,...,,google,2016,9,2,4,2016,9,3,5
2,2,Organic Search,2016-09-02,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,2016-09-03 01:16:26,Chrome,...,,google,2016,9,2,4,2016,9,3,5
3,3,Organic Search,2016-09-02,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,2016-09-03 05:40:13,UC Browser,...,,google,2016,9,2,4,2016,9,3,5
4,4,Organic Search,2016-09-02,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,2016-09-02 13:23:20,Chrome,...,,google,2016,9,2,4,2016,9,2,4
5,5,Organic Search,2016-09-02,2938943183656635653,2938943183656635653_1472807194,Not Socially Engaged,1472807194,1,2016-09-02 09:06:34,Chrome,...,,google,2016,9,2,4,2016,9,2,4
6,6,Organic Search,2016-09-02,1905672039242460897,1905672039242460897_1472817241,Not Socially Engaged,1472817241,1,2016-09-02 11:54:01,Chrome,...,,google,2016,9,2,4,2016,9,2,4
7,7,Organic Search,2016-09-02,537222803633850821,537222803633850821_1472812602,Not Socially Engaged,1472812602,1,2016-09-02 10:36:42,Chrome,...,,google,2016,9,2,4,2016,9,2,4
8,8,Organic Search,2016-09-02,4445454811831400414,4445454811831400414_1472805784,Not Socially Engaged,1472805784,1,2016-09-02 08:43:04,Internet Explorer,...,,google,2016,9,2,4,2016,9,2,4
9,9,Organic Search,2016-09-02,9499785259412240342,9499785259412240342_1472812272,Not Socially Engaged,1472812272,1,2016-09-02 10:31:12,Firefox,...,,google,2016,9,2,4,2016,9,2,4


In [9]:
date_time_conversion(test, column = 'date', year = 1, month = 1, day = 1, weekday = 1, unit = None, errors = 'ignore', format = '%Y%m%d')
date_time_conversion(test,column='visitStartTime', year = 1, month = 1, day = 1, weekday = 1,unit='s',errors='ignore')

Unnamed: 0.1,Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,...,trafficSource.referralPath,trafficSource.source,date_year,date_month,date_day,date_weekday,visitStartTime_year,visitStartTime_month,visitStartTime_day,visitStartTime_weekday
0,0,Organic Search,2017-10-16,6167871330617112363,6167871330617112363_1508151024,Not Socially Engaged,1508151024,2,2017-10-16 10:50:24,Chrome,...,,google,2017,10,16,0,2017,10,16,0
1,1,Organic Search,2017-10-16,0643697640977915618,0643697640977915618_1508175522,Not Socially Engaged,1508175522,1,2017-10-16 17:38:42,Chrome,...,,google,2017,10,16,0,2017,10,16,0
2,2,Organic Search,2017-10-16,6059383810968229466,6059383810968229466_1508143220,Not Socially Engaged,1508143220,1,2017-10-16 08:40:20,Chrome,...,,google,2017,10,16,0,2017,10,16,0
3,3,Organic Search,2017-10-16,2376720078563423631,2376720078563423631_1508193530,Not Socially Engaged,1508193530,1,2017-10-16 22:38:50,Safari,...,,google,2017,10,16,0,2017,10,16,0
4,4,Organic Search,2017-10-16,2314544520795440038,2314544520795440038_1508217442,Not Socially Engaged,1508217442,1,2017-10-17 05:17:22,Safari,...,,google,2017,10,16,0,2017,10,17,1
5,5,Organic Search,2017-10-16,4133039884103392367,4133039884103392367_1508186358,Not Socially Engaged,1508186358,1,2017-10-16 20:39:18,Chrome,...,,google,2017,10,16,0,2017,10,16,0
6,6,Organic Search,2017-10-16,4320478850207397557,4320478850207397557_1508203650,Not Socially Engaged,1508203650,1,2017-10-17 01:27:30,Chrome,...,,google,2017,10,16,0,2017,10,17,1
7,7,Organic Search,2017-10-16,5876438247590157131,5876438247590157131_1508184397,Not Socially Engaged,1508184397,1,2017-10-16 20:06:37,Chrome,...,,google,2017,10,16,0,2017,10,16,0
8,8,Organic Search,2017-10-16,0514591268737702944,0514591268737702944_1508189652,Not Socially Engaged,1508189652,6,2017-10-16 21:34:12,Chrome,...,,google,2017,10,16,0,2017,10,16,0
9,9,Organic Search,2017-10-16,6430567031531677212,6430567031531677212_1508175502,Not Socially Engaged,1508175502,1,2017-10-16 17:38:22,Chrome,...,,google,2017,10,16,0,2017,10,16,0


## Target variable preprocessing

### required

In [10]:
# convert Null transaction values to 0
train["totals.transactionRevenue"].fillna(0, inplace=True)
val["totals.transactionRevenue"].fillna(0, inplace=True)

In [11]:
train['totals.transactionRevenue'] = train['totals.transactionRevenue'].astype('float')
val['totals.transactionRevenue'] = val['totals.transactionRevenue'].astype('float')

In [32]:
# Lets create a variable which one-hot-encodes whether a transaction value is 0 or not
transaction_series = train['totals.transactionRevenue'] 
train['is_transaction'] = transaction_series.apply(lambda x:1 if x>0 else 0)


val_transaction_series = val['totals.transactionRevenue'] 
val['is_transaction'] = val_transaction_series.apply(lambda x:1 if x>0 else 0)

In [13]:
train['transactionRevenue_log'] = np.log1p(train['totals.transactionRevenue'].values)
val['transactionRevenue_log'] = np.log1p(val['totals.transactionRevenue'].values)

### analysis

In [None]:
train.head()

In [None]:
val.head()

In [None]:
test.head()

## Restricting Features to analyse for train, val, test

### required

In [14]:
features = ['device.browser',
 'device.operatingSystem',
 'geoNetwork.continent',
 'geoNetwork.subContinent',
 'totals.hits',
 'totals.pageviews',
 'trafficSource.medium',
 'channelGrouping',
 'visitNumber',
 'device.isMobile',
 'totals.bounces',
 'totals.newVisits',
'device.deviceCategory',
'date_month',
'date_day',
'date_weekday',
'visitStartTime_month',
'visitStartTime_day',
'visitStartTime_weekday']



predictors = ['totals.transactionRevenue','is_transaction','transactionRevenue_log']

predictors_val = ['totals.transactionRevenue','transactionRevenue_log']

In [15]:
train = train[features+predictors]
test = test[features]
val = val[features+predictors_val]

### analysis

In [None]:
train.head()

In [None]:
val.head()

In [None]:
test.head()

## Encoding variables

### required 

In [16]:
encoded_objects = ['device.browser','device.operatingSystem','geoNetwork.continent','geoNetwork.subContinent',
                   'trafficSource.medium','channelGrouping']

one_hot_encoded_objects = ['device.deviceCategory','device.isMobile']

In [17]:
for column in encoded_objects:
    train = value_count_preprocess(train,column,remove = True)
    val = value_count_preprocess(val,column,remove = True)
    test = value_count_preprocess(test,column,remove=True)

In [18]:
for column in one_hot_encoded_objects:
    train = get_dummies_preprocess(train,column,remove=True)
    val = get_dummies_preprocess(val,column,remove=True)
    test = get_dummies_preprocess(test,column,remove=True)
    
    

### analysis

In [None]:
### before encoding
train.dtypes

In [None]:
test.dtypes

In [None]:
val.dtypes

In [19]:
### after encoding
train.dtypes

totals.hits                         int64
totals.pageviews                  float64
visitNumber                         int64
totals.bounces                    float64
totals.newVisits                  float64
date_month                          int64
date_day                            int64
date_weekday                        int64
visitStartTime_month                int64
visitStartTime_day                  int64
visitStartTime_weekday              int64
totals.transactionRevenue         float64
is_transaction                      int64
transactionRevenue_log            float64
device.browser_counts               int64
device.operatingSystem_counts       int64
geoNetwork.continent_counts         int64
geoNetwork.subContinent_counts      int64
trafficSource.medium_counts         int64
channelGrouping_counts              int64
mobile                              uint8
tablet                              uint8
True                                uint8
dtype: object

In [20]:
test.dtypes

totals.hits                         int64
totals.pageviews                  float64
visitNumber                         int64
totals.bounces                    float64
totals.newVisits                  float64
date_month                          int64
date_day                            int64
date_weekday                        int64
visitStartTime_month                int64
visitStartTime_day                  int64
visitStartTime_weekday              int64
device.browser_counts               int64
device.operatingSystem_counts       int64
geoNetwork.continent_counts         int64
geoNetwork.subContinent_counts      int64
trafficSource.medium_counts         int64
channelGrouping_counts              int64
mobile                              uint8
tablet                              uint8
True                                uint8
dtype: object

In [21]:
val.dtypes

totals.hits                         int64
totals.pageviews                  float64
visitNumber                         int64
totals.bounces                    float64
totals.newVisits                  float64
date_month                          int64
date_day                            int64
date_weekday                        int64
visitStartTime_month                int64
visitStartTime_day                  int64
visitStartTime_weekday              int64
totals.transactionRevenue         float64
transactionRevenue_log            float64
device.browser_counts               int64
device.operatingSystem_counts       int64
geoNetwork.continent_counts         int64
geoNetwork.subContinent_counts      int64
trafficSource.medium_counts         int64
channelGrouping_counts              int64
mobile                              uint8
tablet                              uint8
True                                uint8
dtype: object

## New feature list 

### Required

In [22]:
features = []
for column in train.columns:
    if column != 'is_transaction' and column != 'totals.transactionRevenue' and column != 'transactionRevenue_log':
        features.append(column)
features

['totals.hits',
 'totals.pageviews',
 'visitNumber',
 'totals.bounces',
 'totals.newVisits',
 'date_month',
 'date_day',
 'date_weekday',
 'visitStartTime_month',
 'visitStartTime_day',
 'visitStartTime_weekday',
 'device.browser_counts',
 'device.operatingSystem_counts',
 'geoNetwork.continent_counts',
 'geoNetwork.subContinent_counts',
 'trafficSource.medium_counts',
 'channelGrouping_counts',
 'mobile',
 'tablet',
 True]

## Classification problem: is_transaction prediction

In [None]:
model = XGBClassifier()

In [None]:
X = train[features]
y = train['is_transaction']

In [None]:
model.fit(X,y)

In [None]:
model.predict(X).sum()

In [None]:
y.sum()

In [None]:
val_transaction_series = val['totals.transactionRevenue'] 
val['is_transaction'] = val_transaction_series.apply(lambda x:1 if x>0 else 0)

In [33]:
X_val = val[features]
y_val = val['is_transaction']

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_val_test = model.predict(X_val)

In [None]:
confusion_matrix(y_val,y_val_test)

### Random under sampling 500,000

In [26]:
model_rus = XGBClassifier()

In [27]:
train_rus = random_under_sample(train,'is_transaction',800000)

In [28]:
X_rus = train_rus[features]
y_rus = train_rus['is_transaction']

In [29]:
model_rus.fit(X_rus,y_rus)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [34]:
y_rus_val = model_rus.predict(X_val)

  if diff:


In [37]:
confusion_matrix(y_val,y_rus_val)

array([[267033,      0],
       [  4063,      0]])

### Random under sampling 5,000

In [38]:
model_rus_0 = XGBClassifier()

In [39]:
train_rus_0 = random_under_sample(train,'is_transaction',5000)

In [40]:
X_rus_0 = train_rus_0[features]
y_rus_0 = train_rus_0['is_transaction']

In [41]:
model_rus_0.fit(X_rus_0,y_rus_0)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [42]:
y_rus_val_0 = model_rus_0.predict(X_val)

  if diff:


In [43]:
confusion_matrix(y_val,y_rus_val_0)

array([[254484,  12549],
       [   334,   3729]])

### Random under sampling 20,000   

In [44]:
model_rus_1 = XGBClassifier()

In [45]:
train_rus_1 = random_under_sample(train,'is_transaction',20000)



In [46]:
X_rus_1 = train_rus_1[features]
y_rus_1 = train_rus_1['is_transaction']

In [47]:
model_rus_1.fit(X_rus_1,y_rus_1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [48]:
y_rus_val_1 = model_rus_1.predict(X_val)

  if diff:


In [49]:
confusion_matrix(y_val,y_rus_val_1)

array([[263965,   3068],
       [  2374,   1689]])

### Random under sampling 11,000

In [50]:
model_rus_2 = XGBClassifier()

In [51]:
train_rus_2 = random_under_sample(train,'is_transaction',11000)

In [52]:
X_rus_2 = train_rus_2[features]
y_rus_2 = train_rus_2['is_transaction']

In [53]:
model_rus_2.fit(X_rus_2,y_rus_2)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [54]:
y_rus_val_2 = model_rus_2.predict(X_val)

  if diff:


In [55]:
confusion_matrix(y_val,y_rus_val_2)

array([[260772,   6261],
       [  1149,   2914]])

### Random under sampling 100,000

In [56]:
model_rus_3 = XGBClassifier()

In [57]:
train_rus_3 = random_under_sample(train,'is_transaction',100000)

In [58]:
X_rus_3 = train_rus_3[features]
y_rus_3 = train_rus_3['is_transaction']

In [59]:
model_rus_3.fit(X_rus_3,y_rus_3)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [60]:
y_rus_val_3 = model_rus_3.predict(X_val)

  if diff:


In [61]:
confusion_matrix(y_val,y_rus_val_3)

array([[266931,    102],
       [  3987,     76]])

### no sampling

In [None]:
model_1 = XGBClassifier()

In [None]:
X= train[features]
y= train['is_transaction']

In [None]:
model_1.fit(X,y)

In [None]:
y_val_test = model_1.predict(X_val)

In [None]:
confusion_matrix(y_val,y_val_test)

## Regression Problem: transactionRevenue_log prediction