In [81]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
import random
from random import randrange

In [None]:


def column_type(dataFrame):
    
    type_keys = np.array(dataFrame.dtypes.unique())
    
    Dict = {object_type:[] for object_type in type_keys }
    
    for column in dataFrame.columns:
        Dict[dataFrame[column].dtypes].append(column)
    
    return Dict


def column_counts(dataFrame, threshold):
    
    type_keys = np.array(dataFrame.dtypes.unique())
    
    Dict = {object_type:{} for object_type in type_keys}
    
    for column in dataFrame.columns:
        
        if len(dataFrame[column].unique()) <= threshold:
            
            Dict[dataFrame[column].dtype][column] = len(dataFrame[column].unique())
            
    return Dict


def value_count_preprocess(dataFrame,column, remove = False):
    
    value_count_df = dataFrame[column].value_counts()
    value_count_df_reshaped = value_count_df.rename_axis(column).reset_index(name = column+'_counts')
    dataFrame = pd.merge(dataFrame, value_count_df_reshaped, on = column, how = 'left')
    
    if remove:
        
        
        dataFrame.drop(columns=[column],inplace=True)
    return dataFrame


def get_dummies_preprocess(dataFrame, column, remove = False):
    dataFrame_encoded = pd.get_dummies(dataFrame[column],drop_first = True)
    dataFrame = pd.concat([dataFrame,dataFrame_encoded],axis = 1)
    if remove:
        dataFrame.drop(columns = [column],inplace=True)
    return dataFrame


def date_time_conversion(dataFrame, column, year = 0, month = 0, day = 0, weekday = 0, unit = None ,errors = None , format = None):
    
    dataFrame[column] = pd.to_datetime(dataFrame[column], format = format, errors = errors, unit = unit)
    
    if year: 
        dataFrame[column+'_year'] = dataFrame[column].apply(lambda x: x.year)
    
    if month:
        dataFrame[column+'_month'] = dataFrame[column].apply(lambda x: x.month)
        
    if day:
        dataFrame[column+'_day'] = dataFrame[column].apply(lambda x: x.day)
        
    if weekday:
        dataFrame[column+'_weekday'] = dataFrame[column].apply(lambda x: x.weekday())
    
    return dataFrame



def random_under_sample(dataFrame, column, sample_size):
    
    dataFrame_0 = dataFrame[dataFrame[column]==0]
    
    dataFrame_1 = dataFrame[dataFrame[column]==1]
    
    dataFrame_0_index_list = list(dataFrame_0.index.values)
    
    dataFrame_0_rand_index = random.sample(train_0_index_list,sample_size)
    
    dataFrame_0_rand_index = list(set(train_0_rand_index))
    
    dataFrame_0_rus = dataFrame_0.loc[dataFrame_0_rand_index,:]
    
    frames = [dataFrame_1,dataFrame_0_rus]
    
    return pd.concat(frames)
        
    
    

### required 

In [None]:
train = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/train_flattened.csv')

In [None]:
val = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/val.csv')

In [None]:
test = pd.read_csv('/Users/psangha/Desktop/Kaggle/Kaggle-Google-Analytics-Customer-Revenue/data/test_flattened.csv')

### analysis

In [None]:
train.head()

In [None]:
test.head()

In [None]:
val.head()

## Preprocessing date and time features

### required

In [None]:
train["totals.transactionRevenue"].fillna(0, inplace=True)
val["totals.transactionRevenue"].fillna(0, inplace = True)

In [None]:
date_time_conversion(train, column = 'date', year = 1, month = 1, day = 1, weekday = 1, unit = None, errors = 'ignore', format = '%Y%m%d')
date_time_conversion(train,column='visitStartTime', year = 1, month = 1, day = 1, weekday = 1,unit='s',errors='ignore')

In [None]:
date_time_conversion(test, column = 'date', year = 1, month = 1, day = 1, weekday = 1, unit = None, errors = 'ignore', format = '%Y%m%d')
date_time_conversion(test,column='visitStartTime', year = 1, month = 1, day = 1, weekday = 1,unit='s',errors='ignore')

## Target variable preprocessing

### required

In [None]:
# convert Null transaction values to 0
train["totals.transactionRevenue"].fillna(0, inplace=True)
val["totals.transactionRevenue"].fillna(0, inplace=True)

In [None]:
train['totals.transactionRevenue'] = train['totals.transactionRevenue'].astype('float')
val['totals.transactionRevenue'] = val['totals.transactionRevenue'].astype('float')

In [None]:
# Lets create a variable which one-hot-encodes whether a transaction value is 0 or not
transaction_series = train['totals.transactionRevenue'] 
train['is_transaction'] = transaction_series.apply(lambda x:1 if x>0 else 0)


val_transaction_series = val['totals.transactionRevenue'] 
val['is_transaction'] = val_transaction_series.apply(lambda x:1 if x>0 else 0)

In [None]:
train['transactionRevenue_log'] = np.log1p(train['totals.transactionRevenue'].values)
val['transactionRevenue_log'] = np.log1p(val['totals.transactionRevenue'].values)

### analysis

In [None]:
train.head()

In [None]:
val.head()

In [None]:
test.head()

## Restricting Features to analyse for train, val, test

### required

In [None]:
features = ['device.browser',
 'device.operatingSystem',
 'geoNetwork.continent',
 'geoNetwork.subContinent',
 'totals.hits',
 'totals.pageviews',
 'trafficSource.medium',
 'channelGrouping',
 'visitNumber',
 'device.isMobile',
 'totals.bounces',
 'totals.newVisits',
'device.deviceCategory',
'date_month',
'date_day',
'date_weekday',
'visitStartTime_month',
'visitStartTime_day',
'visitStartTime_weekday']



predictors = ['totals.transactionRevenue','is_transaction','transactionRevenue_log']

predictors_val = ['totals.transactionRevenue','transactionRevenue_log']

In [None]:
train = train[features+predictors]
test = test[features]
val = val[features+predictors_val]

### analysis

In [None]:
train.head()

In [None]:
val.head()

In [None]:
test.head()

## Encoding variables

### required 

In [None]:
encoded_objects = ['device.browser','device.operatingSystem','geoNetwork.continent','geoNetwork.subContinent',
                   'trafficSource.medium','channelGrouping']

one_hot_encoded_objects = ['device.deviceCategory','device.isMobile']

In [None]:
for column in encoded_objects:
    train = value_count_preprocess(train,column,remove = True)
    val = value_count_preprocess(val,column,remove = True)
    test = value_count_preprocess(test,column,remove=True)

In [None]:
for column in one_hot_encoded_objects:
    train = get_dummies_preprocess(train,column,remove=True)
    val = get_dummies_preprocess(val,column,remove=True)
    test = get_dummies_preprocess(test,column,remove=True)
    
    

### analysis

In [None]:
### before encoding
train.dtypes

In [None]:
test.dtypes

In [None]:
val.dtypes

In [None]:
### after encoding
train.dtypes

In [None]:
test.dtypes

In [None]:
val.dtypes

## New feature list 

In [None]:
features = []
for column in train.columns:
    if column != 'is_transaction' and column != 'totals.transactionRevenue' and column != 'transactionRevenue_log':
        features.append(column)
features

In [None]:
model = XGBClassifier()

In [None]:
X = train[features]
y = train['is_transaction']

In [None]:
model.fit(X,y)

In [None]:
model.predict(X).sum()

In [None]:
y.sum()

In [None]:
val_transaction_series = val['totals.transactionRevenue'] 
val['is_transaction'] = val_transaction_series.apply(lambda x:1 if x>0 else 0)

In [None]:
X_val = val[features]
y_val = val['is_transaction']

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_val_test = model.predict(X_val)

In [None]:
confusion_matrix(y_val,y_val_test)

### Random under sampling 500,000

In [179]:
model_rus = XGBClassifier()

In [181]:
train_rus = random_under_sample(train,'is_transaction',800000)

In [182]:
X_rus = train_rus[features]
y_rus = train_rus['is_transaction']

In [183]:
model_rus.fit(X_rus,y_rus)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [184]:
y_rus_val = model_rus.predict(X_val)

  if diff:


In [185]:
confusion_matrix(y_val,y_rus_val)

array([[266029,   1004],
       [  3217,    846]])

### Random under sampling 5,000

In [166]:
model_rus_0 = XGBClassifier()

In [172]:
train_rus_0 = random_under_sample(train,'is_transaction',5000)

In [168]:
X_rus_0 = train_rus_0[features]
y_rus_0 = train_rus_0['is_transaction']

In [169]:
model_rus_0.fit(X_rus_0,y_rus_0)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [170]:
y_rus_val_0 = model_rus_0.predict(X_val)

  if diff:


In [171]:
confusion_matrix(y_val,y_rus_val_0)

array([[266029,   1004],
       [  3217,    846]])

### Random under sampling 20,000   

In [137]:
model_rus_1 = XGBClassifier()

In [138]:
train_rus_1 = random_under_sample(train,'is_transaction',20000)



In [139]:
X_rus_1 = train_rus_1[features]
y_rus_1 = train_rus_1['is_transaction']

In [140]:
model_rus_1.fit(X_rus_1,y_rus_1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [141]:
y_rus_val_1 = model_rus_1.predict(X_val)

  if diff:


In [142]:
confusion_matrix(y_val,y_rus_val_1)

array([[266029,   1004],
       [  3217,    846]])

### Random under sampling 11,000

In [146]:
model_rus_2 = XGBClassifier()

In [147]:
train_rus_2 = random_under_sample(train,'is_transaction',11000)

In [148]:
X_rus_2 = train_rus_2[features]
y_rus_2 = train_rus_2['is_transaction']

In [149]:
model_rus_2.fit(X_rus_2,y_rus_2)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [150]:
y_rus_val_2 = model_rus_2.predict(X_val)

  if diff:


In [151]:
confusion_matrix(y_val,y_rus_val_2)

array([[266029,   1004],
       [  3217,    846]])

### Random under sampling 100,000

In [152]:
model_rus_3 = XGBClassifier()

In [153]:
train_rus_3 = random_under_sample(train,'is_transaction',100000)

In [154]:
X_rus_3 = train_rus_3[features]
y_rus_3 = train_rus_3['is_transaction']

In [155]:
model_rus_3.fit(X_rus_3,y_rus_3)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [156]:
y_rus_val_3 = model_rus_3.predict(X_val)

  if diff:


In [157]:
confusion_matrix(y_val,y_rus_val_3)

array([[266029,   1004],
       [  3217,    846]])

### no sampling

In [160]:
model_1 = XGBClassifier()

In [161]:
X= train[features]
y= train['is_transaction']

In [162]:
model_1.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [163]:
y_val_test = model_1.predict(X_val)

  if diff:


In [164]:
confusion_matrix(y_val,y_val_test)

array([[267033,      0],
       [  4063,      0]])