# TalkingData (Kaggle)
## Pre-processing

### Import

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.utils import resample

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
!pip install lightgbm
import lightgbm as lgb
#import xgboost as xgb



You are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


### Load in data

In [None]:
DATA_PATH = r"C:\Users\nguy3409\TalkingData-master"

def load_data(data_path=DATA_PATH):
    # PATHS TO FILE
    train_path = os.path.join(data_path, "train.csv")
    test_path = os.path.join(data_path, "test.csv")
    ssize = 50000000
    return pd.read_csv(train_path,nrows=ssize), pd.read_csv(test_path)
    #return pd.read_csv(train_path), pd.read_csv(test_path)

train, test = load_data()

### Data Exploration

In [None]:
# Training sample
print(train.shape)
train.head()

We notice that all the missing values in 'attributed_time' are for observations that did not convert into a download ('is_attributed'=0).

In [None]:
# Plot the proportion of clicks that converted into a download or not
plt.figure(figsize=(6,6))
#sns.set(font_scale=1.2)
mean = (train.is_attributed.values == 1).mean()
ax = sns.barplot(['Converted (1)', 'Not Converted (0)'], [mean, 1-mean])
ax.set(ylabel='Proportion', title='Proportion of clicks converted into app downloads')
for p, uniq in zip(ax.patches, [mean, 1-mean]):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height+0.01,
            '{}%'.format(round(uniq * 100, 2)),
            ha="center")

### Undersampling
Sample the data using random undersampling

In [None]:
# Separate the 2 classes
train_0 = train[train['is_attributed'] == 0]
train_1 = train[train['is_attributed'] == 1]

In [None]:
print(len(train_1))
print(train_0.shape)
print(train.shape)
train['is_attributed'].value_counts()

In [None]:
# Undersample class 0 (without replacement)
train0_undersampled = resample(train_0, replace=False, n_samples=len(train_1), random_state=142) 

In [None]:
# Combine minority class with downsampled majority class
train_us = pd.concat([train0_undersampled, train_1])
 
# Display new class counts
train_us.is_attributed.value_counts()

### Feature Engineering

In [None]:
# Extract features from click_time
def ppClicktime(df):
    df['click_time'] = pd.to_datetime(df['click_time'])
    df['wday'] = df['click_time'].dt.dayofweek
    #df['week'] = df['click_time'].dt.week
    df['hour'] = df['click_time'].dt.hour
    #df['minute'] = df['click_time'].dt.minute
    return df
# Pre-process training (undersampled) and testing sets
#train_pp = ppClicktime(train_us)
# Pre-process training (full) and testing sets
train_pp = ppClicktime(train)
test_pp = ppClicktime(test)

In [None]:
# Drop click_time
train_pp.drop('click_time', axis = 1, inplace = True)
test_pp.drop('click_time', axis = 1, inplace = True)
print(len(test_pp))
test_pp.head()

In [None]:
# Write to csv
train_pp.to_csv("train_pp_50mil.csv",index=None)
#test_pp.to_csv("test_pp.csv",index=None)

### Feature aggregation

In [2]:
# Load in pre-processed files
PP_PATH = r"C:\Users\nguy3409\TalkingData-master"

def load_pp(pp_path=PP_PATH):
    # PATHS TO FILE
    train_pp = os.path.join(pp_path, "train_pp_50mil.csv")
    test_pp = os.path.join(pp_path, "test_pp.csv")
    return pd.read_csv(train_pp), pd.read_csv(test_pp)

train_pp, test_pp = load_pp()

In [3]:
# Drop attributed_time
train_pp.drop('attributed_time', axis = 1, inplace = True)

In [None]:
train_pp.head()

In [4]:
# Drop unnecessary features
def drop_ft(df):
    df.drop(['week','minute'],axis=1, inplace=True)
    return df
train_pp = drop_ft(train_pp)
test_pp = drop_ft(test_pp)

Adding new features

In [7]:
def aggregate_features(df):
    # IPs
    n_ip = df[['ip','channel']].groupby(by=['ip'])[['channel']].count().reset_index().rename(index = str, columns={'channel': 'n_ip'})
    df = df.merge(n_ip, on = ['ip'], how = 'left')
    # app count
    ip_app_count = df[['ip','app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_app_count'})
    df = df.merge(ip_app_count, on = ['ip', 'app'], how = 'left')
    # device count
    ip_device_count = df[['ip','device', 'channel']].groupby(by=['ip', 'device'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_device_count'})
    df = df.merge(ip_device_count, on = ['ip', 'device'], how = 'left')
    # os count
    ip_os_count = df[['ip','os', 'channel']].groupby(by=['ip', 'os'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_os_count'})
    df = df.merge(ip_os_count, on = ['ip', 'os'], how = 'left')
    # wday + hour
    ip_wday_hour = df[['ip', 'wday', 'hour', 'channel']].groupby(by = ['ip','wday','hour'])[['channel']].count().reset_index().rename(index = str, columns = {'channel': 'ip_wday_hour'})
    df = df.merge(ip_wday_hour, on = ['ip', 'wday', 'hour'], how = 'left')
    # app + hour
    ip_app_hour = df[['ip', 'app', 'hour', 'channel']].groupby(by = ['ip','app','hour'])[['channel']].count().reset_index().rename(index = str, columns = {'channel': 'ip_app_hour'})
    df = df.merge(ip_app_hour, on = ['ip', 'app', 'hour'], how = 'left')
    # device + hour
    ip_device_hour = df[['ip', 'device', 'hour', 'channel']].groupby(by = ['ip','device','hour'])[['channel']].count().reset_index().rename(index = str, columns = {'channel': 'ip_device_hour'})
    df = df.merge(ip_device_hour, on = ['ip', 'device', 'hour'], how = 'left')
    # os + hour
    ip_os_hour = df[['ip', 'os', 'hour', 'channel']].groupby(by = ['ip','os','hour'])[['channel']].count().reset_index().rename(index = str, columns = {'channel': 'ip_os_hour'})
    df = df.merge(ip_os_hour, on = ['ip', 'os', 'hour'], how = 'left')
    # os + device + hour
    ip_os_device_hour = df[['ip', 'os', 'device', 'hour', 'channel']].groupby(by = ['ip','os', 'device', 'hour'])[['channel']].count().reset_index().rename(index = str, columns = {'channel': 'ip_os_device_hour'})
    df = df.merge(ip_os_device_hour, on = ['ip', 'os', 'device', 'hour'], how = 'left')
    # app + device + hour
    ip_app_device_hour = df[['ip', 'app', 'device', 'hour', 'channel']].groupby(by = ['ip','app', 'device', 'hour'])[['channel']].count().reset_index().rename(index = str, columns = {'channel': 'ip_app_device_hour'})
    df = df.merge(ip_app_device_hour, on = ['ip', 'app', 'device', 'hour'], how = 'left')
    # device + os
    ip_os_device = df[['ip', 'os', 'device', 'channel']].groupby(by = ['ip','os', 'device'])[['channel']].count().reset_index().rename(index = str, columns = {'channel': 'ip_os_device'})
    df = df.merge(ip_os_device, on = ['ip', 'os', 'device'], how = 'left')
    # app + device
    ip_app_device = df[['ip', 'app', 'device', 'channel']].groupby(by = ['ip','app', 'device'])[['channel']].count().reset_index().rename(index = str, columns = {'channel': 'ip_app_device'})
    df = df.merge(ip_app_device, on = ['ip', 'app', 'device'], how = 'left')
    return df

In [9]:
train_ag = aggregate_features(train_pp)

In [8]:
test_ag = aggregate_features(test_pp)

In [11]:
# Write to csv
#train_ag.to_csv("train_ag_50mil.csv",index=None)
test_ag.to_csv("test_ag.csv",index=None)

# Modeling

In [None]:
# Load in aggregated files
PP_PATH = r"C:\Users\nguy3409\TalkingData-master"

def load_ag(ag_path=PP_PATH):
    # PATHS TO FILE
    train_ag = os.path.join(ag_path, "train_ag_50mil.csv")
    test_ag = os.path.join(ag_path, "test_ag.csv")
    return pd.read_csv(train_ag), pd.read_csv(test_ag)

train_ag, test_ag = load_ag()

### Splitting training dataset

In [12]:
# Separate response variables from predictors
y = list(train_ag.is_attributed)
X = train_ag.drop(['is_attributed'],axis=1)
X.head()

Unnamed: 0,ip,app,device,os,channel,wday,hour,n_ip,ip_app_count,ip_device_count,ip_os_count,ip_wday_hour,ip_app_hour,ip_device_hour,ip_os_hour,ip_os_device_hour,ip_app_device_hour,ip_os_device,ip_app_device
0,83230,3,1,13,379,0,14,8126,1631,8084,2212,1,1,1,1,1,1,2199,1629
1,17357,3,1,19,379,0,14,6625,1356,6540,1644,1,1,1,1,1,1,1644,1354
2,35810,3,1,13,379,0,14,1835,327,1573,365,1,1,1,1,1,1,300,275
3,45745,14,1,13,478,0,14,40024,2310,35699,7812,1,1,1,1,1,1,7055,2260
4,161007,3,1,13,379,0,14,525,104,523,114,1,1,1,1,1,1,114,104


In [13]:
# Drop ip
X = X.drop(['ip'],axis=1)

In [None]:
train_ag.head()

In [14]:
# Split the training data into training and test sets for cross-validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

### Logistic Regression

In [None]:
# Fit model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# Predict on test set
y_pred = logreg.predict(X_test)
y_pred_prob = logreg.predict_proba(X_test)
# AUC
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

### Random Forest

In [None]:
# Fit model
Ntree = 500
rfc = RandomForestClassifier(n_estimators=Ntree)
rfc.fit(X_train, y_train)
# Predict on test set
y_pred = rfc.predict(X_test)
y_pred_prob = rfc.predict_proba(X_test)
# AUC
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

### Light GBM

In [19]:
target = 'is_attributed'
predictors = ['device', 'app', 'os', 'channel', 'wday', 'hour',
              'n_ip', 'ip_app_count', 'ip_device_count', 'ip_os_count',
              'ip_wday_hour', 'ip_app_hour', 'ip_device_hour', 
              'ip_os_hour', 'ip_os_device_hour']
categorical = ['app', 'device', 'os', 'channel', 'wday', 'hour']

In [20]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 255,  
    'max_depth': 8,  
    'min_child_samples': 100,  
    'max_bin': 100,  
    'subsample': 0.7,  
    'subsample_freq': 1,  
    'colsample_bytree': 0.7,  
    'min_child_weight': 0,  
    'subsample_for_bin': 200000,  
    'min_split_gain': 0,  
    'reg_alpha': 0,  
    'reg_lambda': 0,  
   # 'nthread': 8,
    'verbose': 0,
    'scale_pos_weight': 99.7
    }

In [21]:
dtrain = lgb.Dataset(X_train[predictors].values, label=y_train,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )
dvalid = lgb.Dataset(X_test.values, label=y_test,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )

In [22]:
evals_results = {}
lgb_model = lgb.train(params, 
                 dtrain, 
                 valid_sets=[dtrain, dvalid], 
                 valid_names=['train','valid'], 
                 evals_result=evals_results, 
                 num_boost_round=350,
                 early_stopping_rounds=30,
                 verbose_eval=True, 
                 feval=None)

[1]	train's auc: 0.959651	valid's auc: 0.942052
Training until validation scores don't improve for 30 rounds.
[2]	train's auc: 0.963331	valid's auc: 0.928831
[3]	train's auc: 0.967055	valid's auc: 0.931711
[4]	train's auc: 0.968529	valid's auc: 0.933303
[5]	train's auc: 0.969338	valid's auc: 0.942857
[6]	train's auc: 0.969776	valid's auc: 0.946971
[7]	train's auc: 0.970897	valid's auc: 0.947145
[8]	train's auc: 0.971453	valid's auc: 0.947572
[9]	train's auc: 0.971779	valid's auc: 0.948006
[10]	train's auc: 0.971768	valid's auc: 0.950314
[11]	train's auc: 0.972249	valid's auc: 0.947376
[12]	train's auc: 0.972521	valid's auc: 0.947726
[13]	train's auc: 0.972932	valid's auc: 0.950185
[14]	train's auc: 0.973214	valid's auc: 0.950613
[15]	train's auc: 0.97347	valid's auc: 0.95106
[16]	train's auc: 0.973724	valid's auc: 0.951309
[17]	train's auc: 0.973902	valid's auc: 0.947925
[18]	train's auc: 0.974093	valid's auc: 0.948491
[19]	train's auc: 0.974169	valid's auc: 0.948873
[20]	train's auc: 

### XGBoost

In [None]:
!pip install xgboost
import xgboost as xgb

In [15]:
target = 'is_attributed'
predictors = ['device', 'app', 'os', 'channel', 'wday', 'hour',
              'n_ip', 'ip_app_count', 'ip_device_count', 'ip_os_count',
              'ip_wday_hour', 'ip_app_hour', 'ip_device_hour', 
              'ip_os_hour', 'ip_os_device_hour', 'ip_app_device_hour',
              'ip_os_device', 'ip_app_device']
categorical = ['app', 'device', 'os', 'channel', 'wday', 'hour']

In [16]:
params = {'eta': 0.3,
          'tree_method': "hist",
          'grow_policy': "lossguide",
          'max_leaves': 1400,  
          'max_depth': 0, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':0,
          'alpha':4,
          'objective': 'binary:logistic', 
          'scale_pos_weight':9,
          'eval_metric': 'auc', 
          'nthread':8,
          'random_state': 99, 
          'silent': True}

In [17]:
dtrain = lgb.Dataset(X_train[predictors].values, label=y_train,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )
dvalid = lgb.Dataset(X_test.values, label=y_test,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )

In [18]:
xgb_model = xgb.train(params, 
                      dtrain, 200, 
                      watchlist = [(dtrain, 'train'), (dvalid, 'valid')], 
                      maximize=True, 
                      early_stopping_rounds = 30, 
                      verbose_eval=5)

NameError: name 'xgb' is not defined

## Prediction

In [23]:
# Predict on test dataset and write out submission file
test2 = test_ag.drop(['click_id','ip'],axis=1)

In [None]:
test2.head()

In [None]:
logreg.fit(X,y)

In [24]:
#y_submit = logreg.predict(test2)
#y_submit = rfc.predict(test2)
y_submit = lgb_model.predict(test2[predictors],num_iteration=lgb_model.best_iteration)

In [25]:
test_ag['is_attributed'] = y_submit
ans = test_ag[['click_id', 'is_attributed']]

In [26]:
ans.to_csv('submission.csv', index=None)

In [None]:
ans.shape

## Feature Importance

From LightGBM model

In [None]:
# Plot the feature importance from lgb
plot_importance(lgb_model)
plt.gcf().savefig('feature_importance_lgb.png')