# A Lucky Tractor

***Xiaodong DENG*** (<a href='mailto:xd.deng.r@gmail.com'>xd.deng.r@gmail.com</a>)

This is one part of the 3rd-place solution (out of 93 teams) to the ***Asia Actuarial Analytics Challenge 2017***, hosted by the *Singapore Actuarial Society* (https://www.kaggle.com/c/asia-actuarial-analytics-challenge-2017).


## Environment Setup

In [1]:
from __future__ import division
import os
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, auc, roc_auc_score
import xgboost as xgb
import time


# Move the working directory up one level so the relative "data/..." paths
# used below resolve. Caution: re-running this cell climbs one more level.
os.chdir(os.pardir)

## Load Data

In [2]:
# Read the training set, then drop CustomerID since it's not "useful".
# np.setdiff1d returns the remaining column names sorted, so the resulting
# frame has its columns in alphabetical order.
train_csv_path = "data/SAS_Train_Data_v3.csv"
dat_train_raw = pd.read_csv(train_csv_path)
columns_to_keep = np.setdiff1d(dat_train_raw.columns, "CustomerID")
dat_train_raw = dat_train_raw[columns_to_keep]

In [3]:
# (rows, columns) of the training frame. Written as a call so the cell is
# valid on both Python 2 and Python 3, matching the other print(...) calls
# in this notebook; the printed text is unchanged.
print(dat_train_raw.shape)

(30000, 21)


In [4]:
# Inspect the column types; the "object" columns are the ones encoded in the feature-engineering step.
dat_train_raw.dtypes

Age                              int64
Contact_Method                  object
Contact_Month                   object
Contact_WeekDay                 object
Credit_Default                  object
Education_Level                 object
Emp_Var_Rate                   float64
Euribor3m                      float64
Housing_Loan                    object
Job                             object
Marital_Status                  object
Monthly_CCI                    float64
Monthly_CPI                    float64
NoContacts_Current_Campaign      int64
NoContacts_PrevCampaigns         int64
No_Employed                    float64
Outcome                          int64
Pdays                            int64
Personal_Loan                   object
Prev_Outcome                    object
Sex                             object
dtype: object

In [5]:
# Share of rows whose Outcome equals 1, as a percentage rounded to 2 decimals.
# Relies on true division (enabled by `from __future__ import division`).
n_positive = dat_train_raw.Outcome.value_counts()[1]
n_total = dat_train_raw.shape[0]
positive_pct = round(n_positive / n_total * 100, 2)
print("Proportion of Positive Cases:{}%".format(positive_pct))

Proportion of Positive Cases:11.95%


## Simple Feature Engineering

In [6]:
# Make the previous-campaign fields consistent with each other.
# Rows with no previous contacts get the Pdays value 999 and a
# 'nonexistent' previous outcome.
never_contacted = dat_train_raw.NoContacts_PrevCampaigns == 0
dat_train_raw.loc[never_contacted, 'Pdays'] = 999
dat_train_raw.loc[never_contacted, 'Prev_Outcome'] = 'nonexistent'

# Evaluated after the Pdays update above, so it also covers rows just set to 999.
pdays_is_999 = dat_train_raw.Pdays == 999
dat_train_raw.loc[pdays_is_999, 'Prev_Outcome'] = 'nonexistent'
    

def FUN_categorical_to_numerical(dat, feature_name, label_column_name):
    '''
    Build an ordinal encoding for one categorical feature of a binary-labelled data set.

    The classes of `feature_name` are ordered by the proportion of their rows
    whose label equals 1, and each class is mapped to its position in that
    ordering (0 = lowest proportion of positives). The feature can then be
    split as if it were an ordered predictor.

    Parameters
    ----------
    dat : pandas.DataFrame
        Data containing both `feature_name` and `label_column_name`.
    feature_name : str
        Name of the categorical column to encode.
    label_column_name : str
        Name of the label column; only rows labelled 0 or 1 are counted.

    Returns
    -------
    dict
        Mapping {class value: integer rank}. Empty when `dat` has no rows
        with a non-missing feature value.
    '''
    # Number of rows for every (class, label) pair that occurs in the data.
    pair_sizes = dat.groupby([feature_name, label_column_name]).size()
    if pair_sizes.empty:
        return {}

    # fill_value=0 matters: a class that never occurs with one of the labels
    # must count as 0 rather than NaN. With NaN its proportion would be NaN
    # and it would be sorted to the end regardless of its true positive rate.
    counts = pair_sizes.unstack(fill_value=0)
    # Guarantee both label columns exist even if one label is absent from `dat`.
    counts = counts.reindex(columns=[0, 1], fill_value=0)

    proportion_of_positive = counts[1] / (counts[0] + counts[1])
    # mergesort is stable, so classes with equal proportions keep a
    # reproducible relative order.
    ordered = proportion_of_positive.sort_values(kind="mergesort")

    return dict(zip(ordered.index, range(len(ordered))))


# List the categorical variables (columns whose dtype is "object").
variables_to_encode = dat_train_raw.columns[dat_train_raw.dtypes=="object"]

# Note: the splitting mapping should be derived based on Train data and to be used on both Train and Test data.
# Every fitted mapping is therefore kept in `categorical_mappings` (keyed by
# column name) instead of being overwritten on the next loop iteration, so the
# identical encoding can be replayed on Test data.
categorical_mappings = {}
for j in variables_to_encode:
    transform_mapping = FUN_categorical_to_numerical(dat_train_raw, j, "Outcome")
    categorical_mappings[j] = transform_mapping
    # Replace each class label by its rank; values missing from the mapping become NaN.
    dat_train_raw[j] = dat_train_raw[j].map(transform_mapping)

In [7]:
# Preview the first rows after the categorical columns were mapped to ranks.
dat_train_raw.head()

Unnamed: 0,Age,Contact_Method,Contact_Month,Contact_WeekDay,Credit_Default,Education_Level,Emp_Var_Rate,Euribor3m,Housing_Loan,Job,...,Monthly_CCI,Monthly_CPI,NoContacts_Current_Campaign,NoContacts_PrevCampaigns,No_Employed,Outcome,Pdays,Personal_Loan,Prev_Outcome,Sex
0,49,0,0,4,1,0,1.1,4.857,1,1,...,-36.4,93.994,2,0,5191.0,0,999,1,0,0
1,44,1,1,0,1,2,1.4,4.958,1,4,...,-42.7,93.918,4,0,5228.1,0,999,1,0,0
2,40,0,0,3,1,0,1.1,4.857,2,1,...,-36.4,93.994,1,0,5191.0,0,999,1,0,0
3,48,0,0,3,1,0,1.1,4.856,2,3,...,-36.4,93.994,1,0,5191.0,0,999,1,0,1
4,34,1,0,2,1,3,-1.8,1.291,1,1,...,-46.2,92.893,1,0,5099.1,0,999,1,0,1


In [8]:
# Check the frame's dimensions after the encoding step.
dat_train_raw.shape

(30000, 21)

## Tune the model

In [9]:
# Every column except the label is used as a feature (names come back sorted
# from np.setdiff1d). The features and the label are packed into one DMatrix.
feature_columns = np.setdiff1d(dat_train_raw.columns, "Outcome")
train_features = dat_train_raw[feature_columns]
train_labels = dat_train_raw.Outcome
temp_dtrain = xgb.DMatrix(train_features, label=train_labels)

In [10]:
# Cross-validation settings passed to every xgb.cv call in the tuning loop.
cv_config = {
    'nfold': 5,                      # number of CV folds
    "early_stopping_rounds": 300,    # forwarded as xgb.cv's early_stopping_rounds
    'num_boost_round': 10000,        # forwarded as xgb.cv's num_boost_round
}

In [11]:
# Starting hyperparameter set. The tuning loop below overwrites its entries
# one at a time with the best candidate found for each parameter.
final_param = {
    # Settings that are not part of the search.
    'objective': "binary:logistic",
    'eval_metric': 'auc',
    "booster": "gbtree",
    # Initial values for parameters that are tuned later.
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 1,
    'gamma': 0,
    'colsample_bylevel': 1,
}

In [12]:
# Candidate values tried for each tunable hyperparameter, keyed by parameter name.
candidates = {
    "eta": [0.1, 0.05, 0.01, 0.005],
    "max_depth": [5, 6, 7],
    "subsample": [0.6, 0.8, 1],
    "gamma": [0, 0.1, 0.2],
    "colsample_bylevel": [0.3, 0.8, 1],
    "scale_pos_weight": [1, 2, 3],
}

In [13]:
# Coordinate-wise hyperparameter search: in each round, every parameter is
# tuned in turn while the others stay at their current best values, and the
# candidate with the highest cross-validated test AUC is written into
# final_param. The whole sweep is repeated for two rounds.
time_tuning_start = time.time()

tuning_order = ['eta', 'max_depth', 'subsample', 'gamma', 'colsample_bylevel', 'scale_pos_weight']

for round_idx in range(2):

    print("\n===== Round {} =====".format(round_idx + 1))

    for param_name in tuning_order:

        # Best AUC seen so far among this parameter's candidates.
        best_auc = 0
        # Snapshot of the current best settings; only param_name is varied below.
        param = final_param.copy()

        print("\n=== Tuning {} ===".format(param_name))

        for candidate_value in candidates[param_name]:

            param[param_name] = candidate_value

            bst_cv = xgb.cv(
                param,
                dtrain=temp_dtrain,
                nfold=cv_config['nfold'],
                early_stopping_rounds=cv_config["early_stopping_rounds"],
                num_boost_round=cv_config['num_boost_round'],
            )
            candidate_auc = bst_cv['test-auc-mean'].max()
            print("Value: {}; test-auc-mean: {}; iteration: {}".format(candidate_value, candidate_auc, bst_cv.shape[0]))

            # ">=" means that on a tie the later candidate in the list wins.
            if candidate_auc >= best_auc:
                final_param[param_name] = candidate_value
                best_auc = candidate_auc

        print(final_param)

time_tuning_end = time.time()


===== Round 1 =====

=== Tuning eta ===
Value: 0.1; test-auc-mean: 0.8003388; iteration: 47
Value: 0.05; test-auc-mean: 0.8000456; iteration: 111
Value: 0.01; test-auc-mean: 0.7998802; iteration: 541
Value: 0.005; test-auc-mean: 0.7997848; iteration: 1073
{'subsample': 1, 'eta': 0.1, 'eval_metric': 'auc', 'colsample_bylevel': 1, 'objective': 'binary:logistic', 'max_depth': 6, 'gamma': 0, 'booster': 'gbtree'}

=== Tuning max_depth ===
Value: 5; test-auc-mean: 0.8008888; iteration: 80
Value: 6; test-auc-mean: 0.8003388; iteration: 47
Value: 7; test-auc-mean: 0.7986644; iteration: 52
{'subsample': 1, 'eta': 0.1, 'eval_metric': 'auc', 'colsample_bylevel': 1, 'objective': 'binary:logistic', 'max_depth': 5, 'gamma': 0, 'booster': 'gbtree'}

=== Tuning subsample ===
Value: 0.6; test-auc-mean: 0.799717; iteration: 53
Value: 0.8; test-auc-mean: 0.8009348; iteration: 64
Value: 1; test-auc-mean: 0.8008888; iteration: 80
{'subsample': 0.8, 'eta': 0.1, 'eval_metric': 'auc', 'colsample_bylevel': 1,

In [14]:
# Wall-clock duration of the tuning cell, truncated to whole seconds.
tuning_seconds = int(time_tuning_end - time_tuning_start)
print("Tuning Costs {} seconds".format(tuning_seconds))

Tuning Costs 1870 seconds
