In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import category_encoders as ce   # version 1.2.8
import numpy as np
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.metrics import *

In [34]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [35]:
import lightgbm as lgb
import tensorflow

# Read data

In [36]:
data = pd.read_csv('../data/carInsurance_train.csv')

In [37]:
from sklearn.model_selection import train_test_split

# Holding out 20% of the sample for test dataset

* Performing stratified sampling
* X_train, y_train - training dataset
* X_test, y_test - test dataset (hold-out)

In [38]:
X = data.drop('CarInsurance', axis=1)
target = data['CarInsurance']
X_train, X_test, y_train, y_test = train_test_split(X, target,  test_size=0.2, random_state=1)

# CallStart and CallEnd is converted to call duration

In [39]:
call_duration_arr = []
for index, row in X_train.iterrows():
    
    call_start = row['CallStart'] 
    call_end = row['CallEnd']
    call_start_hr = call_start.split(':')[0]
    call_start_min = call_start.split(':')[1]
    call_start_sec = call_start.split(':')[2]
    
    call_end_hr = call_end.split(':')[0]
    call_end_min = call_end.split(':')[1]
    call_end_sec = call_end.split(':')[2]
    
    call_start_dt = dt.datetime(2013,12,30,int(call_start_hr),int(call_start_min),int(call_start_sec))
    call_end_dt = dt.datetime(2013,12,30,int(call_end_hr),int(call_end_min),int(call_end_sec))

    call_duration = (call_end_dt-call_start_dt).total_seconds()
    call_duration_arr.append(call_duration)
    
X_train['call_duration'] = call_duration_arr

call_duration_arr = []
for index, row in X_test.iterrows():
    
    call_start = row['CallStart'] 
    call_end = row['CallEnd']
    call_start_hr = call_start.split(':')[0]
    call_start_min = call_start.split(':')[1]
    call_start_sec = call_start.split(':')[2]
    
    call_end_hr = call_end.split(':')[0]
    call_end_min = call_end.split(':')[1]
    call_end_sec = call_end.split(':')[2]
    
    call_start_dt = dt.datetime(2013,12,30,int(call_start_hr),int(call_start_min),int(call_start_sec))
    call_end_dt = dt.datetime(2013,12,30,int(call_end_hr),int(call_end_min),int(call_end_sec))

    call_duration = (call_end_dt-call_start_dt).total_seconds()
    call_duration_arr.append(call_duration)
    
X_test['call_duration'] = call_duration_arr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Removing unneccessary columns - ID, CallStart, CallEnd, LastContactDay, Default

In [40]:

if len(set(['Id', 'CallStart', 'CallEnd', 'LastContactDay']) - set(list(X_train.columns)))<1:
    X_train = X_train.drop(['Id', 'CallStart', 'CallEnd', 'LastContactDay', 'Default'], axis=1)
    
if len(set(['Id', 'CallStart', 'CallEnd', 'LastContactDay']) - set(list(X_test.columns)))<1:
    X_test = X_test.drop(['Id', 'CallStart', 'CallEnd', 'LastContactDay', 'Default'], axis=1)

In [41]:
X_train.isnull().sum()

Age                    0
Job                   16
Marital                0
Education            130
Balance                0
HHInsurance            0
CarLoan                0
Communication        724
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             2434
call_duration          0
dtype: int64

In [42]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


# Custom Transformer functions

* Imputation
* Log Transformation
* Normalization

In [43]:
class FillImputer(BaseEstimator, TransformerMixin):
    def __init__(self, fill_dict):
        super().__init__()
        self.fill_dict = fill_dict

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):
#         X[:] = (X.to_numpy() - self.means_) / self.std_

        X_copy = X.copy()
        X_copy = X_copy.fillna(self.fill_dict)

        return X_copy
    
class kNNImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.kneigh = KNeighborsClassifier(n_neighbors=3)
        self.ce_one_hot = ce.OneHotEncoder(cols = ['Job','Marital'])
        self.ce_one_hot_model = None

    def fit(self, X, y=None):
#         X = X.to_numpy()
#         self.means_ = X.mean(axis=0, keepdims=True)
#         self.std_ = X.std(axis=0, keepdims=True)
        imputation_features_non_missing = X[~X.Education.isnull()][['Job', 'Marital', 'Age']]
        education_non_missing = X[~X.Education.isnull()]['Education']
        
        self.ce_one_hot_model = self.ce_one_hot.fit(imputation_features_non_missing)
        imputation_features_non_missing_encoded = self.ce_one_hot_model.transform(imputation_features_non_missing)
        
        self.kneigh.fit(X=imputation_features_non_missing_encoded, y=education_non_missing)
    
        return self
    

    def transform(self, X, y=None):
#         X[:] = (X.to_numpy() - self.means_) / self.std_
    
#         non_missing = X[~X.Education.isnull()]
#         missing = X[X.Education.isnull()]

        X_copy = X.copy()
    
        missing_ind = X_copy[X_copy.Education.isnull()].index
        education = X_copy.Education.copy()
       
        imputation_features_missing = X_copy[X_copy.Education.isnull()][['Job', 'Marital', 'Age']]
        education_missing = X_copy[X_copy.Education.isnull()]['Education']
                
        imputation_features_missing_encoded = self.ce_one_hot_model.transform(imputation_features_missing)
        missing_values_pred = self.kneigh.predict(imputation_features_missing_encoded)

        counter = 0
        for index in missing_ind:
            education[index] = missing_values_pred[counter]
            counter+=1
        
        X_copy['Education'] = education

        return X_copy

In [44]:
from sklearn.preprocessing import MinMaxScaler as min_max
from sklearn.preprocessing import StandardScaler

class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        for col in self.columns:
            X_copy[col] = np.log(X_copy[col]+1)
        

        return X_copy

class MinMaxScalarMultiple(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns
        self.normalizer = None

    def fit(self, X, y=None):
        self.normalizer = min_max()
        self.normalizer.fit(X[self.columns])

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        normalized_vals = self.normalizer.transform(X[self.columns])
        
        for i in range(0, len(self.columns)):
            col = [row[i] for row in normalized_vals]
            X_copy[self.columns[i]] = col
        

        return X_copy
    
class StandardScalarMultiple(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns
        self.normalizer = None

    def fit(self, X, y=None):
        self.normalizer = StandardScaler()
        self.normalizer.fit(X[self.columns])

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        normalized_vals = self.normalizer.transform(X[self.columns])
        
        for i in range(0, len(self.columns)):
            col = [row[i] for row in normalized_vals]
            X_copy[self.columns[i]] = col
        

        return X_copy

In [45]:
def quarter_month(x):
    if x in ['jan', 'feb', 'mar', 'apr']:
        return '1'
    elif x in ['may', 'jun', 'jul', 'aug']:
        return '2'
    elif x in ['sep', 'oct', 'nov', 'dec']:
        return '3'
    
def job_categories(x):
    if x in ['blue-collar', 'entrepreneur', 'housemaid']:
        return 'job1'
    elif x in ['services', 'self-employed', 'admin.', 'technician', 'management', 'missing']:
        return 'job2'
    elif x in ['unemployed', 'retired', 'student']:
        return 'job3'
    else:
        return 'no_group'
    
def age_group(x):
    if x>=18 and x<=30:
        return '18-30'
    elif x>30 and x<=40:
        return '31-40'
    elif x>40 and x<=50:
        return '41-50'
    elif x>50 and x<=60:
        return '51-60'
    elif x>60:
        return '>60'
    
def duration_category(x):
    
    if (x>0) and (x<=10):
        return '0-10'
    elif (x>10) and (x<=20):
        return '10-20'
    elif (x>20) and (x<=30):
        return '20-30'
    elif (x>30):
        return '>30'
    
def no_of_contacts_category(x):
    
    
    if (x>0 and x<=3):
        return 'c1'
    elif (x>3 and x<=8):
        return 'c2'
    else:
        return 'c3'

# def duration_category(x):
    
#     if (x>0) and (x<=10):
#         return '0-10'
#     else: return '>10'
    

In [46]:
############### converted negative balance to zero
X_train['Balance'] = X_train.Balance.apply(lambda x : 0 if x<0 else x)

############### create a new variable for indicting whether a client is previously contacted - Boolean
X_train['IsPreviouslyContacted'] = X_train.DaysPassed.apply(lambda x : 0 if x==-1 else 1)

############### convert -ve values for days passed
X_train['DaysPassed'] = X_train.DaysPassed.apply(lambda x : 0 if x==-1 else x)

############### convert month to quadrimester
X_train['LastContactQuadrimester'] = X_train.LastContactMonth.apply(lambda x : quarter_month(x))

X_train['Job'] = X_train['Job'].fillna('missing')
X_train['JobCategory'] = X_train.Job.apply(lambda x : job_categories(x))
X_train['AgeCategory'] = X_train.Age.apply(lambda x : age_group(x))
X_train['CallDurationCategory'] = X_train.call_duration.apply(lambda x : duration_category(x/60))
X_train['NoOfContactsCategory'] = X_train.NoOfContacts.apply(lambda x : no_of_contacts_category(x))


In [47]:
X_train.NoOfContactsCategory.unique()

array(['c1', 'c2', 'c3'], dtype=object)

# Create a sklearn pipeline for feature engineering - imputation, encoding, scaling

In [48]:
log_transformer = LogTransformer(columns=['call_duration'])
standard_scaler = StandardScalarMultiple(columns=['Age', 'Balance', 'NoOfContacts', 'PrevAttempts', 'DaysPassed'])
fill_imputer = FillImputer(fill_dict={'Communication': 'missing', 'Job' : 'missing', 'Outcome' : 'not_contacted'})
knn_imputer = kNNImputer()
one_hot_encoder = ce.OrdinalEncoder(cols=['NoOfContactsCategory','CallDurationCategory','JobCategory', 'AgeCategory', 'Marital', 'Education', 'Outcome', 'Communication', 'LastContactQuadrimester'])


pipe = Pipeline([('log_transformer', log_transformer), 
                 ('standard_scaler', standard_scaler),
                 ('fill_imputer', fill_imputer),
                 ('knn_imputer', knn_imputer),
                 ('one_hot_encoder', one_hot_encoder)])

pipe.fit(X_train)
X_train_features = pipe.transform(X_train)
X_train_features = X_train_features.drop(['NoOfContacts', 'LastContactMonth', 'Job', 'Age', 'call_duration', 'Balance'], axis=1)

In [49]:
X_train_features[0:10]

Unnamed: 0,Marital,Education,HHInsurance,CarLoan,Communication,DaysPassed,PrevAttempts,Outcome,IsPreviouslyContacted,LastContactQuadrimester,JobCategory,AgeCategory,CallDurationCategory,NoOfContactsCategory
2996,1,1,0,0,1,-0.464205,-0.373862,1,0,1,1,1,1,1
3295,1,1,1,0,1,-0.464205,-0.373862,1,0,1,2,2,1,1
2284,2,2,1,0,2,-0.464205,-0.373862,1,0,1,1,3,1,1
2793,1,2,0,0,1,1.424987,2.276469,2,1,1,1,1,1,1
787,1,3,0,0,1,-0.464205,-0.373862,1,0,1,2,2,2,2
2046,1,3,0,0,1,-0.464205,-0.373862,1,0,2,3,4,2,1
739,3,1,1,0,2,-0.464205,-0.373862,1,0,1,1,1,1,2
2149,3,1,1,0,2,-0.464205,-0.373862,1,0,1,1,3,1,1
3369,2,2,0,0,1,-0.464205,-0.373862,1,0,1,1,1,1,3
3492,3,1,1,1,1,-0.464205,-0.373862,1,0,2,2,1,1,1


In [50]:
############### converted negative balance to zero
X_test['Balance'] = X_test.Balance.apply(lambda x : 0 if x<0 else x)

############### create a new variable for indicting whether a client is previously contacted - Boolean
X_test['IsPreviouslyContacted'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else 1)

############### convert -ve values for days passed
X_test['DaysPassed'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else x)

############### convert month to quadrimester
X_test['LastContactQuadrimester'] = X_test.LastContactMonth.apply(lambda x : quarter_month(x))

X_test['Job'] = X_test['Job'].fillna('missing')
X_test['JobCategory'] = X_test.Job.apply(lambda x : job_categories(x))
X_test['AgeCategory'] = X_test.Age.apply(lambda x : age_group(x))
X_test['CallDurationCategory'] = X_test.call_duration.apply(lambda x : duration_category(x/60))
X_test['NoOfContactsCategory'] = X_test.NoOfContacts.apply(lambda x : no_of_contacts_category(x))


X_test_features = pipe.transform(X_test)
X_test_features = X_test_features.drop(['NoOfContacts', 'LastContactMonth', 'Job', 'Age', 'call_duration', 'Balance'], axis=1)

In [51]:
X_test_features[0:5]

Unnamed: 0,Marital,Education,HHInsurance,CarLoan,Communication,DaysPassed,PrevAttempts,Outcome,IsPreviouslyContacted,LastContactQuadrimester,JobCategory,AgeCategory,CallDurationCategory,NoOfContactsCategory
200,1,3,0,0,1,-0.464205,-0.373862,1,0,1,2,2,3,2
1078,1,2,0,0,1,-0.464205,-0.373862,1,0,1,1,1,1,1
610,1,1,0,1,1,-0.464205,-0.373862,1,0,1,2,2,1,2
2159,2,3,1,0,1,-0.464205,-0.373862,1,0,1,2,3,2,1
1169,1,1,0,0,1,-0.464205,-0.373862,1,0,1,1,2,1,2


# Divide the training dataset into two 
* The reason to do this is for performing early stopping

In [52]:
X_train_1, X_val, y_train_1, y_val = train_test_split(X_train_features, y_train,  test_size=0.2, random_state=1)

In [None]:
# 

In [53]:
lg = lgb.LGBMClassifier()

In [54]:
X_train_1.columns

Index(['Marital', 'Education', 'HHInsurance', 'CarLoan', 'Communication', 'DaysPassed', 'PrevAttempts', 'Outcome', 'IsPreviouslyContacted', 'LastContactQuadrimester', 'JobCategory', 'AgeCategory', 'CallDurationCategory', 'NoOfContactsCategory'], dtype='object')

In [55]:
categorical_cols = ['Marital', 'Education', 'HHInsurance', 'CarLoan', 'Communication', 'Outcome', 'IsPreviouslyContacted', 'LastContactQuadrimester', 'JobCategory', 'AgeCategory', 'CallDurationCategory', 'NoOfContactsCategory']

# LightGBM hyperparameter tuning

In [80]:
n_estimators = [10, 20, 50, 100]
max_depth = [5, 10, 15, 20]
learning_rate = [0.1, 0.01, 0.001]

models = []
accuracy_scores = []
precision_scores = []
recall_scores = []


for n_estimator in n_estimators:
    for depth in max_depth:
        for lr in learning_rate:
#             lg = lgb.LGBMClassifier(n_estimators=50, learning_rate=0.1, max_depth=5)
            lg = lgb.LGBMClassifier(n_estimators=n_estimator, learning_rate=lr, max_depth=depth)

            lg.fit(X_train_1, y_train_1, eval_set=(X_val, y_val),  early_stopping_rounds=30,
                  feature_name=list(X_train_1.columns), categorical_feature=categorical_cols
                  )
        
            pred = lg.predict(X_val)

            models.append('n_estimator:'+str(n_estimator)+', max_depth:'+ str(depth)+ ', learning_rate:'+ str(lr))
            accuracy_scores.append(accuracy_score(y_val, pred))
            precision_scores.append(precision_score(y_val, pred))
            recall_scores.append(recall_score(y_val, pred))
df = pd.DataFrame({'models': models, 'accuracy_scores': accuracy_scores, 'precision_scores' : precision_scores,
                   'recall_scores' : recall_scores})

New categorical_feature is ['AgeCategory', 'CallDurationCategory', 'CarLoan', 'Communication', 'Education', 'HHInsurance', 'IsPreviouslyContacted', 'JobCategory', 'LastContactQuadrimester', 'Marital', 'NoOfContactsCategory', 'Outcome']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
  _warn_prf(average, modifier, msg_start, len(result))


[1]	valid_0's binary_logloss: 0.641765
Training until validation scores don't improve for 30 rounds
[2]	valid_0's binary_logloss: 0.614318
[3]	valid_0's binary_logloss: 0.59114
[4]	valid_0's binary_logloss: 0.569239
[5]	valid_0's binary_logloss: 0.553363
[6]	valid_0's binary_logloss: 0.540015
[7]	valid_0's binary_logloss: 0.52601
[8]	valid_0's binary_logloss: 0.515323
[9]	valid_0's binary_logloss: 0.506423
[10]	valid_0's binary_logloss: 0.497014
Did not meet early stopping. Best iteration is:
[10]	valid_0's binary_logloss: 0.497014
[1]	valid_0's binary_logloss: 0.673059
Training until validation scores don't improve for 30 rounds
[2]	valid_0's binary_logloss: 0.669464
[3]	valid_0's binary_logloss: 0.666007
[4]	valid_0's binary_logloss: 0.662561
[5]	valid_0's binary_logloss: 0.659189
[6]	valid_0's binary_logloss: 0.655888
[7]	valid_0's binary_logloss: 0.652677
[8]	valid_0's binary_logloss: 0.649514
[9]	valid_0's binary_logloss: 0.646416
[10]	valid_0's binary_logloss: 0.64347
Did not mee

[1]	valid_0's binary_logloss: 0.672694
Training until validation scores don't improve for 30 rounds
[2]	valid_0's binary_logloss: 0.668711
[3]	valid_0's binary_logloss: 0.664819
[4]	valid_0's binary_logloss: 0.66101
[5]	valid_0's binary_logloss: 0.657262
[6]	valid_0's binary_logloss: 0.653611
[7]	valid_0's binary_logloss: 0.650046
[8]	valid_0's binary_logloss: 0.646544
[9]	valid_0's binary_logloss: 0.643103
[10]	valid_0's binary_logloss: 0.639756
[11]	valid_0's binary_logloss: 0.636478
[12]	valid_0's binary_logloss: 0.633241
[13]	valid_0's binary_logloss: 0.630094
[14]	valid_0's binary_logloss: 0.626941
[15]	valid_0's binary_logloss: 0.623952
[16]	valid_0's binary_logloss: 0.620922
[17]	valid_0's binary_logloss: 0.618039
[18]	valid_0's binary_logloss: 0.615169
[19]	valid_0's binary_logloss: 0.612383
[20]	valid_0's binary_logloss: 0.609564
Did not meet early stopping. Best iteration is:
[20]	valid_0's binary_logloss: 0.609564
[1]	valid_0's binary_logloss: 0.676327
Training until validat

[42]	valid_0's binary_logloss: 0.435976
[43]	valid_0's binary_logloss: 0.436076
[44]	valid_0's binary_logloss: 0.436304
[45]	valid_0's binary_logloss: 0.436485
[46]	valid_0's binary_logloss: 0.436745
[47]	valid_0's binary_logloss: 0.437494
[48]	valid_0's binary_logloss: 0.43797
[49]	valid_0's binary_logloss: 0.438186
[50]	valid_0's binary_logloss: 0.438975
Did not meet early stopping. Best iteration is:
[36]	valid_0's binary_logloss: 0.432993
[1]	valid_0's binary_logloss: 0.672694
Training until validation scores don't improve for 30 rounds
[2]	valid_0's binary_logloss: 0.668711
[3]	valid_0's binary_logloss: 0.664819
[4]	valid_0's binary_logloss: 0.66101
[5]	valid_0's binary_logloss: 0.657262
[6]	valid_0's binary_logloss: 0.653611
[7]	valid_0's binary_logloss: 0.650046
[8]	valid_0's binary_logloss: 0.646544
[9]	valid_0's binary_logloss: 0.643103
[10]	valid_0's binary_logloss: 0.639756
[11]	valid_0's binary_logloss: 0.636478
[12]	valid_0's binary_logloss: 0.633241
[13]	valid_0's binary_

[45]	valid_0's binary_logloss: 0.435297
[46]	valid_0's binary_logloss: 0.435587
[47]	valid_0's binary_logloss: 0.435244
[48]	valid_0's binary_logloss: 0.435384
[49]	valid_0's binary_logloss: 0.436744
[50]	valid_0's binary_logloss: 0.437884
Did not meet early stopping. Best iteration is:
[40]	valid_0's binary_logloss: 0.432041
[1]	valid_0's binary_logloss: 0.672694
Training until validation scores don't improve for 30 rounds
[2]	valid_0's binary_logloss: 0.668711
[3]	valid_0's binary_logloss: 0.664819
[4]	valid_0's binary_logloss: 0.66101
[5]	valid_0's binary_logloss: 0.657262
[6]	valid_0's binary_logloss: 0.653611
[7]	valid_0's binary_logloss: 0.650046
[8]	valid_0's binary_logloss: 0.646544
[9]	valid_0's binary_logloss: 0.643103
[10]	valid_0's binary_logloss: 0.639756
[11]	valid_0's binary_logloss: 0.636478
[12]	valid_0's binary_logloss: 0.633241
[13]	valid_0's binary_logloss: 0.630094
[14]	valid_0's binary_logloss: 0.626941
[15]	valid_0's binary_logloss: 0.623952
[16]	valid_0's binary

[47]	valid_0's binary_logloss: 0.66025
[48]	valid_0's binary_logloss: 0.659917
[49]	valid_0's binary_logloss: 0.659584
[50]	valid_0's binary_logloss: 0.659253
[51]	valid_0's binary_logloss: 0.658922
[52]	valid_0's binary_logloss: 0.658591
[53]	valid_0's binary_logloss: 0.658268
[54]	valid_0's binary_logloss: 0.657938
[55]	valid_0's binary_logloss: 0.657609
[56]	valid_0's binary_logloss: 0.657281
[57]	valid_0's binary_logloss: 0.656956
[58]	valid_0's binary_logloss: 0.656629
[59]	valid_0's binary_logloss: 0.656308
[60]	valid_0's binary_logloss: 0.655982
[61]	valid_0's binary_logloss: 0.655658
[62]	valid_0's binary_logloss: 0.655336
[63]	valid_0's binary_logloss: 0.655012
[64]	valid_0's binary_logloss: 0.654689
[65]	valid_0's binary_logloss: 0.654374
[66]	valid_0's binary_logloss: 0.654053
[67]	valid_0's binary_logloss: 0.653732
[68]	valid_0's binary_logloss: 0.653412
[69]	valid_0's binary_logloss: 0.653095
[70]	valid_0's binary_logloss: 0.652776
[71]	valid_0's binary_logloss: 0.652458
[

[29]	valid_0's binary_logloss: 0.432486
[30]	valid_0's binary_logloss: 0.432463
[31]	valid_0's binary_logloss: 0.432445
[32]	valid_0's binary_logloss: 0.432881
[33]	valid_0's binary_logloss: 0.432312
[34]	valid_0's binary_logloss: 0.43281
[35]	valid_0's binary_logloss: 0.432428
[36]	valid_0's binary_logloss: 0.432914
[37]	valid_0's binary_logloss: 0.434254
[38]	valid_0's binary_logloss: 0.434636
[39]	valid_0's binary_logloss: 0.435055
[40]	valid_0's binary_logloss: 0.434906
[41]	valid_0's binary_logloss: 0.43481
[42]	valid_0's binary_logloss: 0.434887
[43]	valid_0's binary_logloss: 0.434708
[44]	valid_0's binary_logloss: 0.434658
[45]	valid_0's binary_logloss: 0.434826
[46]	valid_0's binary_logloss: 0.435571
[47]	valid_0's binary_logloss: 0.435835
[48]	valid_0's binary_logloss: 0.436218
[49]	valid_0's binary_logloss: 0.437054
[50]	valid_0's binary_logloss: 0.438377
[51]	valid_0's binary_logloss: 0.438522
[52]	valid_0's binary_logloss: 0.439968
[53]	valid_0's binary_logloss: 0.441093
[5

[1]	valid_0's binary_logloss: 0.638316
Training until validation scores don't improve for 30 rounds
[2]	valid_0's binary_logloss: 0.607155
[3]	valid_0's binary_logloss: 0.581259
[4]	valid_0's binary_logloss: 0.560227
[5]	valid_0's binary_logloss: 0.542171
[6]	valid_0's binary_logloss: 0.526344
[7]	valid_0's binary_logloss: 0.512922
[8]	valid_0's binary_logloss: 0.501301
[9]	valid_0's binary_logloss: 0.491598
[10]	valid_0's binary_logloss: 0.483855
[11]	valid_0's binary_logloss: 0.477039
[12]	valid_0's binary_logloss: 0.47068
[13]	valid_0's binary_logloss: 0.466148
[14]	valid_0's binary_logloss: 0.460796
[15]	valid_0's binary_logloss: 0.456374
[16]	valid_0's binary_logloss: 0.452257
[17]	valid_0's binary_logloss: 0.448786
[18]	valid_0's binary_logloss: 0.445455
[19]	valid_0's binary_logloss: 0.443559
[20]	valid_0's binary_logloss: 0.441826
[21]	valid_0's binary_logloss: 0.440331
[22]	valid_0's binary_logloss: 0.439189
[23]	valid_0's binary_logloss: 0.438014
[24]	valid_0's binary_logloss

In [81]:
df.sort_values(by='accuracy_scores', ascending=False)[0:5]

Unnamed: 0,models,accuracy_scores,precision_scores,recall_scores
24,"n_estimator:50, max_depth:5, learning_rate:0.1",0.803125,0.785714,0.71374
36,"n_estimator:100, max_depth:5, learning_rate:0.1",0.798438,0.775934,0.71374
37,"n_estimator:100, max_depth:5, learning_rate:0.01",0.798438,0.834171,0.633588
0,"n_estimator:10, max_depth:5, learning_rate:0.1",0.798438,0.834171,0.633588
33,"n_estimator:50, max_depth:20, learning_rate:0.1",0.796875,0.757812,0.740458


# LightGBM fit the model with the best hyperparameters

In [82]:
lg = lgb.LGBMClassifier(n_estimators=50, learning_rate=0.1, max_depth=5)

lg.fit(X_train_1, y_train_1, eval_set=(X_val, y_val),  early_stopping_rounds=30,
                  feature_name=list(X_train_1.columns), categorical_feature=categorical_cols
                  )

[1]	valid_0's binary_logloss: 0.641765
Training until validation scores don't improve for 30 rounds
[2]	valid_0's binary_logloss: 0.614318
[3]	valid_0's binary_logloss: 0.59114
[4]	valid_0's binary_logloss: 0.569239
[5]	valid_0's binary_logloss: 0.553363
[6]	valid_0's binary_logloss: 0.540015
[7]	valid_0's binary_logloss: 0.52601
[8]	valid_0's binary_logloss: 0.515323
[9]	valid_0's binary_logloss: 0.506423
[10]	valid_0's binary_logloss: 0.497014
[11]	valid_0's binary_logloss: 0.491813
[12]	valid_0's binary_logloss: 0.484724
[13]	valid_0's binary_logloss: 0.479273
[14]	valid_0's binary_logloss: 0.474465
[15]	valid_0's binary_logloss: 0.470144
[16]	valid_0's binary_logloss: 0.466178
[17]	valid_0's binary_logloss: 0.462757
[18]	valid_0's binary_logloss: 0.459603
[19]	valid_0's binary_logloss: 0.456982
[20]	valid_0's binary_logloss: 0.45442
[21]	valid_0's binary_logloss: 0.452387
[22]	valid_0's binary_logloss: 0.450566
[23]	valid_0's binary_logloss: 0.448373
[24]	valid_0's binary_logloss: 

New categorical_feature is ['AgeCategory', 'CallDurationCategory', 'CarLoan', 'Communication', 'Education', 'HHInsurance', 'IsPreviouslyContacted', 'JobCategory', 'LastContactQuadrimester', 'Marital', 'NoOfContactsCategory', 'Outcome']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


LGBMClassifier(max_depth=5, n_estimators=50)

# Evaluation on both training and testing dataset

In [83]:
lgb_pred_train = lg.predict(X_train_features)

In [84]:
accuracy_score(y_train, lgb_pred_train)

0.816875

In [85]:
precision_score(y_train, lgb_pred_train)

0.7981418918918919

In [86]:
recall_score(y_train, lgb_pred_train)

0.7314241486068112

In [87]:
lgb_pred = lg.predict(X_test_features)

In [88]:
accuracy_score(y_test, lgb_pred)

0.8125

In [89]:
precision_score(y_test, lgb_pred)

0.7682119205298014

In [90]:
recall_score(y_test, lgb_pred)

0.7435897435897436

In [66]:
from sklearn.metrics import *

prob = lg.predict_proba(X_test_features)
pos_prob = [x[1] for x in prob]
fpr, tpr, thresholds = roc_curve(y_test, pos_prob, pos_label=1.0)
auc(fpr, tpr)

0.8672531788566625