In [112]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import category_encoders as ce   # version 1.2.8
import numpy as np
from sklearn.pipeline import Pipeline

In [113]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [114]:
import lightgbm as lgb
import tensorflow

# Read data

In [115]:
data = pd.read_csv('../data/carInsurance_train.csv')

In [116]:
from sklearn.model_selection import train_test_split

# Holding out 20% of the sample for test dataset

* Performing stratified sampling
* X_train, y_train - training dataset
* X_test, y_test - test dataset (hold-out)

In [117]:
X = data.drop('CarInsurance', axis=1)
target = data['CarInsurance']
X_train, X_test, y_train, y_test = train_test_split(X, target,  test_size=0.2, random_state=1)

# CallStart and CallEnd is converted to call duration

In [118]:
call_duration_arr = []
for index, row in X_train.iterrows():
    
    call_start = row['CallStart'] 
    call_end = row['CallEnd']
    call_start_hr = call_start.split(':')[0]
    call_start_min = call_start.split(':')[1]
    call_start_sec = call_start.split(':')[2]
    
    call_end_hr = call_end.split(':')[0]
    call_end_min = call_end.split(':')[1]
    call_end_sec = call_end.split(':')[2]
    
    call_start_dt = dt.datetime(2013,12,30,int(call_start_hr),int(call_start_min),int(call_start_sec))
    call_end_dt = dt.datetime(2013,12,30,int(call_end_hr),int(call_end_min),int(call_end_sec))

    call_duration = (call_end_dt-call_start_dt).total_seconds()
    call_duration_arr.append(call_duration)
    
X_train['call_duration'] = call_duration_arr

call_duration_arr = []
for index, row in X_test.iterrows():
    
    call_start = row['CallStart'] 
    call_end = row['CallEnd']
    call_start_hr = call_start.split(':')[0]
    call_start_min = call_start.split(':')[1]
    call_start_sec = call_start.split(':')[2]
    
    call_end_hr = call_end.split(':')[0]
    call_end_min = call_end.split(':')[1]
    call_end_sec = call_end.split(':')[2]
    
    call_start_dt = dt.datetime(2013,12,30,int(call_start_hr),int(call_start_min),int(call_start_sec))
    call_end_dt = dt.datetime(2013,12,30,int(call_end_hr),int(call_end_min),int(call_end_sec))

    call_duration = (call_end_dt-call_start_dt).total_seconds()
    call_duration_arr.append(call_duration)
    
X_test['call_duration'] = call_duration_arr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Removing unneccessary columns - ID, CallStart, CallEnd, LastContactDay, Default

In [119]:

if len(set(['Id', 'CallStart', 'CallEnd', 'LastContactDay']) - set(list(X_train.columns)))<1:
    X_train = X_train.drop(['Id', 'CallStart', 'CallEnd', 'LastContactDay', 'Default'], axis=1)
    
if len(set(['Id', 'CallStart', 'CallEnd', 'LastContactDay']) - set(list(X_test.columns)))<1:
    X_test = X_test.drop(['Id', 'CallStart', 'CallEnd', 'LastContactDay', 'Default'], axis=1)

In [120]:
X_train.isnull().sum()

Age                    0
Job                   16
Marital                0
Education            130
Balance                0
HHInsurance            0
CarLoan                0
Communication        724
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             2434
call_duration          0
dtype: int64

In [121]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


# Custom Transformer functions

* Imputation
* Log Transformation
* Normalization

In [122]:
class FillImputer(BaseEstimator, TransformerMixin):
    def __init__(self, fill_dict):
        super().__init__()
        self.fill_dict = fill_dict

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):
#         X[:] = (X.to_numpy() - self.means_) / self.std_

        X_copy = X.copy()
        X_copy = X_copy.fillna(self.fill_dict)

        return X_copy
    
class kNNImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.kneigh = KNeighborsClassifier(n_neighbors=3)
        self.ce_one_hot = ce.OneHotEncoder(cols = ['Job','Marital'])
        self.ce_one_hot_model = None

    def fit(self, X, y=None):
#         X = X.to_numpy()
#         self.means_ = X.mean(axis=0, keepdims=True)
#         self.std_ = X.std(axis=0, keepdims=True)
        imputation_features_non_missing = X[~X.Education.isnull()][['Job', 'Marital', 'Age']]
        education_non_missing = X[~X.Education.isnull()]['Education']
        
        self.ce_one_hot_model = self.ce_one_hot.fit(imputation_features_non_missing)
        imputation_features_non_missing_encoded = self.ce_one_hot_model.transform(imputation_features_non_missing)
        
        self.kneigh.fit(X=imputation_features_non_missing_encoded, y=education_non_missing)
    
        return self
    

    def transform(self, X, y=None):
#         X[:] = (X.to_numpy() - self.means_) / self.std_
    
#         non_missing = X[~X.Education.isnull()]
#         missing = X[X.Education.isnull()]

        X_copy = X.copy()
    
        missing_ind = X_copy[X_copy.Education.isnull()].index
        education = X_copy.Education.copy()
       
        imputation_features_missing = X_copy[X_copy.Education.isnull()][['Job', 'Marital', 'Age']]
        education_missing = X_copy[X_copy.Education.isnull()]['Education']
                
        imputation_features_missing_encoded = self.ce_one_hot_model.transform(imputation_features_missing)
        missing_values_pred = self.kneigh.predict(imputation_features_missing_encoded)

        counter = 0
        for index in missing_ind:
            education[index] = missing_values_pred[counter]
            counter+=1
        
        X_copy['Education'] = education

        return X_copy

In [123]:
from sklearn.preprocessing import MinMaxScaler as min_max
from sklearn.preprocessing import StandardScaler

class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        for col in self.columns:
            X_copy[col] = np.log(X_copy[col]+1)
        

        return X_copy

class MinMaxScalarMultiple(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns
        self.normalizer = None

    def fit(self, X, y=None):
        self.normalizer = min_max()
        self.normalizer.fit(X[self.columns])

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        normalized_vals = self.normalizer.transform(X[self.columns])
        
        for i in range(0, len(self.columns)):
            col = [row[i] for row in normalized_vals]
            X_copy[self.columns[i]] = col
        

        return X_copy
    
class StandardScalarMultiple(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns
        self.normalizer = None

    def fit(self, X, y=None):
        self.normalizer = StandardScaler()
        self.normalizer.fit(X[self.columns])

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        normalized_vals = self.normalizer.transform(X[self.columns])
        
        for i in range(0, len(self.columns)):
            col = [row[i] for row in normalized_vals]
            X_copy[self.columns[i]] = col
        

        return X_copy

In [124]:
def quarter_month(x):
    if x in ['jan', 'feb', 'mar', 'apr']:
        return '1'
    elif x in ['may', 'jun', 'jul', 'aug']:
        return '2'
    elif x in ['sep', 'oct', 'nov', 'dec']:
        return '3'

In [125]:
############### converted negative balance to zero
X_train['Balance'] = X_train.Balance.apply(lambda x : 0 if x<0 else x)

############### create a new variable for indicting whether a client is previously contacted - Boolean
X_train['IsPreviouslyContacted'] = X_train.DaysPassed.apply(lambda x : 0 if x==-1 else 1)

############### convert -ve values for days passed
X_train['DaysPassed'] = X_train.DaysPassed.apply(lambda x : 0 if x==-1 else x)

############### convert month to quadrimester
X_train['LastContactQuadrimester'] = X_train.LastContactMonth.apply(lambda x : quarter_month(x))


# Create a sklearn pipeline for feature engineering - imputation, encoding, scaling

In [126]:
log_transformer = LogTransformer(columns=['call_duration'])
standard_scaler = StandardScalarMultiple(columns=['Age', 'Balance', 'NoOfContacts', 'PrevAttempts', 'DaysPassed'])
fill_imputer = FillImputer(fill_dict={'Communication': 'missing', 'Job' : 'management', 'Outcome' : 'not_contacted'})
knn_imputer = kNNImputer()
one_hot_encoder = ce.OneHotEncoder(cols=['Job', 'Marital', 'Education', 'Outcome', 'Communication', 'LastContactQuadrimester'])


pipe = Pipeline([('log_transformer', log_transformer), 
                 ('standard_scaler', standard_scaler),
                 ('fill_imputer', fill_imputer),
                 ('knn_imputer', knn_imputer),
                 ('one_hot_encoder', one_hot_encoder)])

pipe.fit(X_train)
X_train_features = pipe.transform(X_train)
X_train_features = X_train_features.drop('LastContactMonth', axis=1)

In [127]:
X_train_features[0:10]

Unnamed: 0,Age,Job_1,Job_2,Job_3,Job_4,Job_5,Job_6,Job_7,Job_8,Job_9,Job_10,Job_11,Marital_1,Marital_2,Marital_3,Education_1,Education_2,Education_3,Balance,HHInsurance,CarLoan,Communication_1,Communication_2,Communication_3,NoOfContacts,DaysPassed,PrevAttempts,Outcome_1,Outcome_2,Outcome_3,Outcome_4,call_duration,IsPreviouslyContacted,LastContactQuadrimester_1,LastContactQuadrimester_2,LastContactQuadrimester_3
2996,-0.367912,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,-0.423515,0,0,1,0,0,-0.196231,-0.464205,-0.373862,1,0,0,0,5.09375,0,1,0,0
3295,1.183915,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0.431397,1,0,1,0,0,-0.521084,-0.464205,-0.373862,1,0,0,0,5.746203,0,1,0,0
2284,-0.023062,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,-0.347487,1,0,0,1,0,-0.521084,-0.464205,-0.373862,1,0,0,0,4.330733,0,1,0,0
2793,-0.367912,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0.55394,0,0,1,0,0,-0.196231,1.424987,2.276469,0,1,0,0,5.676754,1,1,0,0
787,1.528765,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,-0.484145,0,0,1,0,0,1.103179,-0.464205,-0.373862,1,0,0,0,7.050123,0,1,0,0
2046,2.477104,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0.729093,0,0,1,0,0,-0.521084,-0.464205,-0.373862,1,0,0,0,6.428105,0,0,1,0
739,-0.454125,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,-0.484145,1,0,0,1,0,1.752885,-0.464205,-0.373862,1,0,0,0,4.454347,0,1,0,0
2149,0.149363,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,-0.367056,1,0,0,1,0,-0.196231,-0.464205,-0.373862,1,0,0,0,6.008813,0,1,0,0
3369,-0.62655,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0.032973,0,0,1,0,0,6.300822,-0.464205,-0.373862,1,0,0,0,4.691348,0,1,0,0
3492,-0.454125,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,-0.383737,1,1,1,0,0,-0.196231,-0.464205,-0.373862,1,0,0,0,5.57973,0,0,1,0


# Apply pipeline on test dataset

In [128]:
X_test['Balance'] = X_test.Balance.apply(lambda x : 0 if x<0 else x)
X_test['IsPreviouslyContacted'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else 1)
X_test['DaysPassed'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else x)
X_test['LastContactQuadrimester'] = X_test.LastContactMonth.apply(lambda x : quarter_month(x))

# X_test_1 = log_transformer.transform(X_test)
# X_test_2 = standard_scaler.transform(X_test_1)
# X_test_3 = fill_imputer.transform(X_test_2)
# X_test_4 = knn_imputer.transform(X_test_3)
# X_test_5 = one_hot_encoder.transform(X_test_4)

X_test_features = pipe.transform(X_test)
X_test_features = X_test_features.drop('LastContactMonth', axis=1)

In [129]:
X_test_features[0:5]

Unnamed: 0,Age,Job_1,Job_2,Job_3,Job_4,Job_5,Job_6,Job_7,Job_8,Job_9,Job_10,Job_11,Marital_1,Marital_2,Marital_3,Education_1,Education_2,Education_3,Balance,HHInsurance,CarLoan,Communication_1,Communication_2,Communication_3,NoOfContacts,DaysPassed,PrevAttempts,Outcome_1,Outcome_2,Outcome_3,Outcome_4,call_duration,IsPreviouslyContacted,LastContactQuadrimester_1,LastContactQuadrimester_2,LastContactQuadrimester_3
200,1.270127,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,2.134486,0,0,1,0,0,0.453474,-0.464205,-0.373862,1,0,0,0,7.116394,0,1,0,0
1078,-0.798975,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,-0.416779,0,0,1,0,0,0.128621,-0.464205,-0.373862,1,0,0,0,4.804021,0,1,0,0
610,1.528765,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,-0.136085,0,1,1,0,0,0.453474,-0.464205,-0.373862,1,0,0,0,4.624973,0,1,0,0
2159,0.149363,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,-0.227511,1,0,1,0,0,-0.521084,-0.464205,-0.373862,1,0,0,0,6.805723,0,1,0,0
1169,1.270127,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,-0.133519,0,0,1,0,0,0.453474,-0.464205,-0.373862,1,0,0,0,4.49981,0,1,0,0


# Fit the logistic regression model on train dataset and predict on test dataset

In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [131]:
lg = LogisticRegression(max_iter=500)
lg.fit(X_train_features, y_train)

LogisticRegression(max_iter=500)

In [132]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'C': [0.001, 0.01, 0.1, 1]
    }
search = GridSearchCV(lg, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=8).fit(X_train_features, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  10 out of  12 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  12 out of  12 | elapsed:    0.2s finished


In [133]:
lg = search.best_estimator_
lg

LogisticRegression(C=0.1, max_iter=500)

In [134]:
lg_pred_train = lg.predict(X_train_features)

In [135]:
accuracy_score(y_train, lg_pred_train)

0.815625

In [136]:
precision_score(y_train, lg_pred_train)

0.7790143084260731

In [137]:
recall_score(y_train, lg_pred_train)

0.7585139318885449

In [138]:
lg_pred = lg.predict(X_test_features)

In [139]:
accuracy_score(y_test, lg_pred)

0.805

In [140]:
precision_score(y_test, lg_pred)

0.7516129032258064

In [141]:
recall_score(y_test, lg_pred)

0.7467948717948718

# Fit the random forest model on train dataset and predict on test dataset

In [142]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [143]:
rf = RandomForestClassifier()

In [144]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'n_estimators': [5, 10, 20, 30, 50],
   'max_depth': [5, 10, 15, 20],
   'min_samples_split': [20, 30, 50, 70, 100]
    }
search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=8).fit(X_train_features, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 285 out of 300 | elapsed:    4.2s remaining:    0.2s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    4.4s finished


In [145]:
rf = search.best_estimator_
rf

RandomForestClassifier(max_depth=15, min_samples_split=30, n_estimators=30)

In [146]:
rf = RandomForestClassifier(n_estimators=50, max_depth=5)
rf.fit(X_train_features, y_train)

RandomForestClassifier(max_depth=5, n_estimators=50)

In [147]:
rf.feature_importances_

array([3.94138347e-02, 6.05576056e-04, 3.61108412e-03, 1.74030896e-03,
       1.43385947e-03, 4.46844173e-03, 3.40154121e-04, 1.32204383e-03,
       1.69021408e-03, 1.11182607e-03, 2.65139195e-03, 1.40331971e-03,
       9.62613468e-03, 6.06099624e-03, 1.56702537e-03, 2.69940748e-03,
       1.99369043e-03, 1.83488486e-03, 1.73248221e-02, 3.31264527e-02,
       6.16214042e-03, 3.36983540e-02, 7.06661384e-02, 1.91403779e-03,
       8.61889551e-03, 3.45316731e-02, 2.16242080e-02, 1.74208393e-02,
       2.53356502e-03, 1.17819287e-01, 8.47279290e-03, 4.84037117e-01,
       2.49615747e-02, 2.16039522e-02, 8.23565779e-03, 3.67429881e-03])

In [148]:
feature_imp_arr = pd.DataFrame({'columns': X_train_features.columns, 'feature_importances': rf.feature_importances_})

In [149]:
# feature_imp_arr.sort_values(by='feature_importances', ascending=False)

In [150]:
rf_pred_train = rf.predict(X_train_features)

In [151]:
accuracy_score(y_train, rf_pred_train)

0.8325

In [152]:
precision_score(y_train, rf_pred_train)

0.8253012048192772

In [153]:
recall_score(y_train, rf_pred_train)

0.7422600619195047

In [154]:
rf_pred = rf.predict(X_test_features)

In [155]:
accuracy_score(y_test, rf_pred)

0.805

In [156]:
precision_score(y_test, rf_pred)

0.7805755395683454

In [157]:
recall_score(y_test, rf_pred)

0.6955128205128205

# Fit the gradient boosting model on train dataset and predict on test dataset

In [158]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

In [159]:
gd = GradientBoostingClassifier()

In [160]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'n_estimators': [5, 10, 20, 30, 50],
   'max_depth': [5, 10, 15, 20],
   'min_samples_split': [20, 30, 50, 70, 100]
    }
search = GridSearchCV(gd, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=8).fit(X_train_features, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 256 tasks      | elapsed:   15.4s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:   19.1s finished


In [161]:
gb = search.best_estimator_
gb

GradientBoostingClassifier(max_depth=5, min_samples_split=20, n_estimators=50)

In [162]:
gb_pred_train = gb.predict(X_train_features)

In [163]:
accuracy_score(y_train, gb_pred_train)

0.87625

In [164]:
precision_score(y_train, gb_pred_train)

0.8318518518518518

In [165]:
recall_score(y_train, gb_pred_train)

0.8691950464396285

In [166]:
gb_pred = gb.predict(X_test_features)

In [167]:
accuracy_score(y_test, gb_pred)

0.8275

In [168]:
precision_score(y_test, gb_pred)

0.7636363636363637

In [169]:
recall_score(y_test, gb_pred)

0.8076923076923077

In [170]:
X_test_copy = X_test.copy()
X_test_copy['target'] = y_test
X_test_copy[X_test_copy.Job=='student']

Unnamed: 0,Age,Job,Marital,Education,Balance,HHInsurance,CarLoan,Communication,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,call_duration,IsPreviouslyContacted,LastContactQuadrimester,target
3195,19,student,single,secondary,329,0,0,cellular,apr,1,190,2,success,169.0,1,1,1
2725,35,student,single,secondary,188,1,0,cellular,feb,3,200,5,failure,75.0,1,1,1
1432,30,student,single,tertiary,7513,0,0,cellular,aug,2,0,0,,269.0,0,2,0
1508,27,student,single,tertiary,311,0,0,cellular,jul,1,148,4,success,462.0,1,2,1
1690,18,student,single,,108,0,0,cellular,feb,1,183,1,success,92.0,1,1,1
2681,27,student,single,secondary,671,1,0,,may,2,0,0,,332.0,0,2,0
2313,27,student,single,,910,1,1,,may,1,0,0,,77.0,0,2,0
3707,39,student,single,tertiary,2103,0,0,cellular,feb,1,0,0,,588.0,0,1,1
796,33,student,single,,1170,0,0,cellular,apr,1,0,0,,352.0,0,1,1
2350,24,student,single,secondary,23878,0,0,cellular,feb,1,0,0,,185.0,0,1,1
