In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import category_encoders as ce   # version 1.2.8
import numpy as np
from sklearn.pipeline import Pipeline

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
import lightgbm as lgb
import tensorflow

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Read data

In [4]:
data = pd.read_csv('../data/carInsurance_train.csv')

In [5]:
from sklearn.model_selection import train_test_split

# Holding out 20% of the sample for test dataset

* Performing stratified sampling
* X_train, y_train - training dataset
* X_test, y_test - test dataset (hold-out)

In [6]:
X = data.drop('CarInsurance', axis=1)
target = data['CarInsurance']
X_train, X_test, y_train, y_test = train_test_split(X, target,  test_size=0.2, random_state=1)

# CallStart and CallEnd is converted to call duration

In [7]:
call_duration_arr = []
for index, row in X_train.iterrows():
    
    call_start = row['CallStart'] 
    call_end = row['CallEnd']
    call_start_hr = call_start.split(':')[0]
    call_start_min = call_start.split(':')[1]
    call_start_sec = call_start.split(':')[2]
    
    call_end_hr = call_end.split(':')[0]
    call_end_min = call_end.split(':')[1]
    call_end_sec = call_end.split(':')[2]
    
    call_start_dt = dt.datetime(2013,12,30,int(call_start_hr),int(call_start_min),int(call_start_sec))
    call_end_dt = dt.datetime(2013,12,30,int(call_end_hr),int(call_end_min),int(call_end_sec))

    call_duration = (call_end_dt-call_start_dt).total_seconds()
    call_duration_arr.append(call_duration)
    
X_train['call_duration'] = call_duration_arr

call_duration_arr = []
for index, row in X_test.iterrows():
    
    call_start = row['CallStart'] 
    call_end = row['CallEnd']
    call_start_hr = call_start.split(':')[0]
    call_start_min = call_start.split(':')[1]
    call_start_sec = call_start.split(':')[2]
    
    call_end_hr = call_end.split(':')[0]
    call_end_min = call_end.split(':')[1]
    call_end_sec = call_end.split(':')[2]
    
    call_start_dt = dt.datetime(2013,12,30,int(call_start_hr),int(call_start_min),int(call_start_sec))
    call_end_dt = dt.datetime(2013,12,30,int(call_end_hr),int(call_end_min),int(call_end_sec))

    call_duration = (call_end_dt-call_start_dt).total_seconds()
    call_duration_arr.append(call_duration)
    
X_test['call_duration'] = call_duration_arr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Removing unneccessary columns - ID, CallStart, CallEnd, LastContactDay, Default

In [8]:

if len(set(['Id', 'CallStart', 'CallEnd', 'LastContactDay']) - set(list(X_train.columns)))<1:
    X_train = X_train.drop(['Id', 'CallStart', 'CallEnd', 'LastContactDay', 'Default'], axis=1)
    
if len(set(['Id', 'CallStart', 'CallEnd', 'LastContactDay']) - set(list(X_test.columns)))<1:
    X_test = X_test.drop(['Id', 'CallStart', 'CallEnd', 'LastContactDay', 'Default'], axis=1)

In [9]:
X_train.isnull().sum()

Age                    0
Job                   16
Marital                0
Education            130
Balance                0
HHInsurance            0
CarLoan                0
Communication        724
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             2434
call_duration          0
dtype: int64

In [10]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


# Custom Transformer functions

* Imputation
* Log Transformation
* Normalization

In [11]:
class FillImputer(BaseEstimator, TransformerMixin):
    def __init__(self, fill_dict):
        super().__init__()
        self.fill_dict = fill_dict

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):
#         X[:] = (X.to_numpy() - self.means_) / self.std_

        X_copy = X.copy()
        X_copy = X_copy.fillna(self.fill_dict)

        return X_copy
    
class kNNImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.kneigh = KNeighborsClassifier(n_neighbors=3)
        self.ce_one_hot = ce.OneHotEncoder(cols = ['Job','Marital'])
        self.ce_one_hot_model = None

    def fit(self, X, y=None):
#         X = X.to_numpy()
#         self.means_ = X.mean(axis=0, keepdims=True)
#         self.std_ = X.std(axis=0, keepdims=True)
        imputation_features_non_missing = X[~X.Education.isnull()][['Job', 'Marital', 'Age']]
        education_non_missing = X[~X.Education.isnull()]['Education']
        
        self.ce_one_hot_model = self.ce_one_hot.fit(imputation_features_non_missing)
        imputation_features_non_missing_encoded = self.ce_one_hot_model.transform(imputation_features_non_missing)
        
        self.kneigh.fit(X=imputation_features_non_missing_encoded, y=education_non_missing)
    
        return self
    

    def transform(self, X, y=None):
#         X[:] = (X.to_numpy() - self.means_) / self.std_
    
#         non_missing = X[~X.Education.isnull()]
#         missing = X[X.Education.isnull()]

        X_copy = X.copy()
    
        missing_ind = X_copy[X_copy.Education.isnull()].index
        education = X_copy.Education.copy()
       
        imputation_features_missing = X_copy[X_copy.Education.isnull()][['Job', 'Marital', 'Age']]
        education_missing = X_copy[X_copy.Education.isnull()]['Education']
                
        imputation_features_missing_encoded = self.ce_one_hot_model.transform(imputation_features_missing)
        missing_values_pred = self.kneigh.predict(imputation_features_missing_encoded)

        counter = 0
        for index in missing_ind:
            education[index] = missing_values_pred[counter]
            counter+=1
        
        X_copy['Education'] = education

        return X_copy

In [12]:
from sklearn.preprocessing import MinMaxScaler as min_max
from sklearn.preprocessing import StandardScaler

class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        for col in self.columns:
            X_copy[col] = np.log(X_copy[col]+1)
        

        return X_copy

class MinMaxScalarMultiple(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns
        self.normalizer = None

    def fit(self, X, y=None):
        self.normalizer = min_max()
        self.normalizer.fit(X[self.columns])

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        normalized_vals = self.normalizer.transform(X[self.columns])
        
        for i in range(0, len(self.columns)):
            col = [row[i] for row in normalized_vals]
            X_copy[self.columns[i]] = col
        

        return X_copy
    
class StandardScalarMultiple(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns
        self.normalizer = None

    def fit(self, X, y=None):
        self.normalizer = StandardScaler()
        self.normalizer.fit(X[self.columns])

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        normalized_vals = self.normalizer.transform(X[self.columns])
        
        for i in range(0, len(self.columns)):
            col = [row[i] for row in normalized_vals]
            X_copy[self.columns[i]] = col
        

        return X_copy

In [13]:
def quarter_month(x):
    if x in ['jan', 'feb', 'mar', 'apr']:
        return '1'
    elif x in ['may', 'jun', 'jul', 'aug']:
        return '2'
    elif x in ['sep', 'oct', 'nov', 'dec']:
        return '3'
    
def job_categories(x):
    if x in ['blue-collar', 'entrepreneur', 'housemaid']:
        return 'job1'
    elif x in ['services', 'self-employed', 'admin.', 'technician', 'management', 'missing']:
        return 'job2'
    elif x in ['unemployed', 'retired', 'student']:
        return 'job3'
    else:
        return 'no_group'
    
def age_group(x):
    if x>=18 and x<=30:
        return '18-30'
    elif x>30 and x<=40:
        return '31-40'
    elif x>40 and x<=50:
        return '41-50'
    elif x>50 and x<=60:
        return '51-60'
    elif x>60:
        return '>60'
    
def duration_category(x):
    
    if (x>0) and (x<=10):
        return '0-10'
    elif (x>10) and (x<=20):
        return '10-20'
    elif (x>20) and (x<=30):
        return '20-30'
    elif (x>30):
        return '>30'
    
def no_of_contacts_category(x):
    
    
    if (x>0 and x<=3):
        return 'c1'
    elif (x>3 and x<=8):
        return 'c2'
    else:
        return 'c3'

# def duration_category(x):
    
#     if (x>0) and (x<=10):
#         return '0-10'
#     else: return '>10'
    

In [14]:
############### converted negative balance to zero
X_train['Balance'] = X_train.Balance.apply(lambda x : 0 if x<0 else x)

############### create a new variable for indicting whether a client is previously contacted - Boolean
X_train['IsPreviouslyContacted'] = X_train.DaysPassed.apply(lambda x : 0 if x==-1 else 1)

############### convert -ve values for days passed
X_train['DaysPassed'] = X_train.DaysPassed.apply(lambda x : 0 if x==-1 else x)

############### convert month to quadrimester
X_train['LastContactQuadrimester'] = X_train.LastContactMonth.apply(lambda x : quarter_month(x))

X_train['Job'] = X_train['Job'].fillna('missing')
X_train['JobCategory'] = X_train.Job.apply(lambda x : job_categories(x))
X_train['AgeCategory'] = X_train.Age.apply(lambda x : age_group(x))
X_train['CallDurationCategory'] = X_train.call_duration.apply(lambda x : duration_category(x/60))
X_train['NoOfContactsCategory'] = X_train.NoOfContacts.apply(lambda x : no_of_contacts_category(x))


In [15]:
X_train.NoOfContactsCategory.unique()

array(['c1', 'c2', 'c3'], dtype=object)

# Create a sklearn pipeline for feature engineering - imputation, encoding, scaling

In [16]:
log_transformer = LogTransformer(columns=['call_duration'])
standard_scaler = StandardScalarMultiple(columns=['Age', 'Balance', 'NoOfContacts', 'PrevAttempts', 'DaysPassed'])
fill_imputer = FillImputer(fill_dict={'Communication': 'missing', 'Job' : 'missing', 'Outcome' : 'not_contacted'})
knn_imputer = kNNImputer()
one_hot_encoder = ce.OneHotEncoder(cols=['NoOfContactsCategory','CallDurationCategory','JobCategory', 'AgeCategory', 'Marital', 'Education', 'Outcome', 'Communication', 'LastContactQuadrimester'])


pipe = Pipeline([('log_transformer', log_transformer), 
                 ('standard_scaler', standard_scaler),
                 ('fill_imputer', fill_imputer),
                 ('knn_imputer', knn_imputer),
                 ('one_hot_encoder', one_hot_encoder)])

pipe.fit(X_train)
X_train_features = pipe.transform(X_train)
X_train_features = X_train_features.drop(['NoOfContacts', 'LastContactMonth', 'Job', 'Age', 'call_duration', 'Balance'], axis=1)

In [17]:
X_train_features[0:10]

Unnamed: 0,Marital_1,Marital_2,Marital_3,Education_1,Education_2,Education_3,HHInsurance,CarLoan,Communication_1,Communication_2,Communication_3,DaysPassed,PrevAttempts,Outcome_1,Outcome_2,Outcome_3,Outcome_4,IsPreviouslyContacted,LastContactQuadrimester_1,LastContactQuadrimester_2,LastContactQuadrimester_3,JobCategory_1,JobCategory_2,JobCategory_3,AgeCategory_1,AgeCategory_2,AgeCategory_3,AgeCategory_4,AgeCategory_5,CallDurationCategory_1,CallDurationCategory_2,CallDurationCategory_3,CallDurationCategory_4,NoOfContactsCategory_1,NoOfContactsCategory_2,NoOfContactsCategory_3
2996,1,0,0,1,0,0,0,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0
3295,1,0,0,1,0,0,1,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0
2284,0,1,0,0,1,0,1,0,0,1,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
2793,1,0,0,0,1,0,0,0,1,0,0,1.424987,2.276469,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0
787,1,0,0,0,0,1,0,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0
2046,1,0,0,0,0,1,0,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0
739,0,0,1,1,0,0,1,0,0,1,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0
2149,0,0,1,1,0,0,1,0,0,1,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
3369,0,1,0,0,1,0,0,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1
3492,0,0,1,1,0,0,1,1,1,0,0,-0.464205,-0.373862,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0


# Create a logistic regression model and performing cross validation using training dataset

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold

In [19]:
lg = LogisticRegression(max_iter=500)

In [25]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'C': [0.001, 0.01, 0.1, 1]
    }
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)
search = GridSearchCV(lg, param_grid, cv=kfold, scoring='accuracy', verbose=1).fit(X_train_features, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.6s finished


In [26]:
search.best_estimator_

LogisticRegression(C=1, max_iter=500)

In [28]:
search.cv_results_

{'mean_fit_time': array([0.12554979, 0.01365956, 0.02231868, 0.03621626]),
 'std_fit_time': array([0.1604686 , 0.0005213 , 0.00064338, 0.00281851]),
 'mean_score_time': array([0.00296871, 0.00243076, 0.00228842, 0.00242201]),
 'std_score_time': array([5.53898181e-04, 3.04020825e-05, 5.71652429e-06, 4.15634699e-05]),
 'param_C': masked_array(data=[0.001, 0.01, 0.1, 1],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.001}, {'C': 0.01}, {'C': 0.1}, {'C': 1}],
 'split0_test_score': array([0.66260544, 0.78163074, 0.79943768, 0.80131209]),
 'split1_test_score': array([0.65229616, 0.76101218, 0.78537957, 0.7900656 ]),
 'split2_test_score': array([0.63977486, 0.74765478, 0.77579737, 0.77861163]),
 'mean_test_score': array([0.65155882, 0.76343257, 0.78687154, 0.78999644]),
 'std_test_score': array([0.00933512, 0.01397581, 0.0097086 , 0.00926755]),
 'rank_test_score': array([4, 3, 2, 1], dtype=int32)}

In [60]:
cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='accuracy')

array([0.80131209, 0.7900656 , 0.77861163])

In [63]:
np.mean(cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='accuracy'))

0.7899964422468823

In [64]:
np.std(cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='accuracy'))

0.009267552087062812

In [61]:
cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='precision')

array([0.82686567, 0.7965616 , 0.77094972])

In [65]:
np.mean(cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='precision'))

0.7981256656322365

In [67]:
np.std(cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='precision'))

0.022854366591962588

In [62]:
cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='recall')

array([0.64269142, 0.6450116 , 0.64186047])

In [68]:
np.mean(cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='recall'))

0.6431878271191928

In [69]:
np.std(cross_val_score(LogisticRegression(C=1, max_iter=500), X_train_features, y_train, cv=kfold, scoring='recall'))

0.0013334748433879

# Create a random forest model and performing cross validation using training dataset

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [30]:
rf = RandomForestClassifier()

In [31]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'n_estimators': [5, 10, 20, 30, 50],
   'max_depth': [5, 10, 15, 20],
   'min_samples_split': [20, 30, 50, 70, 100]
    }
search = GridSearchCV(rf, param_grid, cv=kfold, scoring='accuracy', verbose=1, n_jobs=-1).fit(X_train_features, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 285 out of 300 | elapsed:    4.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.1s finished


In [32]:
search.best_estimator_

RandomForestClassifier(max_depth=15, min_samples_split=20, n_estimators=50)

In [34]:
search.cv_results_

{'mean_fit_time': array([0.16650565, 0.08850074, 0.08676346, 0.11090302, 0.16735848,
        0.02290948, 0.03607376, 0.0682656 , 0.09455522, 0.15596644,
        0.0215021 , 0.03594033, 0.06527376, 0.0946389 , 0.17840552,
        0.02253111, 0.03564755, 0.06888207, 0.10276063, 0.16561484,
        0.02933486, 0.03710055, 0.06604894, 0.09430957, 0.15201521,
        0.02286617, 0.04375688, 0.07468748, 0.10799432, 0.17381279,
        0.02251554, 0.03958734, 0.07230377, 0.10559956, 0.16882173,
        0.02172375, 0.03782837, 0.06917842, 0.10086354, 0.16517862,
        0.02178176, 0.03790339, 0.06849575, 0.09832899, 0.16105851,
        0.02302694, 0.03631973, 0.066751  , 0.09790389, 0.1585168 ,
        0.02297457, 0.04007292, 0.07441608, 0.11047459, 0.180094  ,
        0.02287579, 0.03883457, 0.07351557, 0.1075712 , 0.17293421,
        0.02215632, 0.03851088, 0.07072147, 0.10251943, 0.16765475,
        0.02188897, 0.03728787, 0.06904475, 0.10008144, 0.16391587,
        0.02124357, 0.0362529 ,

In [37]:
search.best_score_

0.8065590431695536

In [54]:
rf = RandomForestClassifier(max_depth=15, min_samples_split=20, n_estimators=50)

In [55]:
cross_val_score(search.best_estimator_, X_train_features, y_train, cv=kfold, scoring='accuracy')

array([0.80787254, 0.79850047, 0.79174484])

In [70]:
print(np.mean(cross_val_score(search.best_estimator_, X_train_features, y_train, cv=kfold, scoring='accuracy')))

0.7993726163200642


In [71]:
print(np.std(cross_val_score(search.best_estimator_, X_train_features, y_train, cv=kfold, scoring='accuracy')))

0.00661292433827033


In [78]:
score = cross_val_score(search.best_estimator_, X_train_features, y_train, cv=kfold, scoring='precision')

In [79]:
print(np.mean(score))

0.7921855921855921


In [80]:
print(np.std(score))

0.018593928406149166


In [84]:
score = cross_val_score(search.best_estimator_, X_train_features, y_train, cv=kfold, scoring='recall')

In [85]:
print(np.mean(score))

0.683445745427076


In [86]:
print(np.std(score))

0.010155853716560201


# Create a gradient boosting model and performing cross validation using training dataset

In [45]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

In [46]:
gd = GradientBoostingClassifier()

In [47]:
gd_precision_scores = cross_val_score(gd, X_train_features, y_train, cv=kfold, scoring='precision')

In [48]:
gd_precision_scores

array([0.8032345 , 0.80779944, 0.77142857])

In [49]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'n_estimators': [5, 10, 20, 30, 50],
   'max_depth': [5, 10, 15, 20],
   'min_samples_split': [20, 30, 50, 70, 100],
   'learning_rate': [0.001, 0.01, 0.1]
    }
search = GridSearchCV(gd, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1).fit(X_train_features, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   42.7s finished


In [50]:
search.best_estimator_

GradientBoostingClassifier(max_depth=5, min_samples_split=50, n_estimators=50)

In [51]:
search.best_params_

{'learning_rate': 0.1,
 'max_depth': 5,
 'min_samples_split': 50,
 'n_estimators': 50}

In [52]:
gb = GradientBoostingClassifier(max_depth=5, min_samples_split=50, n_estimators=50, learning_rate=0.1)

In [96]:
score = cross_val_score(gb, X_train_features, y_train, cv=kfold, scoring='accuracy')

In [97]:
np.mean(score)

0.7999974210685803

In [98]:
np.std(score)

0.006589513349395979

In [99]:
score = cross_val_score(gb, X_train_features, y_train, cv=kfold, scoring='precision')

In [100]:
np.mean(score)

0.7920014329105238

In [101]:
np.std(score)

0.018523665520721152

In [102]:
score = cross_val_score(gb, X_train_features, y_train, cv=kfold, scoring='recall')

In [103]:
np.mean(score)

0.6849925358369755

In [104]:
np.std(score)

0.00824511082340141

# Apply pipeline on test dataset

In [24]:
# X_test['Balance'] = X_test.Balance.apply(lambda x : 0 if x<0 else x)
# X_test['IsPreviouslyContacted'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else 1)
# X_test['DaysPassed'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else x)
# X_test['LastContactQuadrimester'] = X_test.LastContactMonth.apply(lambda x : quarter_month(x))
# X_test['Job'] = X_test['Job'].fillna('missing')
# X_test['JobCategory'] = X_test.Job.apply(lambda x : job_categories(x))
# X_test['AgeCategory'] = X_test.Age.apply(lambda x : age_group(x))
# X_train['CallDurationCategory'] = X_train.call_duration.apply(lambda x : duration_category(x/60))
# X_test_1 = log_transformer.transform(X_test)
# X_test_2 = standard_scaler.transform(X_test_1)
# X_test_3 = fill_imX_train['JobCategory'] = X_train.Job.apply(lambda x : job_categories(x))puter.transform(X_test_2)
# X_test_4 = knn_imputer.transform(X_test_3)
# X_test_5 = one_hot_encoder.transform(X_test_4)

############### converted negative balance to zero
X_test['Balance'] = X_test.Balance.apply(lambda x : 0 if x<0 else x)

############### create a new variable for indicting whether a client is previously contacted - Boolean
X_test['IsPreviouslyContacted'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else 1)

############### convert -ve values for days passed
X_test['DaysPassed'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else x)

############### convert month to quadrimester
X_test['LastContactQuadrimester'] = X_test.LastContactMonth.apply(lambda x : quarter_month(x))

X_test['Job'] = X_test['Job'].fillna('missing')
X_test['JobCategory'] = X_test.Job.apply(lambda x : job_categories(x))
X_test['AgeCategory'] = X_test.Age.apply(lambda x : age_group(x))
X_test['CallDurationCategory'] = X_test.call_duration.apply(lambda x : duration_category(x/60))
X_test['NoOfContactsCategory'] = X_test.NoOfContacts.apply(lambda x : no_of_contacts_category(x))


X_test_features = pipe.transform(X_test)
X_test_features = X_test_features.drop(['NoOfContacts', 'LastContactMonth', 'Job', 'Age', 'call_duration', 'Balance'], axis=1)

In [495]:
X_test_features[0:5]

Unnamed: 0,Marital_1,Marital_2,Marital_3,Education_1,Education_2,Education_3,HHInsurance,CarLoan,Communication_1,Communication_2,Communication_3,DaysPassed,PrevAttempts,Outcome_1,Outcome_2,Outcome_3,Outcome_4,IsPreviouslyContacted,LastContactQuadrimester_1,LastContactQuadrimester_2,LastContactQuadrimester_3,JobCategory_1,JobCategory_2,JobCategory_3,AgeCategory_1,AgeCategory_2,AgeCategory_3,AgeCategory_4,AgeCategory_5,CallDurationCategory_1,CallDurationCategory_2,CallDurationCategory_3,CallDurationCategory_4,NoOfContactsCategory_1,NoOfContactsCategory_2,NoOfContactsCategory_3
200,1,0,0,0,0,1,0,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0
1078,1,0,0,0,1,0,0,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0
610,1,0,0,1,0,0,0,1,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0
2159,0,1,0,0,0,1,1,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0
1169,1,0,0,1,0,0,0,0,1,0,0,-0.464205,-0.373862,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0


# Fit the logistic regression model on train dataset and predict on test dataset

In [139]:
lg = LogisticRegression(C=1, max_iter=500)
lg.fit(X_train_features, y_train)

LogisticRegression(C=1, max_iter=500)

In [497]:
lg_pred_train = lg.predict(X_train_features)

In [498]:
accuracy_score(y_train, lg_pred_train)

0.7921875

In [499]:
precision_score(y_train, lg_pred_train)

0.7994269340974212

In [500]:
recall_score(y_train, lg_pred_train)

0.6478328173374613

In [501]:
lg_pred = lg.predict(X_test_features)

In [502]:
accuracy_score(y_test, lg_pred)

0.79875

In [503]:
precision_score(y_test, lg_pred)

0.7870722433460076

In [504]:
recall_score(y_test, lg_pred)

0.6634615384615384

In [140]:
from sklearn.metrics import *

prob = lg.predict_proba(X_test_features)
pos_prob = [x[1] for x in prob]
fpr, tpr, thresholds = roc_curve(y_test, pos_prob, pos_label=1.0)
auc(fpr, tpr)

0.8553554539722572

# Fit the random forest model on train dataset and predict on test dataset

In [137]:
rf = RandomForestClassifier(max_depth=10, min_samples_split=20, n_estimators=30)
rf.fit(X_train_features, y_train)

RandomForestClassifier(max_depth=10, min_samples_split=20, n_estimators=30)

In [506]:
rf.feature_importances_

array([0.01171582, 0.0114638 , 0.0048381 , 0.00665555, 0.00778227,
       0.00647527, 0.06730325, 0.01225666, 0.02690486, 0.04368494,
       0.0048447 , 0.05685492, 0.0339294 , 0.00764378, 0.00501795,
       0.10261575, 0.01155212, 0.0200451 , 0.03020254, 0.0194455 ,
       0.00816723, 0.00870224, 0.01276433, 0.02436204, 0.00585091,
       0.00640065, 0.00492067, 0.03375776, 0.00964141, 0.21966326,
       0.11663543, 0.03012641, 0.0046567 , 0.0110368 , 0.00722997,
       0.00485191])

In [507]:
feature_imp_arr = pd.DataFrame({'columns': X_train_features.columns, 'feature_importances': rf.feature_importances_})

In [508]:
rf_pred_train = rf.predict(X_train_features)

In [509]:
accuracy_score(y_train, rf_pred_train)

0.825

In [510]:
precision_score(y_train, rf_pred_train)

0.833941605839416

In [511]:
recall_score(y_train, rf_pred_train)

0.7074303405572755

In [512]:
rf_pred = rf.predict(X_test_features)

In [513]:
accuracy_score(y_test, rf_pred)

0.8075

In [514]:
precision_score(y_test, rf_pred)

0.7904411764705882

In [515]:
recall_score(y_test, rf_pred)

0.6891025641025641

In [138]:
from sklearn.metrics import *

prob = rf.predict_proba(X_test_features)
pos_prob = [x[1] for x in prob]
fpr, tpr, thresholds = roc_curve(y_test, pos_prob, pos_label=1.0)
auc(fpr, tpr)

0.8728720050441361

# Fit the gradient boosting model on train dataset and predict on test dataset

In [115]:
gb = GradientBoostingClassifier(max_depth=5, min_samples_split=50, n_estimators=50, learning_rate=0.1)
gb.fit(X_train_features, y_train)

GradientBoostingClassifier(max_depth=5, min_samples_split=50, n_estimators=50)

In [116]:
gb_train = gb.predict(X_train_features)

In [117]:
accuracy_score(y_train, gb_train)

0.826875

In [118]:
precision_score(y_train, gb_train)

0.8253968253968254

In [119]:
recall_score(y_train, gb_train)

0.7244582043343654

In [120]:
gb_pred = gb.predict(X_test_features)

In [121]:
accuracy_score(y_test, gb_pred)

0.815

In [122]:
precision_score(y_test, gb_pred)

0.7907801418439716

In [123]:
recall_score(y_test, gb_pred)

0.7147435897435898

In [124]:
prob = gb.predict_proba(X_test_features)

In [134]:
pos_prob = [x[1] for x in prob]

In [135]:
from sklearn.metrics import *
fpr, tpr, thresholds = roc_curve(y_test, pos_prob, pos_label=1.0)

In [136]:
auc(fpr, tpr)

0.8662712799495587

In [105]:
from sklearn.dummy import DummyClassifier

In [107]:
dummy_clf = DummyClassifier(strategy="uniform")
dummy_clf.fit(X_train_features, y_train)

dummy_pred = dummy_clf.predict(X_test_features)

In [108]:
accuracy_score(y_test, dummy_pred)

0.51

In [109]:
precision_score(y_test, dummy_pred)

0.39690721649484534

In [110]:
recall_score(y_test, dummy_pred)

0.4935897435897436