In [91]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import category_encoders as ce   # version 1.2.8
import numpy as np
from sklearn.pipeline import Pipeline

In [92]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [93]:
import lightgbm as lgb
import tensorflow

# Read data

In [141]:
data = pd.read_csv('../data/carInsurance_train.csv')

In [142]:
from sklearn.model_selection import train_test_split

# Holding out 20% of the sample for test dataset

* Performing stratified sampling
* X_train, y_train - training dataset
* X_test, y_test - test dataset (hold-out)

In [143]:
X = data.drop('CarInsurance', axis=1)
target = data['CarInsurance']
X_train, X_test, y_train, y_test = train_test_split(X, target,  test_size=0.2, random_state=1)

# CallStart and CallEnd is converted to call duration

In [144]:
call_duration_arr = []
for index, row in X_train.iterrows():
    
    call_start = row['CallStart'] 
    call_end = row['CallEnd']
    call_start_hr = call_start.split(':')[0]
    call_start_min = call_start.split(':')[1]
    call_start_sec = call_start.split(':')[2]
    
    call_end_hr = call_end.split(':')[0]
    call_end_min = call_end.split(':')[1]
    call_end_sec = call_end.split(':')[2]
    
    call_start_dt = dt.datetime(2013,12,30,int(call_start_hr),int(call_start_min),int(call_start_sec))
    call_end_dt = dt.datetime(2013,12,30,int(call_end_hr),int(call_end_min),int(call_end_sec))

    call_duration = (call_end_dt-call_start_dt).total_seconds()
    call_duration_arr.append(call_duration)
    
X_train['call_duration'] = call_duration_arr

call_duration_arr = []
for index, row in X_test.iterrows():
    
    call_start = row['CallStart'] 
    call_end = row['CallEnd']
    call_start_hr = call_start.split(':')[0]
    call_start_min = call_start.split(':')[1]
    call_start_sec = call_start.split(':')[2]
    
    call_end_hr = call_end.split(':')[0]
    call_end_min = call_end.split(':')[1]
    call_end_sec = call_end.split(':')[2]
    
    call_start_dt = dt.datetime(2013,12,30,int(call_start_hr),int(call_start_min),int(call_start_sec))
    call_end_dt = dt.datetime(2013,12,30,int(call_end_hr),int(call_end_min),int(call_end_sec))

    call_duration = (call_end_dt-call_start_dt).total_seconds()
    call_duration_arr.append(call_duration)
    
X_test['call_duration'] = call_duration_arr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Removing unneccessary columns - ID, CallStart, CallEnd, LastContactDay, Default

In [145]:

if len(set(['Id', 'CallStart', 'CallEnd', 'LastContactDay']) - set(list(X_train.columns)))<1:
    X_train = X_train.drop(['Id', 'CallStart', 'CallEnd', 'LastContactDay', 'Default'], axis=1)
    
if len(set(['Id', 'CallStart', 'CallEnd', 'LastContactDay']) - set(list(X_test.columns)))<1:
    X_test = X_test.drop(['Id', 'CallStart', 'CallEnd', 'LastContactDay', 'Default'], axis=1)

In [146]:
X_train.isnull().sum()

Age                    0
Job                   16
Marital                0
Education            130
Balance                0
HHInsurance            0
CarLoan                0
Communication        724
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             2434
call_duration          0
dtype: int64

In [147]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


# Custom Transformer functions

* Imputation
* Log Transformation
* Normalization

In [148]:
class FillImputer(BaseEstimator, TransformerMixin):
    def __init__(self, fill_dict):
        super().__init__()
        self.fill_dict = fill_dict

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):
#         X[:] = (X.to_numpy() - self.means_) / self.std_

        X_copy = X.copy()
        X_copy = X_copy.fillna(self.fill_dict)

        return X_copy
    
class kNNImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.kneigh = KNeighborsClassifier(n_neighbors=3)
        self.ce_one_hot = ce.OneHotEncoder(cols = ['Job','Marital'])
        self.ce_one_hot_model = None

    def fit(self, X, y=None):
#         X = X.to_numpy()
#         self.means_ = X.mean(axis=0, keepdims=True)
#         self.std_ = X.std(axis=0, keepdims=True)
        imputation_features_non_missing = X[~X.Education.isnull()][['Job', 'Marital', 'Age']]
        education_non_missing = X[~X.Education.isnull()]['Education']
        
        self.ce_one_hot_model = self.ce_one_hot.fit(imputation_features_non_missing)
        imputation_features_non_missing_encoded = self.ce_one_hot_model.transform(imputation_features_non_missing)
        
        self.kneigh.fit(X=imputation_features_non_missing_encoded, y=education_non_missing)
    
        return self
    

    def transform(self, X, y=None):
#         X[:] = (X.to_numpy() - self.means_) / self.std_
    
#         non_missing = X[~X.Education.isnull()]
#         missing = X[X.Education.isnull()]

        X_copy = X.copy()
    
        missing_ind = X_copy[X_copy.Education.isnull()].index
        education = X_copy.Education.copy()
       
        imputation_features_missing = X_copy[X_copy.Education.isnull()][['Job', 'Marital', 'Age']]
        education_missing = X_copy[X_copy.Education.isnull()]['Education']
                
        imputation_features_missing_encoded = self.ce_one_hot_model.transform(imputation_features_missing)
        missing_values_pred = self.kneigh.predict(imputation_features_missing_encoded)

        counter = 0
        for index in missing_ind:
            education[index] = missing_values_pred[counter]
            counter+=1
        
        X_copy['Education'] = education

        return X_copy

In [149]:
from sklearn.preprocessing import MinMaxScaler as min_max
from sklearn.preprocessing import StandardScaler

class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        for col in self.columns:
            X_copy[col] = np.log(X_copy[col]+1)
        

        return X_copy

class MinMaxScalarMultiple(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns
        self.normalizer = None

    def fit(self, X, y=None):
        self.normalizer = min_max()
        self.normalizer.fit(X[self.columns])

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        normalized_vals = self.normalizer.transform(X[self.columns])
        
        for i in range(0, len(self.columns)):
            col = [row[i] for row in normalized_vals]
            X_copy[self.columns[i]] = col
        

        return X_copy
    
class StandardScalarMultiple(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        super().__init__()
        self.columns = columns
        self.normalizer = None

    def fit(self, X, y=None):
        self.normalizer = StandardScaler()
        self.normalizer.fit(X[self.columns])

        return self

    def transform(self, X, y=None):
        
        X_copy = X.copy()

        normalized_vals = self.normalizer.transform(X[self.columns])
        
        for i in range(0, len(self.columns)):
            col = [row[i] for row in normalized_vals]
            X_copy[self.columns[i]] = col
        

        return X_copy

In [333]:
def quarter_month(x):
    if x in ['jan', 'feb', 'mar', 'apr']:
        return '1'
    elif x in ['may', 'jun', 'jul', 'aug']:
        return '2'
    elif x in ['sep', 'oct', 'nov', 'dec']:
        return '3'
    
def job_categories(x):
    if x in ['blue-collar', 'entrepreneur', 'housemaid']:
        return 'job1'
    elif x in ['services', 'self-employed', 'admin.', 'technician', 'management', 'missing']:
        return 'job2'
    elif x in ['unemployed', 'retired', 'student']:
        return 'job3'
    else:
        return 'no_group'
    
def age_group(x):
    if x>=18 and x<=30:
        return '18-30'
    elif x>30 and x<=60:
        return '30-60'
    elif x>60:
        return '>60'
    
def duration_category(x):
    
    if (x>0) and (x<=10):
        return '0-10'
    elif (x>10) and (x<=20):
        return '10-20'
    elif (x>20) and (x<=30):
        return '20-30'
    elif (x>30):
        return '>30'

# def duration_category(x):
    
#     if (x>0) and (x<=10):
#         return '0-10'
#     else: return '>10'
    

In [334]:
############### converted negative balance to zero
X_train['Balance'] = X_train.Balance.apply(lambda x : 0 if x<0 else x)

############### create a new variable for indicting whether a client is previously contacted - Boolean
X_train['IsPreviouslyContacted'] = X_train.DaysPassed.apply(lambda x : 0 if x==-1 else 1)

############### convert -ve values for days passed
X_train['DaysPassed'] = X_train.DaysPassed.apply(lambda x : 0 if x==-1 else x)

############### convert month to quadrimester
X_train['LastContactQuadrimester'] = X_train.LastContactMonth.apply(lambda x : quarter_month(x))

X_train['Job'] = X_train['Job'].fillna('missing')
X_train['JobCategory'] = X_train.Job.apply(lambda x : job_categories(x))
X_train['AgeCategory'] = X_train.Age.apply(lambda x : age_group(x))
X_train['CallDurationCategory'] = X_train.call_duration.apply(lambda x : duration_category(x/60))

# Create a sklearn pipeline for feature engineering - imputation, encoding, scaling

In [335]:
log_transformer = LogTransformer(columns=['call_duration'])
standard_scaler = StandardScalarMultiple(columns=['Age', 'Balance', 'NoOfContacts', 'PrevAttempts', 'DaysPassed'])
fill_imputer = FillImputer(fill_dict={'Communication': 'missing', 'Job' : 'missing', 'Outcome' : 'not_contacted'})
knn_imputer = kNNImputer()
one_hot_encoder = ce.OneHotEncoder(cols=['CallDurationCategory','JobCategory', 'AgeCategory', 'Marital', 'Education', 'Outcome', 'Communication', 'LastContactQuadrimester'])


pipe = Pipeline([('log_transformer', log_transformer), 
                 ('standard_scaler', standard_scaler),
                 ('fill_imputer', fill_imputer),
                 ('knn_imputer', knn_imputer),
                 ('one_hot_encoder', one_hot_encoder)])

pipe.fit(X_train)
X_train_features = pipe.transform(X_train)
X_train_features = X_train_features.drop(['LastContactMonth', 'Job', 'Age', 'call_duration', 'Balance'], axis=1)

In [336]:
X_train_features[0:10]

Unnamed: 0,Marital_1,Marital_2,Marital_3,Education_1,Education_2,Education_3,HHInsurance,CarLoan,Communication_1,Communication_2,Communication_3,NoOfContacts,DaysPassed,PrevAttempts,Outcome_1,Outcome_2,Outcome_3,Outcome_4,IsPreviouslyContacted,LastContactQuadrimester_1,LastContactQuadrimester_2,LastContactQuadrimester_3,JobCategory_1,JobCategory_2,JobCategory_3,AgeCategory_1,AgeCategory_2,AgeCategory_3,CallDurationCategory_1,CallDurationCategory_2,CallDurationCategory_3,CallDurationCategory_4
2996,1,0,0,1,0,0,0,0,1,0,0,-0.196231,-0.464205,-0.373862,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0
3295,1,0,0,1,0,0,1,0,1,0,0,-0.521084,-0.464205,-0.373862,1,0,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,0
2284,0,1,0,0,1,0,1,0,0,1,0,-0.521084,-0.464205,-0.373862,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0
2793,1,0,0,0,1,0,0,0,1,0,0,-0.196231,1.424987,2.276469,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0
787,1,0,0,0,0,1,0,0,1,0,0,1.103179,-0.464205,-0.373862,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0
2046,1,0,0,0,0,1,0,0,1,0,0,-0.521084,-0.464205,-0.373862,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0
739,0,0,1,1,0,0,1,0,0,1,0,1.752885,-0.464205,-0.373862,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0
2149,0,0,1,1,0,0,1,0,0,1,0,-0.196231,-0.464205,-0.373862,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0
3369,0,1,0,0,1,0,0,0,1,0,0,6.300822,-0.464205,-0.373862,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0
3492,0,0,1,1,0,0,1,1,1,0,0,-0.196231,-0.464205,-0.373862,1,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0


# Create a logistic regression model and performing cross validation using training dataset

In [337]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [338]:
lg = LogisticRegression(max_iter=500)

In [339]:
lg_precision_scores = cross_val_score(lg, X_train_features, y_train, cv=3, scoring='precision')

In [340]:
lg_precision_scores

array([0.79881657, 0.78512397, 0.80122324])

# Create a random forest model and performing cross validation using training dataset

In [361]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [366]:
num_trees = [5, 10, 20, 30, 50, 100]
max_depth = [5, 10, 15, 20]

for trees in num_trees:
    for depth in max_depth: 
        rf = RandomForestClassifier(n_estimators=trees,max_depth=depth, min_samples_split=10, min_samples_leaf=10)
        rf_precison_scores = cross_val_score(rf, X_train_features, y_train, cv=3, scoring='precision')
        print(rf_precison_scores, '  ', trees, '  ', depth, '  ', np.mean(rf_precison_scores))

[0.83333333 0.81595092 0.81993569]    5    5    0.8230733149656867
[0.78994083 0.81791045 0.75505051]    5    10    0.7876339270713554
[0.7712766  0.77078086 0.76701571]    5    15    0.7696910529913792
[0.78       0.76657825 0.78510029]    5    20    0.7772261786232738
[0.80130293 0.8119403  0.82392027]    10    5    0.8123878319614283
[0.77310924 0.77748691 0.78706199]    10    10    0.7792193831004693
[0.78674352 0.78215223 0.80167598]    10    15    0.790190574824968
[0.79104478 0.78835979 0.77718833]    10    20    0.785530964463886
[0.79402985 0.82568807 0.81360947]    20    5    0.8111091305321284
[0.79821958 0.8        0.78753541]    20    10    0.7952516651115351
[0.78901734 0.78740157 0.80392157]    20    15    0.793446828157021
[0.79710145 0.79069767 0.76863753]    20    20    0.7854788852758811
[0.81612903 0.8119403  0.81818182]    30    5    0.8154170496491151
[0.79261364 0.78835979 0.8079096 ]    30    10    0.7962943430810663
[0.78813559 0.78851175 0.77806789]    30    1

In [367]:
rf = RandomForestClassifier()

In [368]:
cross_val_score(rf, X_train_features, y_train, cv=3, scoring='precision')

array([0.75656325, 0.72813239, 0.72019465])

# Create a gradient boosting model and performing cross validation using training dataset

In [306]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

In [307]:
gd = GradientBoostingClassifier()

In [308]:
gd_precision_scores = cross_val_score(gd, X_train_features, y_train, cv=3, scoring='precision')

In [309]:
gd_precision_scores

array([0.79108635, 0.78680203, 0.81337047])

In [377]:
num_trees = [5, 10, 20, 30, 50, 100]
max_depth = [5, 10, 15, 20]

for trees in num_trees:
    for depth in max_depth: 
        gd = GradientBoostingClassifier(n_estimators=trees,max_depth=depth, min_samples_split=10, min_samples_leaf=10)
        gd_precison_scores = cross_val_score(gd, X_train_features, y_train, cv=3, scoring='precision')
        print(gd_precison_scores, '  ', trees, '  ', depth, '  ', np.mean(gd_precison_scores))

[0.83941606 0.82771536 0.86516854]    5    5    0.8440999845084156
[0.83391003 0.80996885 0.82550336]    5    10    0.823127412552933
[0.81424149 0.79289941 0.79216867]    5    15    0.7997698563503102
[0.81677019 0.79289941 0.79216867]    5    20    0.8006127564394075
[0.82178218 0.80712166 0.82242991]    10    5    0.8171112488269819
[0.80232558 0.7972973  0.75925926]    10    10    0.7862940459839685
[0.78918919 0.76214834 0.78225806]    10    15    0.7778651971004087
[0.78706199 0.76349614 0.78284182]    10    20    0.7777999872081112
[0.80538922 0.80397727 0.8173913 ]    20    5    0.8089192662106618
[0.77894737 0.78125    0.75510204]    20    10    0.771766469745793
[0.76470588 0.7549505  0.7638191 ]    20    15    0.7611584909599444
[0.76708861 0.75123153 0.77099237]    20    20    0.7631041670335822
[0.79096045 0.78974359 0.78378378]    30    5    0.7881626085015916
[0.7755102  0.78061224 0.74494949]    30    10    0.7670239813096956
[0.77020202 0.75490196 0.76010101]    30    

In [378]:
num_trees = [5, 10, 20, 30, 50, 100]
max_depth = [5, 10, 15, 20]

for trees in num_trees:
    for depth in max_depth: 
        gd = GradientBoostingClassifier(n_estimators=trees,max_depth=depth)
        gd_precison_scores = cross_val_score(gd, X_train_features, y_train, cv=3, scoring='recall')
        print(gd_precison_scores, '  ', trees, '  ', depth, '  ', np.mean(gd_precison_scores))

[0.51508121 0.52204176 0.54418605]    5    5    0.5271030054497383
[0.52436195 0.56148492 0.54418605]    5    10    0.5433443047536827
[0.5638051  0.57540603 0.59069767]    5    15    0.5766362704365187
[0.60324826 0.59396752 0.59069767]    5    20    0.595971150560262
[0.55916473 0.64269142 0.59069767]    10    5    0.5975179409701613
[0.58468677 0.6287703  0.59534884]    10    10    0.6029353045918092
[0.59396752 0.60324826 0.59069767]    10    15    0.595971150560262
[0.63341067 0.62180974 0.60930233]    10    20    0.6215075810716021
[0.62180974 0.67053364 0.6372093 ]    20    5    0.643184229932193
[0.63109049 0.63109049 0.62790698]    20    10    0.6300293170740481
[0.62645012 0.63573086 0.61860465]    20    15    0.6269285418802496
[0.63109049 0.6287703  0.62093023]    20    20    0.6269303404737495
[0.64037123 0.70069606 0.6627907 ]    30    5    0.6679526610190831
[0.66357309 0.64269142 0.63488372]    30    10    0.6470494073634417
[0.65429234 0.62645012 0.63255814]    30    1

# Apply pipeline on test dataset

In [351]:
# X_test['Balance'] = X_test.Balance.apply(lambda x : 0 if x<0 else x)
# X_test['IsPreviouslyContacted'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else 1)
# X_test['DaysPassed'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else x)
# X_test['LastContactQuadrimester'] = X_test.LastContactMonth.apply(lambda x : quarter_month(x))
# X_test['Job'] = X_test['Job'].fillna('missing')
# X_test['JobCategory'] = X_test.Job.apply(lambda x : job_categories(x))
# X_test['AgeCategory'] = X_test.Age.apply(lambda x : age_group(x))
# X_train['CallDurationCategory'] = X_train.call_duration.apply(lambda x : duration_category(x/60))
# X_test_1 = log_transformer.transform(X_test)
# X_test_2 = standard_scaler.transform(X_test_1)
# X_test_3 = fill_imX_train['JobCategory'] = X_train.Job.apply(lambda x : job_categories(x))puter.transform(X_test_2)
# X_test_4 = knn_imputer.transform(X_test_3)
# X_test_5 = one_hot_encoder.transform(X_test_4)

############### converted negative balance to zero
X_test['Balance'] = X_test.Balance.apply(lambda x : 0 if x<0 else x)

############### create a new variable for indicting whether a client is previously contacted - Boolean
X_test['IsPreviouslyContacted'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else 1)

############### convert -ve values for days passed
X_test['DaysPassed'] = X_test.DaysPassed.apply(lambda x : 0 if x==-1 else x)

############### convert month to quadrimester
X_test['LastContactQuadrimester'] = X_test.LastContactMonth.apply(lambda x : quarter_month(x))

X_test['Job'] = X_test['Job'].fillna('missing')
X_test['JobCategory'] = X_test.Job.apply(lambda x : job_categories(x))
X_test['AgeCategory'] = X_test.Age.apply(lambda x : age_group(x))
X_test['CallDurationCategory'] = X_test.call_duration.apply(lambda x : duration_category(x/60))

X_test_features = pipe.transform(X_test)
X_test_features = X_test_features.drop(['LastContactMonth', 'Job', 'Age', 'call_duration', 'Balance'], axis=1)

In [352]:
X_test_features[0:5]

Unnamed: 0,Marital_1,Marital_2,Marital_3,Education_1,Education_2,Education_3,HHInsurance,CarLoan,Communication_1,Communication_2,Communication_3,NoOfContacts,DaysPassed,PrevAttempts,Outcome_1,Outcome_2,Outcome_3,Outcome_4,IsPreviouslyContacted,LastContactQuadrimester_1,LastContactQuadrimester_2,LastContactQuadrimester_3,JobCategory_1,JobCategory_2,JobCategory_3,AgeCategory_1,AgeCategory_2,AgeCategory_3,CallDurationCategory_1,CallDurationCategory_2,CallDurationCategory_3,CallDurationCategory_4
200,1,0,0,0,0,1,0,0,1,0,0,0.453474,-0.464205,-0.373862,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,1,0
1078,1,0,0,0,1,0,0,0,1,0,0,0.128621,-0.464205,-0.373862,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0
610,1,0,0,1,0,0,0,1,1,0,0,0.453474,-0.464205,-0.373862,1,0,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,0
2159,0,1,0,0,0,1,1,0,1,0,0,-0.521084,-0.464205,-0.373862,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0
1169,1,0,0,1,0,0,0,0,1,0,0,0.453474,-0.464205,-0.373862,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0


# Fit the logistic regression model on train dataset and predict on test dataset

In [168]:
lg = LogisticRegression(max_iter=500)
lg.fit(X_train_features, y_train)

LogisticRegression(max_iter=500)

In [169]:
lg_pred = lg.predict(X_test_features)

In [170]:
accuracy_score(y_test, lg_pred)

0.8075

In [171]:
precision_score(y_test, lg_pred)

0.7564935064935064

In [172]:
recall_score(y_test, lg_pred)

0.7467948717948718

# Fit the random forest model on train dataset and predict on test dataset

In [369]:
rf = RandomForestClassifier(n_estimators=10, max_depth=5)
rf.fit(X_train_features, y_train)

RandomForestClassifier(max_depth=5, n_estimators=10)

In [370]:
rf.feature_importances_

array([1.05088281e-02, 3.57374000e-03, 1.87840779e-03, 9.74317846e-05,
       5.43554820e-03, 1.35986686e-03, 4.12018503e-02, 4.05792028e-03,
       6.14616495e-02, 2.65285130e-02, 3.77843318e-03, 7.76023463e-03,
       5.37573657e-02, 2.58504587e-02, 4.90383948e-02, 1.95552793e-03,
       1.10998177e-01, 5.72422921e-03, 0.00000000e+00, 4.01726817e-02,
       1.44331060e-02, 4.56249780e-04, 2.51163511e-03, 2.27914406e-02,
       9.46558337e-03, 2.92992904e-02, 3.95546946e-02, 1.09370759e-02,
       2.43862901e-01, 1.30412078e-01, 3.74136009e-02, 3.72308508e-03])

In [371]:
feature_imp_arr = pd.DataFrame({'columns': X_train_features.columns, 'feature_importances': rf.feature_importances_})

In [372]:
# feature_imp_arr.sort_values(by='feature_importances', ascending=False)

In [373]:
rf_pred = rf.predict(X_test_features)

In [374]:
accuracy_score(y_test, rf_pred)

0.80375

In [375]:
precision_score(y_test, rf_pred)

0.8112449799196787

In [376]:
recall_score(y_test, rf_pred)

0.6474358974358975

# Fit the gradient boosting model on train dataset and predict on test dataset

In [384]:
gb = GradientBoostingClassifier(n_estimators=100, max_depth=5)
gb.fit(X_train_features, y_train)

GradientBoostingClassifier(max_depth=5)

In [385]:
gb_pred = gb.predict(X_test_features)

In [386]:
accuracy_score(y_test, gb_pred)

0.805

In [387]:
precision_score(y_test, gb_pred)

0.7708333333333334

In [388]:
recall_score(y_test, gb_pred)

0.7115384615384616