__Author__ - Vrisha Parekh

__Email__ - parekh.vrisha@gmail.com


__LinkedIn__ - https://bit.ly/VrishaParekh_LinkedIn

In [1]:
#Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from numpy import mean
from numpy import std
from scipy.stats import norm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.combine import SMOTETomek
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report,confusion_matrix,recall_score,precision_score,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from scipy.stats import boxcox

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [2]:
#Loading data
def load_csv(file):
    #load csv file
    data=pd.read_csv(file,sep=';')
    df=pd.DataFrame(data)
    return df


#Removing the duplicates
def clean_df(df):
    clean_data = df.drop_duplicates()
    return clean_data


#Removing extreme values
def remove_extreme_age(df):
    #Removing age values less than 18 yrs as they would not be eligible for term deposit.
    df=df[df['age']>18]
    
    #Dropping the extreme campaign values
    df=df[df['campaign']<56]
    return df


#Transforming feature
def transform_features(df):
    #Age is right skewed so log transforming age
    df['age']=np.log(df['age']+1)
    return df


#Dropping column
def drop_columns(df,col_name):
    #dropping duration as it highly influences our response variable
    df.drop(col_name,inplace=True,axis=1)
    return df


#Replacing yes and no in response variable with 0 and 1
def replace_response_variable(df):
    df['y'].replace(['yes','no'],[1,0],inplace=True)
    return df


#Converting categorical features to dummy variables
def get_dummies_func(df,cat_list):
    #Creating dummies for categorical data 
    return pd.concat([pd.get_dummies(data=df,columns=cat_list)],axis=1)


#Spliting the data into Train and test sets
def split(col_name):
    
    X= df[[i for i in list(df.columns) if i!=col_name]].values
    y=df[col_name]
    
    X_train,X_test,y_train,y_test=sklearn_train_test_split(X,y,test_size=0.25,random_state=42)
    
    #Scaling the splitted data
    scaler=preprocessing.StandardScaler()
    scaler.fit(X_train)
    X_train=scaler.transform(X_train)
    X_test=scaler.transform(X_test)
    return X_train,X_test,y_train,y_test

#Observing the baseline performance of three different models, Logistic Regression, Decision tree and Random Forest

def benchmark_models(model):

    initial_models=model.fit(X_train,y_train)
    y_pred=initial_models.predict(X_test)
    reportss=classification_report(y_test,y_pred)
    return reportss


#Using a mix of two sampling techniques(Smote-Oversampling,Tomek-Undersampling)
def sampling():
    
    Xdash= df[[i for i in list(df.columns) if i!='y']]
    columns=Xdash.columns

    smt=SMOTETomek(sampling_strategy= 'auto')
    X_smt,y_smt=smt.fit_sample(X_train,y_train)
    
    #Creating dataframes
    X_smt_df=pd.DataFrame(data=X_smt,columns=columns)
    y_smt_df=pd.DataFrame(data=y_smt,columns=['y'])
    
    #Checking the number of samples for both the classes
    print('Number of NO subscription in oversampled data',len(y_smt_df[y_smt_df['y']==0]))
    print('Number of YES subscription in oversampled data',len(y_smt_df[y_smt_df['y']==1]))
    return X_smt,y_smt,X_smt_df


#Defining summary metric
def summary_metrics(y_pred):
    
    conf_matrix= confusion_matrix(y_test, y_pred)
    
    print('confusion matrix',conf_matrix)
    print('Accuracy',accuracy_score(y_test,y_pred))
    print('Precision',precision_score(y_test,y_pred))
    print('Recall',recall_score(y_test,y_pred))
    


#Applying rfe and cross valuation to our sampled data

def elimination_crossval(model):
    
    #Initiating the RFE instance
    rfe=RFE(estimator=RandomForestClassifier(),n_features_to_select=10)
    
    #Fitting the rfe
    X_rfe=rfe.fit_transform(X_smt,y_smt)
    
    #Transforming X_test
    X_rfe_test=rfe.transform(X_test)
    
    model=model
    
    #Creating pipeling to avoid data leakage
    pipeline=Pipeline(steps=[('s',rfe),('m',model)])
    
    cv=RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=1)
    
    scores =cross_val_score(pipeline,X_rfe, y_smt, scoring='accuracy', cv=cv, n_jobs=-1)
    
    print('Accuracy for model with cross val: %.3f (%.3f)' % (mean(scores)*100, std(scores)*100))
    
    #Fitting the pipeline
    fitted_model=pipeline.fit(X_rfe,y_smt)
    
    y_pred=fitted_model.predict(X_rfe_test)
    
    #Printing the classification report
    print(classification_report(y_test,y_pred))
    
    summary_metrics(y_pred)
    
    
#Getting the important features
    
def important_features(estimator,n_features_to_select):
    
    rfe=RFE(estimator=estimator, n_features_to_select=n_features_to_select)
    X_rfe=rfe.fit_transform(X_smt,y_smt)


    columns = X_smt_df.columns
    val = pd.Series(rfe.support_,index = columns)
    features_chosen_rfe = val[val==True].index 
    print(features_chosen_rfe)
    

In [3]:
#Loading data
df=load_csv('bank-additional-full.csv')

#Removing the duplicates
df=clean_df(df)

#Removing extreme values
df=remove_extreme_age(df)

#Transforming feature
df=transform_features(df)

#Dropping column
df=drop_columns(df,'duration')

#Replacing yes and no in response variable with 0 and 1
df=replace_response_variable(df)

#Converting categorical features to dummy variables
cat_list= ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact','month', 'day_of_week', 'poutcome']

df=get_dummies_func(df,cat_list)

#Spliting the data into Train and test sets
X_train,X_test,y_train,y_test=split('y')


#Observing the baseline performance of three different models, Logistic Regression, Decision tree and Random Forest
reports_logistic=benchmark_models(LogisticRegression(max_iter=7600))
reports_Decisiontree=benchmark_models(DecisionTreeClassifier())
reports_RandomForest=benchmark_models(RandomForestClassifier())


#Using a mix of two sampling techniques(Smote-Oversampling,Tomek-Undersampling)
X_smt,y_smt,X_smt_df=sampling()


#Applying rfe and cross valuation to our sampled data
elimination_crossval(LogisticRegression(max_iter=7600))
elimination_crossval(RandomForestClassifier())

#Getting the important features
important_features(RandomForestClassifier(),4)

Number of NO subscription in oversampled data 27117
Number of YES subscription in oversampled data 27117
Accuracy for model with cross val: 72.445 (0.578)
              precision    recall  f1-score   support

           0       0.95      0.74      0.83      9154
           1       0.25      0.70      0.37      1132

    accuracy                           0.74     10286
   macro avg       0.60      0.72      0.60     10286
weighted avg       0.88      0.74      0.78     10286

confusion matrix [[6781 2373]
 [ 338  794]]
Accuracy 0.7364378767256465
Precision 0.25071045153141774
Recall 0.7014134275618374
Accuracy for model with cross val: 92.028 (0.265)
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      9154
           1       0.41      0.37      0.39      1132

    accuracy                           0.87     10286
   macro avg       0.67      0.65      0.66     10286
weighted avg       0.87      0.87      0.87     10286

confusion mat