### Model Prep

In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
# Change the default plot style to the seaborn look.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8' and later
# releases dropped the bare 'seaborn' alias, so prefer the new name and fall
# back to the old one only on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Load the project dataset from its CSV file into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='projectdataset-1.csv')

In [3]:
# Separate the target from the features.
# 'Class' is the label column: it becomes y and is removed from X.
y = df['Class']
X = df.drop(columns=['Class'])  # drop by column name rather than by row

# Summarise the remaining feature columns (dtypes and non-null counts).
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.5+ MB


In [4]:
# Hold out 20% of the rows (test_size=0.2) as the test set and train on the rest.
# The split is stratified on y so both parts keep the same class proportions.
# random_state can be any fixed number; 42 is used here
# (see https://en.wikipedia.org/wiki/42_(number)).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show the size of each piece, one shape per line.
print(X_train.shape, X_test.shape, y_train.shape, sep='\n')



(36168, 16)
(9043, 16)
(36168,)


### Models

In [5]:
# Columns the classifiers are trained on, grouped by how they are preprocessed.

# Numeric columns (imputed and standardised by the numeric pipeline).
num_features = [
    'age',
    'balance',
    'day',
    'duration',
    'pdays',
]

# Categorical columns (imputed and one-hot encoded by the categorical pipeline).
cat_features = [
    'housing',
    'month',
    'poutcome',
    'contact',
]

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# A Pipeline is a list of (name, transformer) steps applied in order.
# The step names are arbitrary, but they are the prefixes used later to
# address hyperparameters in the grid search, so they must stay stable.

# Numeric branch:
#   1. 'num_imputer' fills missing values (default strategy here; the grid
#      search later tries different strategies).
#   2. 'scaler' standardises each feature to zero mean and unit variance, see
#      https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
num_pipeline = Pipeline([
    ('num_imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Categorical branch:
#   1. 'cat_imputer' fills missing values with the most frequent category.
#   2. 'onehot' one-hot encodes the categories.
cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder()),
])

# Route each group of columns to its branch and combine the results into a
# single preprocessor.
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_features),
    ('cat_pipeline', cat_pipeline, cat_features),
])

In [7]:
# Build the full decision tree model: the shared preprocessor followed by a
# DecisionTreeClassifier, chained in one pipeline so that preprocessing is
# fitted together with the classifier.
from sklearn.tree import DecisionTreeClassifier

pipeline_dt = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # Fix the seed: the tree breaks ties between equally good splits at
        # random, so without it repeated runs can give different trees and
        # scores. 42 matches the seed used for the train/test split.
        ('clf_dt', DecisionTreeClassifier(random_state=42)),
    ]
)

In [8]:
# Tune the decision tree pipeline with an exhaustive grid search scored by
# accuracy under 10-fold cross validation (cv=10).
# return_train_score is not passed, so only validation-fold scores are kept.
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to evaluate. Each key is the "full path" to a
# parameter: the pipeline step names joined by two underscores, followed by
# the parameter name.
#   - 2 imputer strategies for the numeric columns
#   - 2 split criteria x 5 maximum depths for the tree
# In total 2 x 2 x 5 = 20 combinations.
param_grid_dt = [
    {
        'clf_dt__max_depth': [3, 4, 5, 6, 7],
        'clf_dt__criterion': ['gini', 'entropy'],
        'preprocessor__num_pipeline__num_imputer__strategy': ['mean', 'median'],
    }
]

# Configure (but do not yet run) the grid search.
grid_search_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid_dt,
    scoring='accuracy',
    cv=10,
)

In [None]:
# Run the decision tree grid search on the training split; every candidate is
# fitted through the full pipeline (preprocessing + classifier).
grid_search_dt.fit(X=X_train, y=y_train)

In [None]:
# Show the hyperparameter combination that achieved the best mean
# cross-validated accuracy in the decision tree search.
grid_search_dt.best_params_

In [None]:
# List the built-in keys available in the cross-validation results dictionary.
sorted(grid_search_dt.cv_results_.keys())

In [None]:
# Mean cross-validated score of each of the 20 decision tree candidates.
# "test" here refers to the held-out validation folds, not the final test set.
grid_search_dt.cv_results_['mean_test_score']

In [None]:
# Mean cross-validated score of the best decision tree candidate.
grid_search_dt.best_score_

In [None]:
# Print the best mean cross-validated score of the decision tree search.
print('best dt score is: ', grid_search_dt.best_score_)


In [None]:
# Select the best pipeline found by the decision tree grid search.
# Displaying it shows the chosen parameters; note that a bare SimpleImputer()
# in the output implies the mean strategy is used.
clf_best = grid_search_dt.best_estimator_
clf_best

In [None]:
# Predict labels for the held-out test set.
# Calling predict on the pipeline applies all of its preprocessing steps to
# X_test before the classifier makes its prediction.
y_pred = clf_best.predict(X_test)

In [None]:
# Inspect the steps of the selected pipeline, keyed by step name.
clf_best.named_steps

In [None]:
# Inspect the preprocessing step of the selected pipeline.
clf_best.named_steps['preprocessor']

In [None]:
# Recover the column names produced by the one-hot encoder so the model's
# feature importances can be labelled.
onehot_encoder = (
    clf_best.named_steps['preprocessor']
    .named_transformers_['cat_pipeline']
    .named_steps['onehot']
)

# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out. Prefer the new method and
# fall back to the old one only on releases that predate it.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_columns = list(onehot_encoder.get_feature_names_out(cat_features))
else:
    onehot_columns = list(onehot_encoder.get_feature_names(input_features=cat_features))



In [None]:
# Feature importances of the fitted decision tree step, one value per
# preprocessed feature column; displayed as the cell output.
i = clf_best.named_steps["clf_dt"].feature_importances_
i

In [None]:
# Full list of model input names in preprocessor order: the numeric features
# first, followed by the one-hot encoded categorical columns.
# (Despite its name, this list is not limited to numeric features.)
numeric_features_list = [*num_features, *onehot_columns]

In [None]:
# Print the combined list of numeric and one-hot encoded feature names.
print(numeric_features_list)


In [None]:
# Render the decision tree's feature weights with eli5, labelled with the
# preprocessed feature names. Up to 50 features are shown and the '<BIAS>'
# entry is filtered out.
import eli5

eli5.explain_weights(
    clf_best.named_steps["clf_dt"],
    top=50,
    feature_names=numeric_features_list,
    feature_filter=lambda name: name != '<BIAS>',
)

In [None]:
# Tabulate the feature importances by feature name and print them from the
# most to the least important.
r = pd.DataFrame({'importance': i}, index=numeric_features_list)

print(r.sort_values(by='importance', ascending=False))

### Persist the Model
The following code shows how to save the trained model as a pickle file, which can later be loaded to make predictions.

In [None]:
# Try a random forest classifier, tuned the same way as the decision tree.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


# Random forest pipeline: shared preprocessor followed by the classifier.
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    # Fix the seed: a random forest bootstraps rows and samples features at
    # random, so without it every run gives different scores. 42 matches the
    # seed used for the train/test split.
    ('clf_rf', RandomForestClassifier(random_state=42)),
])

# Here we try 2 split criteria x 3 forest sizes = 6 different models.
param_grid_rf = [
    {
        'clf_rf__criterion': ['gini', 'entropy'],
        'clf_rf__n_estimators': [50, 100, 150],
    }
]

# Set up the grid search (10-fold cross validation, scored by accuracy).
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=10, scoring='accuracy')

In [None]:
%%time
# Train the random forest grid search on the training split using the full
# pipeline. The %%time cell magic (which must stay on the first line of the
# cell) reports how long the search takes.
grid_search_rf.fit(X_train, y_train)

In [None]:
# Re-select the best decision tree pipeline (not the random forest fitted
# above) and predict the labels of the test set; the predictions are shown
# as the cell output.
# NOTE(review): confirm the decision tree, rather than the random forest, is
# the intended model at this point.
clf_best = grid_search_dt.best_estimator_
y_pred = clf_best.predict(X_test)
y_pred

In [None]:
# Take a single test-set customer (positional row 23) as a one-row DataFrame
# that can be passed to predict.
# Indexing with a list keeps the result two-dimensional and preserves every
# column's dtype; going through a Series (.iloc[23].to_frame().T) would
# upcast all columns, including the numeric ones, to object.
banker = X_test.iloc[[23]]

# Display the record transposed so the fields read top to bottom.
banker.T

In [None]:
# Compare the shape of the single-customer frame with the full test set.
banker.shape, X_test.shape

In [None]:
# Predict the class of the single customer with the currently selected model.
test = clf_best.predict(banker)

In [None]:
# Try a support vector machine classifier.
from sklearn.svm import SVC

# SVC pipeline: shared preprocessor followed by the classifier.
pipeline_svc = Pipeline([
    ('preprocessor', preprocessor),
    ('clf_svc', SVC()),
])

# Three kernels are tried, plus three degree values for the polynomial
# kernel: 5 different combinations in total.
# 'degree' only affects the 'poly' kernel, so it gets its own sub-grid.
# Crossing it with 'linear' and 'rbf' in a single dict would evaluate 9
# candidates, four of them duplicates (40 wasted fits at cv=10).
param_grid_svc = [
    {
        'clf_svc__kernel': ['linear', 'rbf'],
    },
    {
        'clf_svc__kernel': ['poly'],
        'clf_svc__degree': [3, 4, 5],  # only for poly kernel
    },
]

# Set up the grid search (10-fold cross validation, scored by accuracy).
grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=10, scoring='accuracy')

In [None]:
# Run the SVC grid search on the training split; every candidate is fitted
# through the full pipeline (preprocessing + classifier).
grid_search_svc.fit(X=X_train, y=y_train)

In [None]:
# Mean cross-validated score of the best SVC candidate.
grid_search_svc.best_score_

### XGBoost

In [None]:
import xgboost as xgb
from sklearn.base import clone
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Gradient-boosted tree classifier, tuned together with a PCA step.
# NOTE(review): recent xgboost releases expect integer class labels
# 0..n_classes-1; confirm how 'Class' is encoded and label-encode y_train
# first if necessary.
model = xgb.XGBClassifier()

# The raw training frame contains string columns, so it has to go through the
# shared preprocessor (imputation, scaling, one-hot encoding) before it can be
# scaled and projected. A clone is used so the preprocessor shared with the
# other pipelines is left untouched; sparse_threshold=0 forces a dense output,
# which StandardScaler (with centring) and PCA require.
xgb_preprocessor = clone(preprocessor).set_params(sparse_threshold=0)

pipeline = Pipeline([
    ('preprocessor', xgb_preprocessor),
    ('standard_scaler', StandardScaler()),
    ('pca', PCA()),
    ('model', model)
])

# 6 x 5 x 3 = 90 candidate settings.
# NOTE(review): pca__n_components cannot exceed the number of columns the
# preprocessor produces; confirm the larger values are valid for this data
# (invalid candidates are scored as NaN by the grid search).
param_grid = {
    'pca__n_components': [5, 10, 15, 20, 25, 30],
    'model__max_depth': [2, 3, 5, 7, 10],
    'model__n_estimators': [10, 100, 500],
}

# 5-fold cross validation scored by ROC AUC, using all available cores.
grid_xgboost = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='roc_auc')

# Fit on the raw training split; preprocessing happens inside the pipeline.
grid_xgboost.fit(X_train, y_train)

# Mean and spread of the cross-validated score for the best candidate.
mean_score = grid_xgboost.cv_results_["mean_test_score"][grid_xgboost.best_index_]
std_score = grid_xgboost.cv_results_["std_test_score"][grid_xgboost.best_index_]

print(f"Best parameters: {grid_xgboost.best_params_}")
print(f"Mean CV score: {mean_score: .6f}")
print(f"Standard deviation of CV score: {std_score: .6f}")

In [None]:
# Compare the best mean cross-validated accuracy of the three grid searches
# (decision tree, SVC, random forest).
print('best dt score is: ', grid_search_dt.best_score_)
print('best svc score is: ', grid_search_svc.best_score_)
print('best rf score is: ', grid_search_rf.best_score_)

In [None]:
# Carry the best random forest pipeline forward as the final model and
# display it. A bare SimpleImputer() in the output implies the mean strategy
# is used.
# NOTE(review): the random forest is hard-coded here; confirm it is the top
# scorer in the comparison printed above.
clf_best = grid_search_rf.best_estimator_
clf_best

In [None]:
# Persist the selected pipeline (preprocessing + classifier) to disk as a
# pickle file using joblib.
import joblib

joblib.dump(value=clf_best, filename="clf-best.pickle")

In [None]:
# Read the persisted pipeline back from its pickle file and display it.
# Pickle files can execute arbitrary code when loaded, so only load files
# that come from a trusted source.
saved_tree_clf = joblib.load(filename="clf-best.pickle")
saved_tree_clf

In [None]:
# A hand-written customer record, built as a one-row DataFrame with the same
# columns as the training features, to try the model on new data.
banker1 = pd.DataFrame([
    {
        'age': 33,
        'job': 'self-employed',
        'marital': 'married',
        'education': 'secondary',
        'default': 'no',
        'balance': 0,
        'housing': 'no',
        'loan': 'no',
        'contact': 'cellular',
        'day': 18,
        'month': 'aug',
        'duration': 73,
        'campaign': 7,
        'pdays': -1,
        'previous': 0,
        'poutcome': 'success',
    }
])

# Predict the class of the new customer with the selected model.
clf_best.predict(banker1)

In [None]:
# Predict the class of the single test-set customer with the selected model.
clf_best.predict(banker)

In [None]:
# Final check on the held-out test set.
# Calling predict on the pipeline applies all of its preprocessing steps to
# X_test before the classifier makes its prediction.
from sklearn.metrics import accuracy_score

y_pred = clf_best.predict(X_test)

# Accuracy against y_test, the ground truth for the test set. Compare this
# with the cross-validation score printed earlier: a similar value suggests
# the model is not over-fitted to the training folds.
print(f'Accuracy Score : {accuracy_score(y_test, y_pred)}')

In [None]:
# Inspect the preprocessing step of the final selected pipeline.
clf_best.named_steps['preprocessor']