# Machine Learning Model Building Workflow
Sumudu Tennakoon, PhD

To learn more about Python, refeer to the following websites

* Python : www.python.org
* W3Schools : www.w3schools.com/python

To learn more about the Python packages we explore in this notebook, refeer to the following websites

* NumPy : www.numpy.org
* Matplotlib : www.matplotlib.org
* Pandas : https://pandas.pydata.org
* Scikit-Learn : https://scikit-learn.org/
* Seaborn: https://seaborn.pydata.org/
* StatsModel : https://www.statsmodels.org

In [111]:
import numpy as np
import pandas as pd

from matplotlib import pyplot
import seaborn as sns

import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
from matplotlib import cm # Colomaps
import seaborn as sns
from sklearn import tree

#deep learning
import tensorflow as tf

# Classifier algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#train test split
from sklearn.model_selection import train_test_split

# Model evaluation
from sklearn import metrics

# Load Data

In [112]:
file_name = 'https://raw.githubusercontent.com/SumuduTennakoon/MLFoundations/main/Datasets/income_data.csv'

# Load CSV File

data = pd.read_csv(file_name)
data.sample(20)

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
838,838,24,Private,321435,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0.0,0.0,50.0,United-States,<=50K
16302,16308,25,Private,109609,Some-college,10,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,25.0,United-States,<=50K
39103,14288,19,Private,30597,HS-grad,9,Never-married,Sales,Own-child,White,Male,0.0,0.0,35.0,United-States,<=50K.
16184,16190,39,Private,255503,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0.0,0.0,40.0,United-States,>50K
39134,14319,34,Private,253616,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,60.0,United-States,>50K.
2945,2945,31,Private,176360,HS-grad,9,Never-married,Craft-repair,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
15035,15040,25,State-gov,156848,HS-grad,9,Married-civ-spouse,Protective-serv,Own-child,White,Male,0.0,0.0,35.0,United-States,<=50K
8903,8905,26,Private,152436,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0.0,0.0,50.0,United-States,<=50K
34865,10047,27,Federal-gov,190653,Assoc-voc,11,Married-civ-spouse,Armed-Forces,Husband,White,Male,0.0,0.0,40.0,?,>50K.
6105,6107,39,Private,166744,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0.0,0.0,38.0,United-States,<=50K


# Pre-process Data for Training

In [114]:
data.drop(labels='Unnamed: 0', axis=1, inplace=True)

In [115]:
data.dropna(how='any', axis=0, inplace=True)

In [116]:
data['class'].replace(' >50K.', ' >50K')
data['class'].replace(' <=50K.', ' <=50K')
data['y_act'] = np.where(data['class']==' >50K',1,0)

In [117]:
X_variables = ['age',  'hours_per_week', 'education_num']
data[X_variables].head()

Unnamed: 0,age,hours_per_week,education_num
0,39,40.0,13
1,50,13.0,13
2,38,40.0,9
3,53,40.0,7
4,28,40.0,13


In [118]:
y_varibale = 'y_act'
data[y_varibale].head()

0    0
1    0
2    0
3    0
4    0
Name: y_act, dtype: int32

In [119]:
X = data[X_variables].values
X

array([[39., 40., 13.],
       [50., 13., 13.],
       [38., 40.,  9.],
       ...,
       [38., 50., 13.],
       [44., 40., 13.],
       [35., 60., 13.]])

In [120]:
y = data[y_varibale].values
y

array([0, 0, 0, ..., 0, 0, 0])

## Pre-processing Function for Prediction

In [46]:
def pre_processing(data):
    # Select Columns
    X_variables = ['age',  'hours_per_week', 'education_num']
    return data[X_variables]

# Train Test Split

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(F"Train sample size = {len(X_train)}")
print(F"Test sample size  = {len(X_test)}")

Train sample size = 28765
Test sample size  = 12329


# Model Training Function

In [122]:
def model_train(model, model_name, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    test_result = pd.DataFrame(data={'y_act':y_test, 'y_pred':y_pred, 'y_pred_prob':y_pred_prob})

    accuracy = metrics.accuracy_score(test_result['y_act'], test_result['y_pred']) 
    precision = metrics.precision_score(test_result['y_act'], test_result['y_pred'], average='binary', pos_label=1)
    f1_score = metrics.f1_score(test_result['y_act'], test_result['y_pred'], average='weighted')  #weighted accounts for label imbalance.
    roc_auc = metrics.roc_auc_score(test_result['y_act'], test_result['y_pred_prob'])

    return ({'model_name':model_name, 
                   'model':model, 
                   'accuracy':accuracy, 
                   'precision':precision,
                  'f1_score':f1_score,
                  'roc_auc':roc_auc,
                  })

## Post-processing Function

In [123]:
def post_processing(prediction):
    if len(prediction)==1:
        return prediction[:, 1][0]
    else:
        return prediction[:, 1]

# Fitting Multipe Models with Different Hyperparamaters

In [124]:
models = []

models.append(model_train(RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=3, verbose=3), 'rf1', X_train, y_train, X_test, y_test))
models.append(model_train(RandomForestClassifier(n_estimators=500, max_depth=None, n_jobs=3, verbose=3), 'rf2', X_train, y_train, X_test, y_test))
models.append(model_train(RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=3, verbose=3), 'rf3', X_train, y_train, X_test, y_test))
models.append(model_train(RandomForestClassifier(n_estimators=500, max_depth=20, n_jobs=3, verbose=3), 'rf4', X_train, y_train, X_test, y_test))

models = pd.DataFrame(models)
models

Unnamed: 0,model_name,model,accuracy,precision,f1_score,roc_auc
0,rf1,"(DecisionTreeClassifier(ccp_alpha=0.0, class_w...",0.843945,0.398625,0.80824,0.722964
1,rf2,"(DecisionTreeClassifier(ccp_alpha=0.0, class_w...",0.843053,0.388215,0.806911,0.725579
2,rf3,"(DecisionTreeClassifier(ccp_alpha=0.0, class_w...",0.854571,0.544218,0.798115,0.780668
3,rf4,"(DecisionTreeClassifier(ccp_alpha=0.0, class_w...",0.844188,0.390892,0.805942,0.731489


# Grid Search

In [152]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [100,500], 'max_depth': [None, 10, 20]}
gs_model = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=2, verbose=3, pre_dispatch=2)
gs_model.fit(X_train, y_train)          

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:  1.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [153]:
# Best Model Paramaters
print(gs_model.best_params_) 

{'max_depth': 10, 'n_estimators': 100}


In [82]:
from sklearn.metrics import classification_report, confusion_matrix 

y_pred = gs_model.predict(X_test) 

print(classification_report(y_test, y_pred)) 
print(confusion_matrix(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.86      0.99      0.92     10523
           1       0.56      0.04      0.08      1806

    accuracy                           0.85     12329
   macro avg       0.71      0.52      0.50     12329
weighted avg       0.81      0.85      0.80     12329

[[10461    62]
 [ 1727    79]]


# Saving Best Model

In [31]:
import pickle

In [125]:
# Select best model 
model = models.query("model_name=='rf3'")
model 

Unnamed: 0,model_name,model,accuracy,precision,f1_score,roc_auc
2,rf3,"(DecisionTreeClassifier(ccp_alpha=0.0, class_w...",0.854571,0.544218,0.798115,0.780668


In [129]:
model['model'].values[0]

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [130]:
save_file = 'model_rf3_test.pickle'
pickle.dump(model['model'].values[0], open(save_file, 'wb'))

# Load Saved Model

In [131]:
model = pickle.load(open(save_file, 'rb'))

# Predict on a Sample Data

In [139]:
sample_input = data.sample(10)
sample_input

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class,y_act
10126,37,Private,167851,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K,0
12911,18,Private,91525,HS-grad,9,Never-married,Sales,Other-relative,White,Male,0.0,0.0,25.0,United-States,<=50K,0
33259,58,Private,144092,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,35.0,United-States,>50K.,0
11772,35,Private,272338,Assoc-voc,11,Divorced,Prof-specialty,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K,0
10399,22,Private,110200,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0
26066,28,Private,106672,Bachelors,13,Never-married,Sales,Own-child,White,Male,0.0,0.0,2.0,United-States,<=50K.,0
36745,33,Private,198103,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0.0,0.0,40.0,United-States,<=50K.,0
15528,29,Private,167716,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0.0,0.0,99.0,United-States,<=50K,0
30669,33,Private,265204,HS-grad,9,Divorced,Machine-op-inspct,Unmarried,Black,Female,0.0,0.0,40.0,United-States,<=50K.,0
7334,40,Private,220589,Some-college,10,Married-civ-spouse,Sales,Wife,White,Female,0.0,0.0,40.0,United-States,>50K,1


In [140]:
pre_processing(sample_input)

Unnamed: 0,age,hours_per_week,education_num
10126,37,50.0,9
12911,18,25.0,9
33259,58,35.0,14
11772,35,40.0,11
10399,22,40.0,13
26066,28,2.0,13
36745,33,40.0,9
15528,29,99.0,9
30669,33,40.0,9
7334,40,40.0,10


In [141]:
model.predict_proba(pre_processing(sample_input))

array([[8.16188009e-01, 1.83811991e-01],
       [9.99220191e-01, 7.79808793e-04],
       [7.15829951e-01, 2.84170049e-01],
       [8.55277175e-01, 1.44722825e-01],
       [9.46558823e-01, 5.34411771e-02],
       [9.84367682e-01, 1.56323181e-02],
       [9.31913860e-01, 6.80861399e-02],
       [9.55648049e-01, 4.43519515e-02],
       [9.31913860e-01, 6.80861399e-02],
       [8.41022442e-01, 1.58977558e-01]])

In [142]:
post_processing(model.predict_proba(pre_processing(sample_input)))

array([0.18381199, 0.00077981, 0.28417005, 0.14472283, 0.05344118,
       0.01563232, 0.06808614, 0.04435195, 0.06808614, 0.15897756])

In [143]:
sample_input['prediction'] = post_processing(model.predict_proba(pre_processing(sample_input)))
sample_input

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class,y_act,prediction
10126,37,Private,167851,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K,0,0.183812
12911,18,Private,91525,HS-grad,9,Never-married,Sales,Other-relative,White,Male,0.0,0.0,25.0,United-States,<=50K,0,0.00078
33259,58,Private,144092,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,35.0,United-States,>50K.,0,0.28417
11772,35,Private,272338,Assoc-voc,11,Divorced,Prof-specialty,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K,0,0.144723
10399,22,Private,110200,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,0,0.053441
26066,28,Private,106672,Bachelors,13,Never-married,Sales,Own-child,White,Male,0.0,0.0,2.0,United-States,<=50K.,0,0.015632
36745,33,Private,198103,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0.0,0.0,40.0,United-States,<=50K.,0,0.068086
15528,29,Private,167716,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0.0,0.0,99.0,United-States,<=50K,0,0.044352
30669,33,Private,265204,HS-grad,9,Divorced,Machine-op-inspct,Unmarried,Black,Female,0.0,0.0,40.0,United-States,<=50K.,0,0.068086
7334,40,Private,220589,Some-college,10,Married-civ-spouse,Sales,Wife,White,Female,0.0,0.0,40.0,United-States,>50K,1,0.158978


In [144]:
sample_output = post_processing(model.predict_proba(pre_processing(sample_input)))
sample_output

array([0.18381199, 0.00077981, 0.28417005, 0.14472283, 0.05344118,
       0.01563232, 0.06808614, 0.04435195, 0.06808614, 0.15897756])

# Prediction Function for Application

In [71]:
def app_prediction_function(input_data, model):
    return post_processing(model.predict_proba(pre_processing(input_data)))

In [147]:
input_data = data.sample(1)
print(input_data)
app_prediction_function(input_data, model)

      age workclass  fnlwgt    education  education_num marital_status  \
1063   34   Private  157886   Assoc-acdm             12      Separated   

          occupation relationship    race      sex  capital_gain  \
1063   Other-service    Unmarried   White   Female           0.0   

      capital_loss  hours_per_week  native_country   class  y_act  
1063           0.0            40.0   United-States   <=50K      0  


0.09801329557711586

# Use Joblib to save model (supports threading)

In [148]:
import joblib

In [149]:
# Select best model 
model = models.query("model_name=='rf3'")
model 

Unnamed: 0,model_name,model,accuracy,precision,f1_score,roc_auc
2,rf3,"(DecisionTreeClassifier(ccp_alpha=0.0, class_w...",0.854571,0.544218,0.798115,0.780668


In [150]:
save_file = 'model_rf3_test.joblib'
joblib.dump(model['model'].values[0], open(save_file, 'wb'))

In [151]:
model = joblib.load(save_file)

<hr>
Last update 2021-11-07 by Sumudu Tennakoon

In [93]:
input_data.to_dict(orient='records')[0]

{'age': 24,
 'workclass': ' ?',
 'fnlwgt': 83783,
 'education': ' 7th-8th',
 'education_num': 4,
 'marital_status': ' Never-married',
 'occupation': ' ?',
 'relationship': ' Not-in-family',
 'race': ' White',
 'sex': ' Female',
 'capital_gain': 0.0,
 'capital_loss': 0.0,
 'hours_per_week': 25.0,
 'native_country': ' United-States',
 'class': ' <=50K.',
 'y_act': 0}