# Model Training

In [1]:
# Standard library imports
import joblib
import json
import pathlib
import warnings
warnings.filterwarnings("ignore")

# Third-party library imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Read data
data_filepath = pathlib.Path('train.csv')
data = pd.read_csv(data_filepath)

# Create "dummy" columns for categorical data
dummy_column_mapper = {}
for col in data.columns:
    if data[col].dtype == 'object':
        temp = pd.get_dummies(data[col], prefix=col, drop_first=True)
        data = data.drop(columns=[col])
        data[temp.columns] = temp
        dummy_column_mapper[col] = temp.columns.tolist()

# Save mapper for dummy columns
with open('dummy_column_mapper.json', 'w') as fout:
    json.dump(dummy_column_mapper, fout)

# Prepare data for model training
target = 'Exited'
features = [col for col in data.columns if col != target]
binary_columns = [col for col in features if sorted(data[col].unique().tolist()) == [0, 1]]

X = data[features].copy()
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.25, 
    random_state=69,
)

# Save column order of training data
with open('col_order.json', 'w') as fout:
    json.dump(X_train.columns.tolist(), fout)

# Fit scaler
scaler = StandardScaler()
scaler = scaler.fit(X_train)

# Save scaling information
scaler_filepath = pathlib.Path('scaler_info.json')

scaler_dict  = {}
for feature, mean, scale in zip(features, scaler.mean_, scaler.scale_):
    if feature in binary_columns:
        scaler_dict[feature] = {
            'mean': 0,
            'std': 1,
        }
    else:
        scaler_dict[feature] = {
            'mean': mean,
            'std': scale,
        }
        
with open(scaler_filepath, 'w') as fout:
    json.dump(scaler_dict, fout)
    
# Scale data
for col, col_params in scaler_dict.items():
    X_train.loc[:, col] = (X_train.loc[:, col] - col_params['mean'])/col_params['std']
    X_test.loc[:, col] = (X_test.loc[:, col] - col_params['mean'])/col_params['std']

# Fit random forest model
params = {
    'criterion': ['gini', 'entropy'], 
    'max_depth': [2, 5, 10], 
    'n_estimators': [10, 100], 
}

clf = GridSearchCV(RandomForestClassifier(random_state=0), params, error_score=0)
search = clf.fit(X_train, y_train)
best_params = search.best_params_ 

clf = RandomForestClassifier(random_state=0, **best_params)
clf = clf.fit(X_train.values, y_train.values) 

# Save model
joblib.dump(clf, 'rf_model.joblib')

['rf_model.joblib']

# Simulating deployment

In [2]:
new_data = pd.read_csv(data_filepath)

ckey = 0
raw_payload = new_data.loc[ckey].to_dict()
target = raw_payload.pop('Exited')

raw_payload

{'CreditScore': 597,
 'Geography': 'Germany',
 'Gender': 'Female',
 'Age': 35,
 'Tenure': 8,
 'Balance': 131101.04,
 'NumOfProducts': 1,
 'HasCrCard': 1,
 'IsActiveMember': 1,
 'EstimatedSalary': 192852.67}

In [3]:
with open('dummy_column_mapper.json') as fin:
    dummy_column_mapper = json.load(fin)
    
with open('scaler_info.json') as fin:
    scaler_info = json.load(fin)
    
with open('col_order.json') as fin:
    col_order = json.load(fin)
    
payload = dict(raw_payload)
for column, dummy_columns in dummy_column_mapper.items():
    for dummy_column in dummy_columns:
        payload[dummy_column] = 0
    if column in payload:
        column_val = payload.pop(column)
        target_column = f'{column}_{column_val}'
        payload[target_column] = 1
        
for key, scaler_params in scaler_info.items():
    if key in payload:
        payload[key] = (payload[key] - scaler_params['mean'])/scaler_params['std']
    else:
        payload[key] = scaler_params['mean']
        
ordered_payload = {}
for col in col_order:
    ordered_payload[col] = payload[col]
    
prediction = int(clf.predict(np.array(list(ordered_payload.values())).reshape(1, -1)))

In [4]:
prediction

0

In [5]:
import requests

base_endpoint = 'http://127.0.0.1:5000/'

r = requests.get(base_endpoint)
r.text

predict_endpoint = 'http://127.0.0.1:5000/predict'

r = requests.post(predict_endpoint, json=raw_payload)
int(r.text)

new_data = pd.read_csv(data_filepath)

for ckey in new_data.index.tolist()[:100]:
    raw_payload = new_data.loc[ckey].to_dict()
    target = raw_payload.pop('Exited')
    
    r = requests.post(predict_endpoint, json=raw_payload)
    prediction = int(r.text)
    if prediction == target:
        correct_statement = 'Correct!!!'
    else:
        correct_statement = ''

    print(f'Instance {ckey}: actual->{target}, prediction->{prediction}. {correct_statement}')

export FLASK_ENV=development

In [6]:
base_endpoint = 'https://msba-azure-deployment.azurewebsites.net'

In [7]:
r = requests.get(base_endpoint)
r.text

'App is Healthy'

In [8]:
predict_endpoint = 'https://msba-azure-deployment.azurewebsites.net/predict'

In [9]:
r = requests.post(predict_endpoint, json=raw_payload)
int(r.text)

ValueError: invalid literal for int() with base 10: '<html>\n  <head>\n    <title>Internal Server Error</title>\n  </head>\n  <body>\n    <h1><p>Internal Server Error</p></h1>\n    \n  </body>\n</html>\n'