In [1]:
import pandas as pd
import sklearn 
import numpy as np
from sklearn.model_selection import train_test_split
# Notebook-wide configuration.
# SEED is passed as `random_state` to the split, the imputer and the classifier below.
SEED = 42
# Fraction of rows held out as the validation set (passed as `test_size`).
VAL_SIZE = 0.2

## Data preprocessing

In [6]:
# Load the raw diabetes dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('deployment_pima/datasets/diabetes.csv')

In [12]:
# List the available columns; 'Outcome' is used as the target further down.
print(df.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


Glucose, BloodPressure, SkinThickness, Insulin, BMI and Age should not realistically be 0. If an observation has a 0 for any of these features, we can assume that the value is missing.

In [19]:
# Features for which a literal 0 is treated as a stand-in for "not recorded".
possible_miss = [
    'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'Age',
]
# Per-column count of entries that are exactly zero.
df[possible_miss].eq(0).sum()

Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
Age                0
dtype: int64

Zeros (i.e. missing values) occur in Glucose, BloodPressure, SkinThickness, Insulin and BMI; Age has none.

In [20]:
# Columns whose zero entries are recoded as NaN below ('Age' is left out: it had no zeros).
missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

In [70]:
# Recode the zero placeholders in the `missing` columns as NaN so the imputer
# can treat them as missing values.
#
# `Series.mask` returns a NEW column with NaN wherever the condition holds and
# that column then replaces the old one. This avoids writing NaN in place into
# an existing column via `.loc`, which, for an integer-typed column, depends on
# silent dtype upcasting that recent pandas releases deprecate.
# Re-running the cell is harmless: once the zeros are gone the mask is all-False.
for m in missing:
    df[m] = df[m].mask(df[m] == 0, np.nan)

Train/validation split

In [71]:
# Hold out a VAL_SIZE fraction of the rows as the validation set.
# random_state=SEED makes the shuffle reproducible across runs.
train, val = train_test_split(df, test_size=VAL_SIZE, random_state=SEED)

In [72]:
# Separate the target column ('Outcome') from the predictors for each split.
X_train = train.drop(columns='Outcome')
X_val = val.drop(columns='Outcome')

y_train = train['Outcome']
y_val = val['Outcome']

Impute missing values

In [73]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [74]:
# Iterative imputer capped at 30 rounds; random_state=SEED keeps it reproducible.
imp = IterativeImputer(max_iter=30, random_state=SEED)

In [75]:
# Fit on the training features only, so nothing is learned from the validation rows.
imp.fit(X_train)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=30, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=42,
                 sample_posterior=False, tol=0.001, verbose=0)

In [76]:
# Fill the NaNs in both splits with the imputer fitted on the training split.
#
# The transformed values are re-wrapped in a DataFrame. Passing `index=` as
# well as `columns=` keeps the original row labels; without it the rows would
# be renumbered 0..n-1 while y_train / y_val still carry the shuffled labels
# from the split, so any later label-aligned operation (join, concat, boolean
# indexing with y) would silently pair features with the wrong targets.
X_train = pd.DataFrame(
    imp.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    imp.transform(X_val), columns=X_val.columns, index=X_val.index
)

## Train a LightGBM model

In [77]:
from lightgbm import LGBMClassifier

In [78]:
# Gradient-boosted classifier: 500 trees with depth capped at 5, seeded with SEED.
lgbm = LGBMClassifier(max_depth=5, n_estimators=500, random_state=SEED)
# Train on the imputed training features.
lgbm.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=5,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [79]:
# Accuracy is the share of rows whose predicted label equals the true label,
# i.e. the mean of the boolean "prediction matches" vector.
training_accuracy = (lgbm.predict(X_train) == y_train).mean()
val_accuracy = (lgbm.predict(X_val) == y_val).mean()

In [80]:
# Display the accuracy on the held-out validation split.
val_accuracy

0.7077922077922078

## Save model

In [62]:
import joblib
# Persist the fitted classifier to the current working directory.
# NOTE(review): only the classifier is saved; the fitted imputer `imp` that
# prepared the training features is not. Confirm that whatever loads this file
# applies the same zero->NaN recoding and imputation before calling predict.
joblib.dump(lgbm, 'production_model.pkl')

['production_model.pkl']

In [63]:
import pathlib

In [66]:
# Show the absolute path of the parent of the current working directory.
# NOTE(review): the rendered output exposes a local filesystem path; consider clearing it before sharing.
pathlib.Path('.').resolve().parent

PosixPath('/home/stanleygan/Documents')