# Pipeline export

Import the necessary packages:

In [21]:
import re
import os
import json
import random
import requests
import joblib
import pickle
import pandas as pd
import numpy as np

# SQLite
import sqlite3

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Train-test split and Grid Search
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Feature Engine
from feature_engine.selection import DropFeatures
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer, ArbitraryNumberImputer
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Utils
from utils import *

Read the data, and create a copy of it:

In [2]:
# Load the training set from disk; `df` keeps the frame exactly as read.
df = pd.read_csv('data/train_data.csv')

# Independent working copy (DataFrame.copy is deep by default), so later
# steps never touch `df`.
df_ = df.copy()

X-y split:

In [3]:
# Name of the label column.
target = 'readmitted'

# Features: every column except the label.
X = df_.drop(columns=[target])

# Label: False only where the recorded value is exactly 'No'; every other
# value (missing values included) becomes True.
y = df_[target] != 'No'

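Remove the records with gender equal to 'Unknown/Invalid' or complete vaccination status equal to 'None', keep only the labels of the remaining records, and cast the ID and count columns to float: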

In [4]:
# Drop rows whose gender is 'Unknown/Invalid', then rows whose vaccination
# status is the literal string 'None'.
X = (
    X
    .loc[lambda frame: frame['gender'] != 'Unknown/Invalid']
    .loc[lambda frame: frame['complete_vaccination_status'] != 'None']
)

# Keep the labels aligned with the rows that survived the filters.
y = y.loc[y.index.isin(X.index)]

# Cast the identifier and count columns to float in a single pass.
X = X.astype({
    column: float
    for column in (
        'admission_id',
        'patient_id',
        'admission_source_code',
        'time_in_hospital',
        'num_procedures',
        'number_outpatient',
        'number_emergency',
        'number_inpatient',
        'number_diagnoses',
    )
})

In [5]:
# Display the dtype of every feature column (these dtypes are exported further down).
X.dtypes

admission_id                   float64
patient_id                     float64
race                            object
gender                          object
age                             object
weight                          object
admission_type_code            float64
discharge_disposition_code     float64
admission_source_code          float64
time_in_hospital               float64
payer_code                      object
medical_specialty               object
has_prosthesis                    bool
complete_vaccination_status     object
num_lab_procedures             float64
num_procedures                 float64
num_medications                float64
number_outpatient              float64
number_emergency               float64
number_inpatient               float64
diag_1                          object
diag_2                          object
diag_3                          object
number_diagnoses               float64
blood_type                      object
hemoglobin_level         

## Pipeline

Separation of feature types:

In [7]:
# Columns removed by the pipeline's first step. An earlier version of these
# lists kept 'race' and 'gender' as categorical features and 'age' as a
# numerical one; all three are now dropped as well.
features_to_drop = [
    'admission_id',
    'patient_id',
    'weight',
    'medical_specialty',
    'payer_code',
    'blood_type',
    'has_prosthesis',
    'race',
    'age',
    'gender',
]

# Categorical columns: imputed with the most frequent value and then
# rare-label encoded by the pipeline.
categorical_features = [
    'admission_type_code',
    'discharge_disposition_code',
    'admission_source_code',
    'complete_vaccination_status',
    'blood_transfusion',
    'max_glu_serum',
    'A1Cresult',
    'diuretics',
    'insulin',
    'change',
    'diabetesMed',
    'diag_1',
    'diag_2',
    'diag_3',
]

# Categorical columns one-hot encoded with `top_categories = 2`; every other
# categorical column gets the plain one-hot encoding.
# ('gender' used to be listed here before it was dropped.)
categorical_features_ohe_2mf = ['complete_vaccination_status']

# Numerical columns whose missing values are filled with 0.
# Each name appears exactly once: 'number_diagnoses' used to be listed twice,
# which is redundant and which newer feature_engine releases are expected to
# reject (duplicated names in `variables`) -- TODO confirm against the
# installed version.
numerical_features_imputation = [
    'num_medications',
    'num_lab_procedures',
    'num_procedures',
    'number_diagnoses',
    'time_in_hospital',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
]

# Numerical columns whose missing values are filled with the median.
numerical_features = ['hemoglobin_level']

In [10]:
# End-to-end model pipeline: column clean-up, imputation, encoding, classifier.
# The Cast*/Correct*/Convert* steps are project transformers pulled in by
# `from utils import *`; their exact behaviour is not visible in this notebook.
pipe = Pipeline(steps=[
    # Discard the columns that are not used as features.
    ('drop', DropFeatures(features_to_drop=features_to_drop)),

    # Fix column types.
    ('object', CastString()),

    # Clean up the admission type code.
    ('admission_type', CorrectAdmissionTypeCode()),

    # Clean up the discharge disposition code.
    ('discharge_code', CorrectDischargeDispositionCode()),

    # Clean up the admission source code.
    ('admission_code', CorrectAdmissionSourceCode()),

    # Clean up the max glu serum values.
    ('max_glu_serum', CorrectMaxGluSerum()),

    # Clean up the A1C result values.
    ('a1cresult', CorrectA1CResult()),

    # Convert the diag_1, diag_2 and diag_3 columns.
    ('diag_x', ConvertDiag()),

    # Categorical gaps -> most frequent category.
    (
        'categorical_imputer',
        CategoricalImputer(
            imputation_method='frequent',
            variables=categorical_features,
        ),
    ),

    # Numerical gaps in `numerical_features` -> median.
    (
        'numerical_imputer_median',
        MeanMedianImputer(
            imputation_method='median',
            variables=numerical_features,
        ),
    ),

    # Numerical gaps in `numerical_features_imputation` -> 0.
    (
        'numerical_imputer_0',
        ArbitraryNumberImputer(
            arbitrary_number=0,
            variables=numerical_features_imputation,
        ),
    ),

    # Group infrequent categories (tolerance 1%).
    (
        'rare',
        RareLabelEncoder(
            tol=0.01,
            n_categories=2,
            variables=categorical_features,
        ),
    ),

    # Plain one-hot encoding for every categorical feature that is not
    # reserved for the top-2 encoding in the next step.
    (
        'ohe',
        OneHotEncoder(
            variables=[
                feature
                for feature in categorical_features
                if feature not in categorical_features_ohe_2mf
            ],
        ),
    ),

    # One-hot encoding restricted to the 2 most frequent categories.
    # Despite the step name, the variables come from
    # `categorical_features_ohe_2mf`, not from the diag_* columns.
    (
        'ohe_diag',
        OneHotEncoder(
            top_categories=2,
            variables=categorical_features_ohe_2mf,
        ),
    ),

    # Final estimator.
    (
        'clf',
        RandomForestClassifier(
            n_estimators=1000,
            max_depth=10,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

Fit on the entire dataset:

In [11]:
# Fit every pipeline step, including the final classifier, on the filtered X / y.
# As the last expression of the cell, the fitted pipeline's repr is the cell output.
pipe.fit(X, y)

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)


Pipeline(steps=[('drop',
                 DropFeatures(features_to_drop=['admission_id', 'patient_id',
                                                'weight', 'medical_specialty',
                                                'payer_code', 'blood_type',
                                                'has_prosthesis', 'race', 'age',
                                                'gender'])),
                ('object', CastString()),
                ('admission_type', CorrectAdmissionTypeCode()),
                ('discharge_code', CorrectDischargeDispositionCode()),
                ('admission_code', CorrectAdmissionSourceCode())...
                                          'discharge_disposition_code',
                                          'admission_source_code',
                                          'blood_transfusion', 'max_glu_serum',
                                          'A1Cresult', 'diuretics', 'insulin',
                                          'change', 'diab

Export the pipeline:

In [12]:
# Destination folder for the exported artifacts. It can be overridden with the
# PIPELINE_EXPORT_DIR environment variable, so the notebook also runs on
# machines where the original absolute path does not exist; without the
# variable, the original project location is used.
directory = os.environ.get(
    'PIPELINE_EXPORT_DIR',
    '/home/pgomes/projects/model-deployment/ldssa_capstone_project',
)
os.makedirs(directory, exist_ok=True)

# Column names, in the order the pipeline was fitted on.
with open(os.path.join(directory, 'columns.json'), 'w') as fh:
    json.dump(X.columns.tolist(), fh)

# Column dtypes, pickled as a pandas Series.
with open(os.path.join(directory, 'dtypes.pickle'), 'wb') as fh:
    pickle.dump(X.dtypes, fh)

# Fitted pipeline; the trailing ';' hides the list of written files.
joblib.dump(pipe, os.path.join(directory, 'pipeline.pickle'));

## Test the API on the localhost

Predict:

In [81]:
# Prediction endpoint of the locally running API.
url = "http://localhost:5000/predict"

# Random admission id between 200 and 1000 (both inclusive), echoed for reference.
admission_id = random.randint(200, 1000)
print(admission_id)

# Payload mixing missing values, unexpected types and a 'diag_4' key that no
# other payload in this notebook sends -- presumably to exercise the API's
# input validation (the recorded run was rejected with a 422 for `change`).
observation = {
    "admission_id": admission_id,
    "patient_id": 2,
    "race": 'portuguese',
    "gender": None,
    "age": '(a00)',
    "weight": '?',
    "admission_type_code": np.nan,
    "discharge_disposition_code": 12.0,
    "admission_source_code": 10.0,
    "time_in_hospital": 123.2,
    "payer_code": 4,
    "medical_specialty": None,
    "has_prosthesis": None,
    "complete_vaccination_status": None,
    "num_lab_procedures": None,
    "num_procedures": None,
    "num_medications": None,
    "number_outpatient": np.nan,
    "number_emergency": None,
    "number_inpatient": None,
    "diag_1": True,
    "diag_2": None,
    "diag_3": None,
    "diag_4": None,
    "number_diagnoses": None,
    "blood_type": np.nan,
    "hemoglobin_level": True,
    "blood_transfusion": np.nan,
    "max_glu_serum": None,
    "A1Cresult": None,
    "diuretics": None,
    "insulin": None,
    "change": 2,
    "diabetesMed": None,
}

r = requests.post(url, json=observation)

# Show the HTTP status first, then the decoded JSON body.
print(r)
print(r.json())

411
<Response [422]>
{'error': "Invalid value provided for change: 2. Allowed values are: 'no','ch'. Please recheck the value you are trying to insert."}


Update:

In [39]:
# Update endpoint of the locally running API.
url = "http://localhost:5000/update"

# Outcome report for admission 5. 'Banana' is not a Yes/No value; the
# recorded run shows the API answered 200 and echoed it back.
observation = {"admission_id": 5, "readmitted": 'Banana'}

r = requests.post(url, json=observation)

# Show the HTTP status first, then the decoded JSON body.
print(r)
print(r.json())

<Response [200]>
{'actual_readmitted': 'Banana', 'admission_id': 5, 'predicted_readmitted': 'No'}


Let's check the SQLite Database:

In [86]:
# List the tables present in the local predictions database.
query = "SELECT name FROM sqlite_master WHERE type = 'table';"

connection = sqlite3.connect('predictions.db')
try:
    cursor = connection.cursor()
    cursor.execute(query)

    print(cursor.fetchall())
finally:
    # Always release the database handle, even if the query fails, so the
    # file is not kept open for the rest of the kernel session.
    connection.close()

[]


## Test the API on Heroku

Predict:

In [18]:
# Prediction endpoint of the app deployed on Heroku.
APP_NAME = 'heroku-model-deploy-capstone'
url = "https://{}.herokuapp.com/predict".format(APP_NAME)

# Sample admission; note that "num_procedures" is sent as the string '3.o'.
observation = {
  "admission_id": 111117,
  "patient_id": 78790,
  "race": None,
  "gender": "Male",
  "age": "[20-30)",
  "weight": "?",
  "admission_type_code": 5,
  "discharge_disposition_code": 13,
  "admission_source_code": 5,
  "time_in_hospital": 3,
  "payer_code": "SP",
  "medical_specialty": "Emergency/Trauma",
  "has_prosthesis": True,
  "complete_vaccination_status": "Complete",
  "num_lab_procedures": 2,
  "num_procedures": '3.o',
  "num_medications": 52,
  "number_outpatient": 0,
  "number_emergency": 3,
  "number_inpatient": 5,
  "diag_1": "428",
  "diag_2": "276",
  "diag_3": "401",
  "number_diagnoses": 3,
  "blood_type": "A+",
  "hemoglobin_level": 14,
  "blood_transfusion": True,
  "max_glu_serum": ">200",
  "A1Cresult": ">8",
  "diuretics": "No",
  "insulin": "Yes",
  "change": "Ch",
  "diabetesMed": "Yes",
}

# Without a timeout, requests waits indefinitely for the remote app (the
# recorded run of this cell had to be interrupted by hand). With the timeout
# an unresponsive app raises an exception instead of blocking the kernel.
r = requests.post(url, json = observation, timeout = 60)

print(r)
print(r.json())

KeyboardInterrupt: 

Update:

In [92]:
# Update endpoint of the app deployed on Heroku.
APP_NAME = 'heroku-model-deploy-capstone'
url = "https://{}.herokuapp.com/update".format(APP_NAME)

# Report the true outcome for a previously predicted admission.
observation = {
    "admission_id": 111111,
    "readmitted" : 'Yes'
}

# Pass the dict itself: `json=` already serialises it. Wrapping it in
# json.dumps() first would send a JSON *string* (double encoding), unlike every
# other request in this notebook. The timeout keeps an unresponsive app from
# blocking the kernel indefinitely.
r = requests.post(url, json = observation, timeout = 60)
print(r)
print(r.json())

<Response [200]>
{'actual_readmitted': 'Yes', 'admission_id': 111111, 'predicted_readmitted': 'No'}


## Test with the trial files

In [82]:
# Load both trial files. The `with` blocks close each file even when
# json.load raises (e.g. on malformed JSON).
with open('moment_1_trial.json') as f1:
    file1 = json.load(f1)

with open('moment_2_trial.json') as f2:
    file2 = json.load(f2)

In [83]:
# To target the local API instead, use:
# url = "http://localhost:5000/predict"

APP_NAME = 'heroku-model-deploy-capstone'
url = f"https://{APP_NAME}.herokuapp.com/predict"

# Send each trial observation to the prediction endpoint, one request per record.
for obs in file1:
    # The payload is stored under the record's 'data' key.
    observation = obs['data']

    # Echo the admission id so each response can be matched to its request.
    print(observation['admission_id'])

    r = requests.post(url, json=observation)

    # HTTP status first, then the decoded JSON body.
    print(r)
    print(r.json())

82679
<Response [200]>
{'readmitted': 'Yes'}
83311
<Response [200]>
{'readmitted': 'Yes'}
84417
<Response [200]>
{'readmitted': 'No'}
100530
<Response [200]>
{'readmitted': 'Yes'}
81601
<Response [200]>
{'readmitted': 'No'}
84159
<Response [200]>
{'readmitted': 'No'}
99843
<Response [200]>
{'readmitted': 'No'}
100354
<Response [200]>
{'readmitted': 'No'}
90078
<Response [200]>
{'readmitted': 'No'}
87808
<Response [200]>
{'readmitted': 'No'}
101181
<Response [200]>
{'readmitted': 'No'}
98980
<Response [200]>
{'readmitted': 'No'}
87832
<Response [200]>
{'readmitted': 'No'}
86463
<Response [200]>
{'readmitted': 'No'}
86723
<Response [200]>
{'readmitted': 'No'}
83845
<Response [200]>
{'readmitted': 'Yes'}
82181
<Response [200]>
{'readmitted': 'No'}
83097
<Response [200]>
{'readmitted': 'No'}
89734
<Response [200]>
{'readmitted': 'Yes'}
97435
<Response [200]>
{'readmitted': 'No'}
92775
<Response [200]>
{'readmitted': 'No'}
95835
<Response [200]>
{'readmitted': 'No'}
85838
<Response [200]>
{

In [85]:
# To target the local API instead, use:
# url = "http://localhost:5000/update"

APP_NAME = 'heroku-model-deploy-capstone'
url = f"https://{APP_NAME}.herokuapp.com/update"

# Report the true outcome of each trial record to the update endpoint.
for obs in file2:
    # The payload is stored under the record's 'data' key.
    observation = obs['data']

    # Echo the payload so each response can be matched to its request.
    print(observation)

    r = requests.post(url, json=observation)

    # HTTP status first, then the decoded JSON body.
    print(r)
    print(r.json())

{'admission_id': 82679, 'readmitted': 'No'}
<Response [200]>
{'actual_readmitted': 'No', 'admission_id': 82679, 'predicted_readmitted': 'Yes'}
{'admission_id': 83311, 'readmitted': 'No'}
<Response [200]>
{'actual_readmitted': 'No', 'admission_id': 83311, 'predicted_readmitted': 'Yes'}
{'admission_id': 84417, 'readmitted': 'No'}
<Response [200]>
{'actual_readmitted': 'No', 'admission_id': 84417, 'predicted_readmitted': 'No'}
{'admission_id': 100530, 'readmitted': 'No'}
<Response [200]>
{'actual_readmitted': 'No', 'admission_id': 100530, 'predicted_readmitted': 'Yes'}
{'admission_id': 81601, 'readmitted': 'No'}
<Response [200]>
{'actual_readmitted': 'No', 'admission_id': 81601, 'predicted_readmitted': 'No'}
{'admission_id': 84159, 'readmitted': 'No'}
<Response [200]>
{'actual_readmitted': 'No', 'admission_id': 84159, 'predicted_readmitted': 'No'}
{'admission_id': 99843, 'readmitted': 'No'}
<Response [200]>
{'actual_readmitted': 'No', 'admission_id': 99843, 'predicted_readmitted': 'No'}
{