# Import Libraries

In [442]:
import os
import sys
import re
import unicodedata
import pickle

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, hamming_loss

# Path Variables

In [443]:
# All locations are resolved relative to the directory the notebook runs from.
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'Data')

# Source file path
in_data_pref = os.path.join(data_dir, 'Input')
src_file_name = 'SampleInput.csv'
src_data_path = os.path.join(in_data_pref, src_file_name)

# Read source file; the two date columns are parsed into datetimes on load.
df = pd.read_csv(src_data_path, index_col=None,
                 parse_dates=["ScheduledDay", "AppointmentDay"], infer_datetime_format=True
                )

#Load and Read Income data
#mean_income_fname = 'mean_incoming_neighborhood.csv'
#neigh_income_range_fname = 'incoming_range_neighborhood.csv'
#mean_income_path = os.path.join(in_data_pref , mean_income_fname)
#neigh_income_range_path = os.path.join(in_data_pref , neigh_income_range_fname)

#mean_income = pd.read_csv(mean_income_path, index_col=None)
#income_ranges = pd.read_csv(neigh_income_range_path, index_col=None)

# Output file path
op_file_name = 'prediction.json'
op_data_pref = os.path.join(data_dir, 'Output')
# Fixed: the output path was previously joined with src_file_name, leaving
# op_file_name unused and pointing the output at 'Output/SampleInput.csv'.
op_data_path = os.path.join(op_data_pref, op_file_name)

# Preprocessing

In [444]:
def camelcase_to_snakecase(name):
    """Convert a CamelCase column label to lower snake_case.

    An underscore is inserted at each lower-to-upper case boundary and the
    result is lower-cased; other characters (e.g. '-' or an existing '_')
    are left untouched, so 'ScheduledDay' -> 'scheduled_day',
    'SMS_received' -> 'sms_received' and 'No-show' -> 'no-show'.

    NOTE(review): no definition of this helper exists in the notebook, so the
    cell failed with NameError on a fresh kernel. This implementation was
    reconstructed from the column names the later cells rely on - confirm it
    matches the original helper.
    """
    partially_split = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', partially_split).lower()


df.columns = df.columns.map(camelcase_to_snakecase)
# Order rows chronologically; rebinding instead of inplace=True keeps the cell re-runnable.
df = df.sort_values(["scheduled_day", "appointment_day"], ascending=True)

In [445]:
# Recode the "Yes"/"No" text of "no-show" to booleans and keep only its
# negation as the target column `show`.
no_show_flags = df["no-show"].map({"Yes": True, "No": False})
df["show"] = ~no_show_flags
del df["no-show"]

# Print the distinct values of each flag column.
for feature in ["diabetes", "confirmed", "sms_received"]:
    print("{}: {}".format(feature, df[feature].unique()))

boolean_features = ["diabetes", "confirmed"]
categorical_features = ["patient_id"]

#df.age = df.age.astype("int")
#df.patient_id = df.patient_id.astype("int")
#df.appointment_id = df.appointment_id.astype("int")

# Cast every listed column to its target dtype, booleans first, then categoricals.
dtype_by_feature = {name: "bool" for name in boolean_features}
dtype_by_feature.update({name: "category" for name in categorical_features})
for column, target_dtype in dtype_by_feature.items():
    df[column] = df[column].astype(target_dtype)

diabetes: [0 1]
confirmed: [0 1]
sms_received: [0 1]


# Derived Features

In [24]:
def calculate_prior_noshow(row):
    """Attach a patient's earlier-appointment history to ``row``.

    Reads the module-level ``df`` and selects the ``show`` values of every
    appointment that has the same ``patient_id`` as ``row`` and whose
    ``appointment_day`` is on or before ``row["scheduled_day"]``.

    Two entries are written onto ``row`` (which is modified in place):
    ``previous_appoint_count`` - how many such appointments exist, and
    ``previous_appoint_shows`` - the sum of their ``show`` values.

    Returns the same ``row`` object, so the function can be passed to
    ``DataFrame.apply(..., axis=1)``.
    """
    same_patient = df.patient_id == row["patient_id"]
    already_held = df.appointment_day <= row["scheduled_day"]
    history = df.loc[same_patient & already_held, "show"]

    row["previous_appoint_count"] = len(history)
    row["previous_appoint_shows"] = history.sum()
    return row

In [446]:
# Lead time in whole days between the calendar date of scheduling and the appointment day.
df["days_delta"] = (df.appointment_day - pd.to_datetime(df.scheduled_day.dt.date)).dt.days
#df = df.apply(calculate_prior_noshow, axis=1)
df = df.drop(["patient_id"], axis=1)
df.age = df.age.astype("int")

# Replace impossible negative values with the most frequent value of the column.
# Series.mode() returns a Series (several entries when values tie), so take its
# first element explicitly rather than calling int() on the whole Series.
df.loc[df.age < 0, "age"] = int(df.age.mode().iloc[0])
df.loc[df.days_delta < 0, "days_delta"] = int(df.days_delta.mode().iloc[0])

In [447]:
# Preview the first five rows of the prepared frame.
df.head(5)

Unnamed: 0,scheduled_day,appointment_day,age,diabetes,confirmed,sms_received,show,days_delta
75143,2015-11-10 07:13:56,2016-05-04,51,False,False,0,True,176
50733,2015-12-03 08:17:28,2016-05-02,34,False,True,0,False,151
41167,2015-12-07 10:40:59,2016-06-03,27,False,True,1,False,179
71768,2015-12-07 10:42:42,2016-06-03,48,True,False,0,True,179
107640,2015-12-07 10:43:01,2016-06-03,80,True,False,1,True,179


# Train Test Split

In [448]:
# Model inputs: everything except the target and the two raw timestamp columns.
# get_dummies expands any remaining categorical columns; its column index is
# kept as the canonical feature order.
feature_frame = df.drop(columns=["show", "scheduled_day", "appointment_day"])
one_hot_features = pd.get_dummies(feature_frame).columns

In [449]:
# Display the feature column index; its order is the column order of X below.
one_hot_features

Index(['age', 'diabetes', 'confirmed', 'sms_received', 'days_delta'], dtype='object')

In [450]:
# Feature matrix and target, kept as pandas objects so column names are preserved.
# The earlier NumPy versions of X and y (get_dummies(...).values cast to float64)
# were immediately overwritten by these two assignments, so that dead code is gone.
X = df[one_hot_features]
y = df["show"]

In [451]:
# Hold out 20% of the rows, then send 60% of that hold-out to the test set and
# the remaining 40% to validation (12% / 8% of all rows respectively).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=7, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.6, random_state=7, shuffle=True
)

# Model definition

In [452]:
# Baseline: logistic regression with default hyper-parameters.
lg = LogisticRegression()
lg.fit(X_train, y_train)
model_lg = lg  # fit() returns the estimator itself, so both names refer to one object
y_pred_lg = model_lg.predict(X_test)



In [453]:
# 10-fold cross-validated ROC AUC over the full data set.
lg_cv_score = cross_val_score(model_lg, X, y, cv=10, scoring='roc_auc')
print("=== Mean AUC Score ===")
# Fixed label: this score belongs to the logistic regression model, not the random forest.
print("Mean AUC Score - Logistic Regression : ", lg_cv_score.mean())

# Confusion matrix on the held-out test set, normalised per true class (each row sums to 1).
# predict() already returns class labels, so no rounding is needed.
cm = confusion_matrix(y_test, y_pred_lg)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion Matrix :\n', cm)



=== Mean AUC Score ===
Mean AUC Score - Random Forest Classifier :  0.9796902624013875
Confusion Matrix :
 [[0.94895678 0.05104322]
 [0.01427221 0.98572779]]


In [454]:
# Random forest with default hyper-parameters. The seed is pinned (same value as
# the train/test splits) so that the fit and its scores are reproducible across runs.
rf = RandomForestClassifier(random_state=7)
model_rf = rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)



In [455]:
# 10-fold cross-validated ROC AUC for the random forest over the full data set.
rf_cv_score = cross_val_score(model_rf, X, y, cv=10, scoring='roc_auc')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest Classifier : ", rf_cv_score.mean())

# Test-set confusion matrix, with each row divided by its total so rows sum to 1.
raw_counts = confusion_matrix(y_test, y_pred_rf.round())
row_totals = raw_counts.sum(axis=1, keepdims=True)
cm = raw_counts.astype('float') / row_totals
print('Confusion Matrix :\n', cm)

=== Mean AUC Score ===
Mean AUC Score - Random Forest Classifier :  0.9697341973392113
Confusion Matrix :
 [[0.93926975 0.06073025]
 [0.01531191 0.98468809]]


In [427]:
# Class probabilities for a single validation row (positional index 78).
# DataFrame.as_matrix() is deprecated and removed in newer pandas; .values is
# its documented replacement and yields the same array.
model_rf.predict_proba(X_val.values[78].reshape(1, -1))

  """Entry point for launching an IPython kernel.


array([[0., 1.]])

In [456]:
# Weight the positive class by the negative/positive ratio of the training target.
negative_count = (y_train == 0).sum()
positive_count = y_train.sum()
xgbc = XGBClassifier(n_estimators=400, scale_pos_weight=(negative_count / positive_count))

model_xgb = xgbc.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

In [457]:
# Refit the same classifier on plain NumPy arrays; the cells below call
# predict_proba with hand-built arrays (presumably the reason for this refit).
# as_matrix() is deprecated and removed in newer pandas; .values is the documented replacement.
model_xgb = xgbc.fit(X_train.values, y_train.values)

  """Entry point for launching an IPython kernel.


In [458]:
# 10-fold cross-validated ROC AUC for the XGBoost classifier over the full data set.
xgb_cv_score = cross_val_score(model_xgb, X, y, cv=10, scoring='roc_auc')
print("=== Mean AUC Score ===")
print("Mean AUC Score - XGB Classifier : ", xgb_cv_score.mean())

# Test-set confusion matrix, with each row divided by its total so rows sum to 1.
raw_counts = confusion_matrix(y_test, y_pred_xgb.round())
row_totals = raw_counts.sum(axis=1, keepdims=True)
cm = raw_counts.astype('float') / row_totals
print('Confusion Matrix :\n', cm)

=== Mean AUC Score ===
Mean AUC Score - XGB Classifier :  0.9794277058739587
Confusion Matrix :
 [[0.94895678 0.05104322]
 [0.0147448  0.9852552 ]]


In [433]:
# Class probabilities for the first validation row.
# as_matrix() is deprecated and removed in newer pandas; .values is the documented replacement.
model_xgb.predict_proba(X_val.values[0].reshape(1, -1))

  """Entry point for launching an IPython kernel.


array([[0.00137436, 0.99862564]], dtype=float32)

In [462]:
# Probe (order follows one_hot_features): age=30, diabetes=0, confirmed=0, sms_received=0, days_delta=7
inp = np.array([30, 0, 0, 0, 7])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.07307428, 0.9269257 ]], dtype=float32)

In [463]:
# Probe (order follows one_hot_features): age=30, diabetes=0, confirmed=0, sms_received=0, days_delta=14
inp = np.array([30, 0, 0, 0, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.09669214, 0.90330786]], dtype=float32)

In [464]:
# Probe (order follows one_hot_features): age=30, diabetes=0, confirmed=0, sms_received=0, days_delta=30
inp = np.array([30, 0, 0, 0, 30])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.06376147, 0.9362385 ]], dtype=float32)

In [465]:
# Probe (order follows one_hot_features): age=30, diabetes=0, confirmed=0, sms_received=0, days_delta=60
inp = np.array([30, 0, 0, 0, 60])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.06233174, 0.93766826]], dtype=float32)

In [466]:
# Probe (order follows one_hot_features): age=30, diabetes=0, confirmed=0, sms_received=0, days_delta=90
inp = np.array([30, 0, 0, 0, 90])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.09464824, 0.90535176]], dtype=float32)

In [467]:
# Probe (order follows one_hot_features): age=30, diabetes=0, confirmed=0, sms_received=0, days_delta=120
inp = np.array([30, 0, 0, 0, 120])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.01701051, 0.9829895 ]], dtype=float32)

In [468]:
# Probe (order follows one_hot_features): age=60, diabetes=0, confirmed=0, sms_received=0, days_delta=14
inp = np.array([60, 0, 0, 0, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.0932101, 0.9067899]], dtype=float32)

In [469]:
# Probe (order follows one_hot_features): age=90, diabetes=0, confirmed=0, sms_received=0, days_delta=14
inp = np.array([90, 0, 0, 0, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.33030218, 0.6696978 ]], dtype=float32)

In [470]:
# Probe (order follows one_hot_features): age=30, diabetes=0, confirmed=1, sms_received=0, days_delta=14
# NOTE(review): compared with the otherwise identical confirmed=0 probe, the recorded
# output flips almost entirely to class 0 - worth checking whether `confirmed` leaks the target.
inp = np.array([30, 0, 1, 0, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.99800277, 0.00199721]], dtype=float32)

In [471]:
# Probe (order follows one_hot_features): age=60, diabetes=0, confirmed=1, sms_received=0, days_delta=14
inp = np.array([60, 0, 1, 0, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.9978452 , 0.00215486]], dtype=float32)

In [472]:
# Probe (order follows one_hot_features): age=90, diabetes=0, confirmed=1, sms_received=0, days_delta=14
inp = np.array([90, 0, 1, 0, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.99866587, 0.00133414]], dtype=float32)

In [479]:
# Probe (order follows one_hot_features): age=60, diabetes=0, confirmed=0, sms_received=0, days_delta=14
# Baseline for the sms_received comparison in the next cell.
inp = np.array([60, 0, 0, 0, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.0932101, 0.9067899]], dtype=float32)

In [481]:
# Probe (order follows one_hot_features): age=60, diabetes=0, confirmed=0, sms_received=1, days_delta=14
inp = np.array([60, 0, 0, 1, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.10355413, 0.8964459 ]], dtype=float32)

In [482]:
# Probe (order follows one_hot_features): age=90, diabetes=0, confirmed=0, sms_received=0, days_delta=14
# Baseline for the sms_received comparison in the next cell.
inp = np.array([90, 0, 0, 0, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.33030218, 0.6696978 ]], dtype=float32)

In [483]:
# Probe (order follows one_hot_features): age=90, diabetes=0, confirmed=0, sms_received=1, days_delta=14
inp = np.array([90, 0, 0, 1, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

array([[0.16821569, 0.8317843 ]], dtype=float32)

In [None]:
# Probe (order follows one_hot_features): age=90, diabetes=0, confirmed=0, sms_received=1, days_delta=14
# This cell has no execution count and repeats the inputs of the preceding cell.
inp = np.array([90, 0, 0, 1, 14])
model_xgb.predict_proba(inp[np.newaxis, :])

# Generate pickle file with XGBClassifier

In [484]:
# Save the fitted XGBoost model to disk. The context manager guarantees the
# file is flushed and closed before it is read back.
with open('model_f2.pkl', 'wb') as model_file:
    pickle.dump(model_xgb, model_file)

# Load the model back to compare its results with the in-memory one.
# pickle.load is acceptable here only because the file was written just above;
# never unpickle files from an untrusted source.
with open('model_f2.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [485]:
# Reload the pickled model, closing the file handle deterministically.
# Only unpickle files this notebook produced - pickle.load can execute arbitrary code.
with open('model_f2.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [486]:
# One sample as a 1x5 row (order follows one_hot_features):
# age=60, diabetes=0, confirmed=0, sms_received=0, days_delta=7
inp = np.array([[60, 0, 0, 0, 7]])

In [487]:
# Probability in the first class column for the single sample in `inp`
# (presumably P(show == False), i.e. a no-show - confirm against model.classes_).
model.predict_proba(inp)[0, 0]

0.065886736