# Import Libraries

In [1]:
import os
import sys
import re
import unicodedata
import pickle

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, hamming_loss

# Path Variables

In [2]:
# Every path is resolved relative to the directory the notebook is launched from.
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'Data')

# Source file path
in_data_pref = os.path.join(data_dir, 'Input')
src_file_name = 'KaggleV2-May-2016.csv'
src_data_path = os.path.join(in_data_pref, src_file_name)

# Read the source file, parsing both timestamp columns into datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated in
# pandas 2.x, where format inference is already the default behaviour.
df = pd.read_csv(
    src_data_path,
    index_col=None,
    parse_dates=["ScheduledDay", "AppointmentDay"],
)

# Load and read the neighbourhood income data
mean_income_fname = 'mean_incoming_neighborhood.csv'
neigh_income_range_fname = 'incoming_range_neighborhood.csv'
mean_income_path = os.path.join(in_data_pref, mean_income_fname)
neigh_income_range_path = os.path.join(in_data_pref, neigh_income_range_fname)

mean_income = pd.read_csv(mean_income_path, index_col=None)
income_ranges = pd.read_csv(neigh_income_range_path, index_col=None)

# Output file path.
# Fix: the path used to be built from `src_file_name`, so it pointed at a file
# named like the input CSV and `op_file_name` was never used.
op_file_name = 'prediction.json'
op_data_pref = os.path.join(data_dir, 'Output')
op_data_path = os.path.join(op_data_pref, op_file_name)

# Preprocessing

In [3]:
def camelcase_to_snakecase(name):
    """Convert a CamelCase identifier to lower-case snake_case.

    Two passes insert the underscores: the first splits before a capitalised
    word (an upper-case letter followed by lower-case letters), the second
    splits between a lower-case letter or digit and an upper-case letter.
    The result is then lower-cased.
    """
    word_boundary = re.compile('(.)([A-Z][a-z]+)')
    case_boundary = re.compile('([a-z0-9])([A-Z])')

    partially_split = word_boundary.sub(r'\1_\2', name)
    fully_split = case_boundary.sub(r'\1_\2', partially_split)
    return fully_split.lower()

In [4]:
# Bring the appointment columns to snake_case and fix the two misspelt headers.
df.columns = df.columns.map(camelcase_to_snakecase)
spelling_fixes = {'hipertension': 'hypertension', 'handcap': 'handicap'}
df = df.rename(columns=spelling_fixes)

# Order appointments chronologically: by scheduling time, then appointment day.
df = df.sort_values(["scheduled_day", "appointment_day"], ascending=True)

# The first two columns of the mean-income table become the join key and value.
mean_income_labels = {
    mean_income.columns[0]: 'neighbourhood',
    mean_income.columns[1]: 'neigh_mean_income',
}
mean_income = mean_income.rename(columns=mean_income_labels)

# Map the original (Portuguese) income-bracket headers to short column names.
income_range_labels = {
    "Mesorregiões, microrregiões, municípios, distritos, subdistritos e bairros": 'neighbourhood',
    "Sem rendimento (2)": 'neigh_income_range_0',
    "Até ½ salário mínimo": 'neigh_income_range_1',
    "Mais de 1/2 a 1 salário mínimo": 'neigh_income_range_2',
    "Mais de 1 a 2 salário mínimo": 'neigh_income_range_3',
    "Mais de 2 a 5 salário mínimo": 'neigh_income_range_4',
    "Mais de 5 a 10 salário mínimo": 'neigh_income_range_5',
    "Mais de 10 a 20 salário mínimo": 'neigh_income_range_6',
    "Mais de 20 salário mínimo": 'neigh_income_range_7',
}
income_ranges = income_ranges.rename(columns=income_range_labels)

In [5]:
def strip_accents(s):
    """Return `s` with combining accent marks removed.

    The string is decomposed with Unicode NFD normalisation and every
    character whose Unicode category is 'Mn' is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Normalise the join key identically in all three tables: lower-case first,
# then strip accents, so neighbourhood names match across sources.
for frame in (df, mean_income, income_ranges):
    frame["neighbourhood"] = frame["neighbourhood"].str.lower().apply(strip_accents)

# aeroporto isn't a real neighbourhood, changing to nearest one
df.loc[df.neighbourhood == "aeroporto", "neighbourhood"] = "jardim camburi"
# Fixes apostrophe
df.loc[df.neighbourhood == "joana d´arc", "neighbourhood"] = "joana d'arc"
# removing 2 patients from island
df = df[df.neighbourhood != "ilhas oceanicas de trindade"]

df = pd.merge(df, mean_income, on="neighbourhood", how='left', sort=False)
# Check if there is any missing neighbourhood. The original expression was
# evaluated mid-cell and its result discarded, so nothing was ever reported.
missing_mean_income = df.loc[df.neigh_mean_income.isnull(), "neighbourhood"].unique()
if len(missing_mean_income) > 0:
    print("Neighbourhoods without mean income: {}".format(missing_mean_income))

df = pd.merge(df, income_ranges, on="neighbourhood", how='left', sort=False)
# Same check for the income-range table.
missing_income_range = df.loc[df.neigh_income_range_0.isnull(), "neighbourhood"].unique()
if len(missing_income_range) > 0:
    print("Neighbourhoods without income ranges: {}".format(missing_income_range))

# Replace the "Yes"/"No" no-show flag with its boolean negation, `show`.
df["no-show"] = df["no-show"].map({"Yes": True, "No": False})
df["show"] = ~df["no-show"]
del df["no-show"]

# Print the distinct values of the flag-like columns before casting them.
for feature in ["diabetes", "alcoholism", "hypertension", "handicap", "scholarship", "sms_received", "neighbourhood"]:
    print("{}: {}".format(feature, df[feature].unique()))

boolean_features = ["diabetes", "alcoholism", "hypertension", "sms_received", "scholarship"]
categorical_features = ["gender", "handicap", "neighbourhood", "patient_id", "appointment_id"]

for feature in boolean_features:
    df[feature] = df[feature].astype("bool")

for feature in categorical_features:
    df[feature] = df[feature].astype("category")

diabetes: [0 1]
alcoholism: [0 1]
hypertension: [0 1]
handicap: [0 1 2 3 4]
scholarship: [0 1]
sms_received: [1 0]
neighbourhood: ['resistencia' 'vila rubim' 'sao cristovao' 'maruipe' 'santa cecilia'
 'tabuazeiro' 'caratoira' 'conquista' 'santos dumont' 'santo andre'
 'redencao' 'bento ferreira' 'monte belo' 'gurigica' 'jucutuquara'
 'praia do canto' 'consolacao' 'cruzamento' 'bela vista' 'centro'
 'santa lucia' 'ilha de santa maria' 'jardim camburi' 'jardim da penha'
 'santa clara' 'bonfim' 'jesus de nazareth' 'jabour' 'sao jose'
 'sao pedro' 'santo antonio' 'maria ortiz' 'itarare' 'santa tereza'
 'universitario' 'inhangueta' 'ilha do principe' 'romao' 'santa martha'
 'andorinhas' 'santa luiza' 'da penha' 'do quadro' 'parque moscoso'
 'grande vitoria' 'forte sao joao' "joana d'arc" 'de lourdes'
 'santos reis' 'ariovaldo favalessa' 'horto' 'fonte grande' 'goiabeiras'
 'sao benedito' 'nova palestina' 'enseada do sua' 'do cabral' 'piedade'
 'republica' 'do moscoso' 'ilha das caieiras' 'f

# Derived Features

In [6]:
def calculate_prior_noshow(row, appointments=None):
    """Attach a patient's appointment history to one appointment row.

    An appointment counts as "previous" when it belongs to the same
    `patient_id` and its `appointment_day` is on or before this row's
    `scheduled_day`.

    Parameters
    ----------
    row : pandas.Series
        One appointment; must provide `patient_id` and `scheduled_day`.
    appointments : pandas.DataFrame, optional
        Table searched for previous appointments; needs the columns
        `patient_id`, `appointment_day` and `show`. Defaults to the
        notebook-level `df`, which keeps `df.apply(calculate_prior_noshow,
        axis=1)` working unchanged.

    Returns
    -------
    pandas.Series
        A copy of `row` with two extra entries: `previous_appoint_count`
        (number of previous appointments) and `previous_appoint_shows`
        (sum of their `show` flags).
    """
    if appointments is None:
        appointments = df

    same_patient = appointments["patient_id"] == row["patient_id"]
    held_before_scheduling = appointments["appointment_day"] <= row["scheduled_day"]
    previous_shows = appointments.loc[same_patient & held_before_scheduling, "show"]

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    result = row.copy()
    result["previous_appoint_count"] = len(previous_shows)
    result["previous_appoint_shows"] = previous_shows.sum()
    return result

In [7]:
# Whole days between the scheduling date (time of day dropped) and the
# appointment. `.dt.normalize()` resets the time to midnight while keeping any
# timezone, so the subtraction works for both naive and tz-aware columns.
df["days_delta"] = (df.appointment_day - df.scheduled_day.dt.normalize()).dt.days

# Row-wise history lookup; must run before patient_id is dropped below.
df = df.apply(calculate_prior_noshow, axis=1)
df = df.drop(["patient_id", "appointment_id"], axis=1)
df.age = df.age.astype("int")

# Replace impossible negative values with the most frequent value.
# mode() returns a Series (several entries on ties), so take its first element
# explicitly instead of calling int() on the whole Series.
df.loc[df.age < 0, "age"] = int(df.age.mode().iloc[0])
df.loc[df.days_delta < 0, "days_delta"] = int(df.days_delta.mode().iloc[0])

In [8]:
# Preview the first rows after cleaning and feature derivation.
df.head()

Unnamed: 0,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,...,neigh_income_range_3,neigh_income_range_4,neigh_income_range_5,neigh_income_range_6,neigh_income_range_7,neigh_income_range_0,show,days_delta,previous_appoint_count,previous_appoint_shows
0,F,2015-11-10 07:13:56,2016-05-04,51,resistencia,False,False,False,False,0,...,0.254717,0.08037,0.008708,0.00127,0.000363,0.36484,True,176,0,0
1,M,2015-12-03 08:17:28,2016-05-02,34,vila rubim,False,True,False,False,0,...,0.224314,0.179608,0.063529,0.012549,0.000784,0.308235,False,151,0,0
2,F,2015-12-07 10:40:59,2016-06-03,27,sao cristovao,True,False,False,False,0,...,0.225615,0.190759,0.062416,0.009727,0.002162,0.310997,False,179,0,0
3,F,2015-12-07 10:42:42,2016-06-03,48,maruipe,False,True,True,False,0,...,0.184203,0.250071,0.132022,0.032221,0.007129,0.273453,True,179,0,0
4,F,2015-12-07 10:43:01,2016-06-03,80,sao cristovao,False,True,True,False,0,...,0.225615,0.190759,0.062416,0.009727,0.002162,0.310997,True,179,0,0


# Train Test Split

In [9]:
# Columns left out of the model input: the target, raw identifiers/timestamps
# and the per-neighbourhood income-range shares.
excluded_columns = [
    "show", "neighbourhood", "alcoholism", "gender",
    'scheduled_day', 'appointment_day',
    'neigh_income_range_1', 'neigh_income_range_2',
    'neigh_income_range_3', 'neigh_income_range_4', 'neigh_income_range_5',
    'neigh_income_range_6', 'neigh_income_range_7', 'neigh_income_range_0',
]

# Names of the model input columns after dummy-encoding what remains.
candidate_inputs = df.drop(excluded_columns, axis=1)
one_hot_features = pd.get_dummies(candidate_inputs).columns


In [10]:
# Display the final list of model input columns.
one_hot_features

Index(['age', 'scholarship', 'hypertension', 'diabetes', 'handicap',
       'sms_received', 'neigh_mean_income', 'days_delta',
       'previous_appoint_count', 'previous_appoint_shows'],
      dtype='object')

In [11]:
# Model inputs and target.
# The earlier version first built float64 numpy arrays for X and y and then
# immediately overwrote both with the selections below; that dead computation
# has been removed. The resulting X and y are unchanged.
X = df[one_hot_features]
y = df["show"]

In [12]:
# 70 % train, then the remaining 30 % is halved into validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=.3, random_state=7, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=.5, random_state=7, shuffle=True
)

# Model definition

In [49]:
# Baseline random forest. The estimator is seeded (same seed as the data split)
# so that repeated runs of the notebook produce the same model and scores.
rf = RandomForestClassifier(random_state=7)
model_rf = rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)



In [50]:
# 10-fold cross-validated ROC AUC of the random forest on the full data set.
rf_cv_score = cross_val_score(model_rf, X, y, cv=10, scoring='roc_auc')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest Classifier : ", rf_cv_score.mean())

# Confusion matrix on the held-out test set. predict() already returns class
# labels, so they are passed as-is (the former .round() call had no purpose).
cm = confusion_matrix(y_test, y_pred_rf)
# Divide each row by its total so every row (true class) sums to 1.
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion Matrix :\n', cm)

=== Mean AUC Score ===
Mean AUC Score - Random Forest Classifier :  0.6487779084248657
Confusion Matrix :
 [[0.28912387 0.71087613]
 [0.12291808 0.87708192]]


In [13]:
# Weight the positive class (show == True) by the ratio of negative to
# positive training samples.
negative_count = (y_train == 0).sum()
positive_count = y_train.sum()
xgbc = XGBClassifier(n_estimators=400, scale_pos_weight=(negative_count / positive_count))

model_xgb = xgbc.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

In [125]:
# Refit on plain numpy arrays so the model can later be queried with raw arrays.
# `.values` replaces `.as_matrix()`, which was deprecated and then removed in
# pandas 1.0; both return the same underlying array.
model_xgb = xgbc.fit(X_train.values, y_train.values)

  """Entry point for launching an IPython kernel.


In [126]:
# 10-fold cross-validated ROC AUC of the XGBoost classifier on the full data set.
xgb_cv_score = cross_val_score(model_xgb, X, y, cv=10, scoring='roc_auc')
print("=== Mean AUC Score ===")
print("Mean AUC Score - XGB Classifier : ", xgb_cv_score.mean())

# Confusion matrix on the held-out test set. predict() already returns class
# labels, so they are passed as-is (the former .round() call had no purpose).
cm = confusion_matrix(y_test, y_pred_xgb)
# Divide each row by its total so every row (true class) sums to 1.
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Confusion Matrix :\n', cm)

=== Mean AUC Score ===
Mean AUC Score - XGB Classifier :  0.7108830226915961
Confusion Matrix :
 [[0.87039275 0.12960725]
 [0.47117341 0.52882659]]


In [29]:
# Class probabilities for the first validation row.
# `.values` replaces the removed `.as_matrix()`; reshape(1, -1) yields a single
# row without hard-coding the number of features.
model_xgb.predict_proba(X_val.values[0].reshape(1, -1))

  """Entry point for launching an IPython kernel.


array([[0.71237177, 0.28762823]], dtype=float32)

In [123]:
# Hand-built sample, one value per model input column in `one_hot_features` order.
inp = np.array([
    55,   # age
    0,    # scholarship
    0,    # hypertension
    0,    # diabetes
    0,    # handicap
    1,    # sms_received
    300,  # neigh_mean_income
    0,    # days_delta
    0,    # previous_appoint_count
    0,    # previous_appoint_shows
])

In [124]:
# Class probabilities for the hand-built sample, reshaped to one row of 10 features.
model_xgb.predict_proba(inp.reshape(1, 10))

array([[0.9829217, 0.0170783]], dtype=float32)

In [40]:
# One hypothetical patient, keyed by model input column.
# The mapping was previously bound to the name `dict`, which shadowed the
# built-in type for the rest of the kernel session; it is renamed here, and the
# DataFrame construction (its only use) lives in the same cell so the rename
# stays consistent.
sample_patient = {
    'age': [13],
    'scholarship': [True],
    'hypertension': [False],
    'diabetes': [False],
    'handicap': [0],
    'sms_received': [True],
    'neigh_mean_income': [510],
    'days_delta': [13],
    'previous_appoint_count': [0],
    'previous_appoint_shows': [0],
}

# Fix the column order explicitly so it matches the order the model was trained on.
sample_columns = ['age', 'scholarship', 'hypertension', 'diabetes', 'handicap',
                  'sms_received', 'neigh_mean_income', 'days_delta',
                  'previous_appoint_count', 'previous_appoint_shows']
df_inp = pd.DataFrame(sample_patient, columns=sample_columns)

In [42]:
# Display the single-row input frame to confirm column order and values.
df_inp

Unnamed: 0,age,scholarship,hypertension,diabetes,handicap,sms_received,neigh_mean_income,days_delta,previous_appoint_count,previous_appoint_shows
0,13,True,False,False,0,True,510,13,0,0


In [43]:
# First probability column for the single input row.
# NOTE(review): presumably column 0 is the show == False class - confirm via model_xgb.classes_.
model_xgb.predict_proba(df_inp)[0, 0]

0.71237177

In [22]:
# Feature values for one hypothetical patient, keyed by model input column.
# Scalars are broadcast by pandas because other entries are one-element lists.
dict_features_val = {
    'age': [13],
    'scholarship': True,
    'hypertension': False,
    'diabetes': False,
    'handicap': [0],
    'sms_received': True,
    'neigh_mean_income': [510],
    'days_delta': [13],
    'previous_appoint_count': [0],
    'previous_appoint_shows': [0],
}

In [23]:
# Column order handed to the model.
dict_features = [
    'age', 'scholarship', 'hypertension', 'diabetes', 'handicap',
    'sms_received', 'neigh_mean_income', 'days_delta',
    'previous_appoint_count', 'previous_appoint_shows',
]

# Build the one-row frame in that order and read its first probability column.
df_inp = pd.DataFrame(dict_features_val, columns=dict_features)
model_xgb.predict_proba(df_inp)[0, 0]

0.71237177

# Generate pickle file with XGBClassifier

In [34]:
#Saving the model to disk
# Save the model to disk; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open('model_f.pkl', 'wb') as model_file:
    pickle.dump(model_xgb, model_file)

# Load the model back to compare the results.
# NOTE: pickle.load can execute arbitrary code - only load files you created
# yourself or otherwise trust.
with open('model_f.pkl', 'rb') as model_file:
    model = pickle.load(model_file)