In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot
%matplotlib inline

In [2]:
df_org = pd.read_csv('data.csv')
df_org

Unnamed: 0,Year,MthAndYrCode,Trust,New_Hospital,Type,FourAndUnder_sum,FiveToTwelve_sum,OverTwelve_sum,Total_sum
0,2008/09,Apr-08,Belfast,Belfast City,Type 1,3391.00,473,1,3865.00
1,2008/09,Apr-08,Belfast,Mater,Type 1,3142.00,486,0,3628.00
2,2008/09,Apr-08,Belfast,Royal Victoria & Royal Victoria (ENT & RAES),Type 1,5224.00,862,0,6086.00
3,2008/09,Apr-08,Belfast,RBHSC,Type 1,2480.00,398,0,2878.00
4,2008/09,Apr-08,Northern,Antrim Area,Type 1,4728.00,667,10,5405.00
...,...,...,...,...,...,...,...,...,...
2360,2018/19,Jun-18,Southern,Daisy Hill,Type 1,3736.00,1340.00,18,5094.00
2361,2018/19,Jun-18,Southern,South Tyrone,Type 3,3389.00,0,0,3389.00
2362,2018/19,Jun-18,Western,Altnagelvin Area,Type 1,4394.00,1287.00,36,5717.00
2363,2018/19,Jun-18,Western,South West Acute,Type 1,2130.00,967,27,3124.00


In [3]:
df_org.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2365 entries, 0 to 2364
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Year              2365 non-null   object
 1   MthAndYrCode      2365 non-null   object
 2   Trust             2365 non-null   object
 3   New_Hospital      2365 non-null   object
 4   Type              2365 non-null   object
 5   FourAndUnder_sum  2365 non-null   object
 6   FiveToTwelve_sum  2365 non-null   object
 7   OverTwelve_sum    2365 non-null   int64 
 8   Total_sum         2365 non-null   object
dtypes: int64(1), object(8)
memory usage: 166.4+ KB


In [4]:
# converting the datatypes to float for numerical featues
def convert_to_float(x):
    x = x.replace(",","")
    return float(x)

In [5]:
df_org['FiveToTwelve_sum'] = df_org['FiveToTwelve_sum'].apply(lambda x : convert_to_float(x))
df_org['FourAndUnder_sum'] = df_org['FourAndUnder_sum'].apply(lambda x : convert_to_float(x))
df_org['Total_sum'] = df_org['Total_sum'].apply(lambda x : convert_to_float(x))

In [6]:
# so the data contains no null value
# Now , we will conclude binary vaiable "wait under 4 hours" in terms of yes or no.


In [7]:
# Now lets calculate the number of chances for patients that waited less than or equal to 4 hours

In [8]:
df_org['wait_chances_less_than_4'] = (df_org['FourAndUnder_sum'] / df_org['Total_sum'])*100

In [9]:
df_org

Unnamed: 0,Year,MthAndYrCode,Trust,New_Hospital,Type,FourAndUnder_sum,FiveToTwelve_sum,OverTwelve_sum,Total_sum,wait_chances_less_than_4
0,2008/09,Apr-08,Belfast,Belfast City,Type 1,3391.0,473.0,1,3865.0,87.736093
1,2008/09,Apr-08,Belfast,Mater,Type 1,3142.0,486.0,0,3628.0,86.604190
2,2008/09,Apr-08,Belfast,Royal Victoria & Royal Victoria (ENT & RAES),Type 1,5224.0,862.0,0,6086.0,85.836346
3,2008/09,Apr-08,Belfast,RBHSC,Type 1,2480.0,398.0,0,2878.0,86.170952
4,2008/09,Apr-08,Northern,Antrim Area,Type 1,4728.0,667.0,10,5405.0,87.474561
...,...,...,...,...,...,...,...,...,...,...
2360,2018/19,Jun-18,Southern,Daisy Hill,Type 1,3736.0,1340.0,18,5094.0,73.341186
2361,2018/19,Jun-18,Southern,South Tyrone,Type 3,3389.0,0.0,0,3389.0,100.000000
2362,2018/19,Jun-18,Western,Altnagelvin Area,Type 1,4394.0,1287.0,36,5717.0,76.858492
2363,2018/19,Jun-18,Western,South West Acute,Type 1,2130.0,967.0,27,3124.0,68.181818


In [10]:
df_org.describe()

Unnamed: 0,FourAndUnder_sum,FiveToTwelve_sum,OverTwelve_sum,Total_sum,wait_chances_less_than_4
count,2365.0,2365.0,2365.0,2365.0,2365.0
mean,2463.158562,626.030444,28.427061,3117.616068,86.403159
std,1434.815032,766.147895,76.935804,2133.871181,13.23315
min,22.0,0.0,0.0,78.0,28.205128
25%,1254.0,0.0,0.0,1285.0,75.070423
50%,2249.0,241.0,0.0,2619.0,89.427127
75%,3389.0,1084.0,13.0,4480.0,100.0
max,6630.0,3609.0,745.0,8466.0,100.0


In [11]:
# So by seeing the mean of "wait_chances_less_than_4", we can set the threshold of 85% to reduce margin of error.

In [12]:
# Now computing binary variable (target class) in terms of patients that waited less than or equal to 4 hours. 
# 1 means yes
# 0 means no

In [13]:
def check_threshold(x):
    if(x<85):
        return 0
    else:
        return 1

df_org['Waited_Less_than_4_hours'] = df_org['wait_chances_less_than_4'].apply(lambda x :  check_threshold(x) )


In [14]:
df_org

Unnamed: 0,Year,MthAndYrCode,Trust,New_Hospital,Type,FourAndUnder_sum,FiveToTwelve_sum,OverTwelve_sum,Total_sum,wait_chances_less_than_4,Waited_Less_than_4_hours
0,2008/09,Apr-08,Belfast,Belfast City,Type 1,3391.0,473.0,1,3865.0,87.736093,1
1,2008/09,Apr-08,Belfast,Mater,Type 1,3142.0,486.0,0,3628.0,86.604190,1
2,2008/09,Apr-08,Belfast,Royal Victoria & Royal Victoria (ENT & RAES),Type 1,5224.0,862.0,0,6086.0,85.836346,1
3,2008/09,Apr-08,Belfast,RBHSC,Type 1,2480.0,398.0,0,2878.0,86.170952,1
4,2008/09,Apr-08,Northern,Antrim Area,Type 1,4728.0,667.0,10,5405.0,87.474561,1
...,...,...,...,...,...,...,...,...,...,...,...
2360,2018/19,Jun-18,Southern,Daisy Hill,Type 1,3736.0,1340.0,18,5094.0,73.341186,0
2361,2018/19,Jun-18,Southern,South Tyrone,Type 3,3389.0,0.0,0,3389.0,100.000000,1
2362,2018/19,Jun-18,Western,Altnagelvin Area,Type 1,4394.0,1287.0,36,5717.0,76.858492,0
2363,2018/19,Jun-18,Western,South West Acute,Type 1,2130.0,967.0,27,3124.0,68.181818,0


In [15]:
df_org.corr()

Unnamed: 0,FourAndUnder_sum,FiveToTwelve_sum,OverTwelve_sum,Total_sum,wait_chances_less_than_4,Waited_Less_than_4_hours
FourAndUnder_sum,1.0,0.796538,0.383234,0.972208,-0.698065,-0.663651
FiveToTwelve_sum,0.796538,1.0,0.559535,0.914808,-0.913809,-0.81023
OverTwelve_sum,0.383234,0.559535,1.0,0.494637,-0.537105,-0.419353
Total_sum,0.972208,0.914808,0.494637,1.0,-0.816839,-0.752265
wait_chances_less_than_4,-0.698065,-0.913809,-0.537105,-0.816839,1.0,0.886013
Waited_Less_than_4_hours,-0.663651,-0.81023,-0.419353,-0.752265,0.886013,1.0


In [16]:
#removing unneccasry columns
df_org.drop(labels=['Year','MthAndYrCode'],axis=1,inplace=True)


In [17]:
df_org

Unnamed: 0,Trust,New_Hospital,Type,FourAndUnder_sum,FiveToTwelve_sum,OverTwelve_sum,Total_sum,wait_chances_less_than_4,Waited_Less_than_4_hours
0,Belfast,Belfast City,Type 1,3391.0,473.0,1,3865.0,87.736093,1
1,Belfast,Mater,Type 1,3142.0,486.0,0,3628.0,86.604190,1
2,Belfast,Royal Victoria & Royal Victoria (ENT & RAES),Type 1,5224.0,862.0,0,6086.0,85.836346,1
3,Belfast,RBHSC,Type 1,2480.0,398.0,0,2878.0,86.170952,1
4,Northern,Antrim Area,Type 1,4728.0,667.0,10,5405.0,87.474561,1
...,...,...,...,...,...,...,...,...,...
2360,Southern,Daisy Hill,Type 1,3736.0,1340.0,18,5094.0,73.341186,0
2361,Southern,South Tyrone,Type 3,3389.0,0.0,0,3389.0,100.000000,1
2362,Western,Altnagelvin Area,Type 1,4394.0,1287.0,36,5717.0,76.858492,0
2363,Western,South West Acute,Type 1,2130.0,967.0,27,3124.0,68.181818,0


In [18]:
def oneHotEncoding_with_names(columnName):
    
    temp_df = pd.get_dummies(df_org[columnName],drop_first=True)
    for x in temp_df.columns:
        temp_df.rename(columns={x:columnName+'_'+str(x)},inplace=True)
    
    for x in temp_df.columns:
        df_org[x] = temp_df[x]
    
    df_org.drop(labels=columnName,axis=1,inplace=True)
    return df_org

for y in df_org.columns:
    if(df_org[y].dtype==object):
        oneHotEncoding_with_names(y)
        
df_org

Unnamed: 0,FourAndUnder_sum,FiveToTwelve_sum,OverTwelve_sum,Total_sum,wait_chances_less_than_4,Waited_Less_than_4_hours,Trust_Northern,Trust_South Eastern,Trust_Southern,Trust_Western,...,New_Hospital_Royal Victoria & Royal Victoria (ENT & RAES),New_Hospital_Royal Victoria (ENT & RAES),New_Hospital_Royal Victoria (RAES),New_Hospital_South Tyrone,New_Hospital_South West Acute,New_Hospital_Tyrone County,New_Hospital_Ulster,New_Hospital_Whiteabbey,Type_Type 2,Type_Type 3
0,3391.0,473.0,1,3865.0,87.736093,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3142.0,486.0,0,3628.0,86.604190,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5224.0,862.0,0,6086.0,85.836346,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2480.0,398.0,0,2878.0,86.170952,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4728.0,667.0,10,5405.0,87.474561,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2360,3736.0,1340.0,18,5094.0,73.341186,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2361,3389.0,0.0,0,3389.0,100.000000,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
2362,4394.0,1287.0,36,5717.0,76.858492,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2363,2130.0,967.0,27,3124.0,68.181818,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [19]:
# Check data balancing.
df_org['Waited_Less_than_4_hours'].value_counts()

1    1379
0     986
Name: Waited_Less_than_4_hours, dtype: int64

In [20]:
# So the data set is balance in terms of ratio 1379:986
# We dont need to oversampling the data to make an equivalent ratio

In [21]:
df_org.drop(labels=['FourAndUnder_sum','FiveToTwelve_sum','OverTwelve_sum','Total_sum','wait_chances_less_than_4'],axis=1,inplace=True)

In [22]:
# Checking important features
correlation_data = df_org.corr()

cols_to_be_selected = []
all_cols = correlation_data.columns
index = 0;

for x in correlation_data['Waited_Less_than_4_hours'].tolist():
    if(x >= -0.2 and x <=0.2):
        #neglecting the feature / column
        pass
    else:
        cols_to_be_selected.append(all_cols[index])
    index += 1

cols_to_be_selected

['Waited_Less_than_4_hours',
 'Trust_South Eastern',
 'New_Hospital_Antrim Area',
 'New_Hospital_Mater',
 'New_Hospital_Royal Victoria',
 'New_Hospital_Ulster',
 'Type_Type 2',
 'Type_Type 3']

In [23]:
features = df_org[[
 'Trust_South Eastern',
 'New_Hospital_Antrim Area',
 'New_Hospital_Mater',
 'New_Hospital_Royal Victoria',
 'New_Hospital_Ulster',
 'Type_Type 2',
 'Type_Type 3']]

target = df_org[['Waited_Less_than_4_hours']]



In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
rfc = RandomForestClassifier(n_estimators=120)

model = cross_validate(rfc,features,target,cv=10,scoring='f1',return_estimator=True)
model

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


{'estimator': [RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120),
  RandomForestClassifier(n_estimators=120)],
 'fit_time': array([0.26872373, 0.26429534, 0.25731111, 0.23939085, 0.28619766,
        0.24238133, 0.23091269, 0.26133943, 0.26529264, 0.24235153]),
 'score_time': array([0.0179565 , 0.02090788, 0.02094293, 0.01795888, 0.04089093,
        0.01795745, 0.01895022, 0.02090573, 0.02094102, 0.0179503 ]),
 'test_score': array([0.67307692, 0.78632479, 0.79148936, 0.85943775, 0.83794466,
        0.86507937, 0.888     , 0.88259109, 0.93632959, 0.93818182])}

In [25]:
# saving model
import pickle
final_model = model['estimator'][9]
print("Accuracy " + str(model['test_score'][9]))
pickle.dump(final_model, open('patient_wait_model.pkl', 'wb'))

Accuracy 0.9381818181818182


In [31]:
#Inputs
# trust is Southern East
# hospital Ards MUI
# type type-3


#Feautres:
# 'Trust_South Eastern',
#  'New_Hospital_Antrim Area',
#  'New_Hospital_Mater',
#  'New_Hospital_Royal Victoria',
#  'New_Hospital_Ulster',
#  'Type_Type 2',
#  'Type_Type 3'

In [30]:
final_model.predict([[1,0,0,0,0,0,1]])

array([1], dtype=int64)