### Reduction of Hospital Readmission of Diabetic Patients within 30 days

In [None]:
import pandas as pd
import numpy as np

In [None]:
import os

#### Reading data set into variable called 'data'

In [None]:
# Read the diabetic dataset CSV into the DataFrame used by the rest of the notebook.
data =pd.read_csv("../input/diabetic_data.csv")

In [None]:
# Column labels of the loaded frame.
data.keys()

In [None]:
# (number of rows, number of columns).
data.shape

In [None]:
# Per-column dtype and non-null count.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

#### Numeric columns have no NULL values

#### Missing information is represented by '?' in a few columns of dtype 'object'
#### Counting the number of rows with missing information, i.e. '?', in each column

In [None]:
# For every object-dtype column, print its name and how many cells hold the '?' placeholder.
object_columns = [name for name in data.columns if data[name].dtype == object]
for name in object_columns:
    column_values = data[name]
    print(name, column_values[column_values == '?'].count())

#### Dropping columns 'encounter_id', 'patient_nbr', 'payer_code' and 'admission_source_id' that are unnecessary for the model
#### Dropping columns 'weight' and 'medical_specialty', in which more than 90% and 40% of the values respectively are '?'

In [None]:
# Columns excluded from modelling: identifiers, payer / admission-source codes,
# and the 'weight' and 'medical_specialty' fields that contain many '?' placeholders.
unused_columns = [
    'encounter_id', 'patient_nbr', 'weight', 'medical_specialty',
    'payer_code', 'admission_source_id',
]
data.drop(columns=unused_columns, inplace=True)

In [None]:
# Frequency of each distinct value in the 'gender' column.
data['gender'].value_counts()

#### Removing 3 rows with gender values 'Unknown/Invalid'

In [None]:
# Keep only the rows whose gender is not the 'Unknown/Invalid' placeholder.
known_gender = data['gender'] != 'Unknown/Invalid'
data = data[known_gender]

#### Removing rows with discharge_disposition_id 11,19,20,21 which belong to category "Expired" and id 7 which is of patients that left against medical advice

In [None]:
# Discharge disposition ids whose rows are removed from the data set.
excluded_dispositions = [11, 19, 20, 21, 7]
data = data[~data['discharge_disposition_id'].isin(excluded_dispositions)]

#### Removing rows with missing information in all 3 diagnosis

In [None]:
# Keep a row when at least one of its three diagnosis fields is not the '?' placeholder.
has_any_diagnosis = data[['diag_1', 'diag_2', 'diag_3']].ne('?').any(axis=1)
data = data[has_any_diagnosis]

#### Dropping all columns related to medicines except insulin,metformin,glimepiride,repaglinide,pioglitazone,acarbose,glipizide, glyburide ,nateglinide (which is widely used diabetic medicine) as there is a column "diabetesMed" which tells if a patient is using diabetes medicine or not

In [None]:
# Medication columns left out of the model; the other medication columns stay in the frame.
dropped_medications = [
    'chlorpropamide', 'acetohexamide', 'tolbutamide', 'rosiglitazone',
    'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
    'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
]
data.drop(columns=dropped_medications, inplace=True)

##### Dropping columns diag_1, diag_2, diag_3 because of insufficient information

In [None]:
# Remove the three diagnosis columns from the frame.
diagnosis_columns = ['diag_1', 'diag_2', 'diag_3']
data.drop(columns=diagnosis_columns, inplace=True)

In [None]:
# Frequency of each distinct value in the 'race' column.
data.race.value_counts()

#### Replacing '?' with NA and considering NA as a category

In [None]:
# Move 'race' into a new 'Race' column, turning the '?' placeholder into the category 'NA'.
race_values = data.pop('race')
data["Race"] = race_values.mask(race_values == '?', 'NA')

#### admission_type_ids 1,2,7 are grouped as Non-Elective category 3,4 as Elective category and other ids which fall under missing information are grouped as 'NA'

In [None]:
def get_fn(row):
    """Return the admission category for one row.

    `row` must support ``row['admission_type_id']``. Ids 1, 2 and 7 give
    "Non ELective", ids 3 and 4 give "Elective", anything else gives "NA".
    """
    type_id = row['admission_type_id']
    if type_id in (1, 2, 7):
        return "Non ELective"
    if type_id in (3, 4):
        return "Elective"
    return "NA"
# Label every row with get_fn, then discard the raw id column the label was derived from.
admission_labels = data.apply(get_fn, axis=1)
data['admission_type'] = admission_labels
data.drop(columns=['admission_type_id'], inplace=True)

In [None]:
# Frequency of each distinct value in the 'readmitted' column.
data.readmitted.value_counts()

#### Our target is to reduce readmission <30. So considering >30 and NO as 0 and <30 as 1

In [None]:
def fn(x):
    """Binary target encoding: 0 for 'NO' or '>30', 1 for any other value."""
    return 0 if x in ('NO', '>30') else 1
# Replace the raw 'readmitted' column with its binary encoding in 'readmit'.
data['readmit'] = data.pop('readmitted').map(fn)

In [None]:
# Frequency of each distinct value in the 'A1Cresult' column.
data['A1Cresult'].value_counts()

#### 'None' & 'Norm' of A1C is 0 and >8,>7 is 1

In [None]:
def fun(z):
    """Encode an A1C test result as a binary flag.

    Returns 1 for an elevated result ('>7' or '>8') and 0 for everything
    else: 'None' (not measured), 'Norm', or a missing value such as NaN.

    The flag is 1 for the elevated results, the same direction as the
    max_glu_serum encoding. Testing for the elevated labels, rather than for
    'None'/'Norm', also keeps a missing value out of the elevated group.
    """
    return 1 if z in ('>7', '>8') else 0
# Replace the raw 'A1Cresult' column with its binary encoding in 'A1C'.
data['A1C'] = data.pop('A1Cresult').map(fun)

#### Dividing age groups in to three categories 'young','mid','old'

In [None]:
def gt_ag(a):
    """Bucket an age-interval label into 'young', 'mid' or 'old'.

    '[0-10)', '[10-20)' and '[20-30)' are 'young'; '[30-40)', '[40-50)' and
    '[50-60)' are 'mid'; every other value is 'old'.
    """
    if a in ('[0-10)', '[10-20)', '[20-30)'):
        return 'young'
    if a in ('[30-40)', '[40-50)', '[50-60)'):
        return 'mid'
    return 'old'
# Replace the raw 'age' interval column with its three-way bucket in 'Age'.
data['Age'] = data.pop('age').map(gt_ag)

In [None]:
# Frequency of each distinct value in the 'max_glu_serum' column.
data['max_glu_serum'].value_counts()

#### 'None' & 'Norm' of max_glu_serum is 0 and >200,>300 is 1

In [None]:
# Binary flag: 1 when the glucose serum result is elevated ('>200' or '>300'), 0 otherwise
# ('None', 'Norm', or a missing value).
# A single isin() always yields an integer column, even when the CSV parser has turned the
# literal 'None' into NaN, which a chain of replace() calls would leave in place.
# The already-encoded value 1 is accepted too, so re-running this cell does not zero the column.
data['max_glu_serum'] = data['max_glu_serum'].isin(['>200', '>300', 1]).astype(int)

#### Dividing discharge_disposition_id to three values "Discharged Home", "NA", "Discharged/Transferred"

In [None]:
def dp_id(a):
    """Group a discharge disposition id into one of three labels.

    Ids 1, 6, 8, 9 and 13 give 'Discharged Home'; ids 18, 25 and 26 give
    'NA'; every other id gives 'Discharged/Transferred'.
    """
    if a in (1, 6, 8, 9, 13):
        return 'Discharged Home'
    if a in (18, 25, 26):
        return 'NA'
    return 'Discharged/Transferred'
# Replace the raw disposition id column with its three-way label in 'discharge'.
data['discharge'] = data.pop('discharge_disposition_id').map(dp_id)

# Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#### Plotting count of target variable 'readmit'

In [None]:
# Bar chart of how many rows fall in each 'readmit' class.
sns.countplot(x='readmit',data=data, palette='hls')
# Write the current figure to a file named 'admit-readmit', then display it.
plt.savefig('admit-readmit')
plt.show()

In [None]:
# Share of rows in each 'readmit' class, printed as percentages.
total_rows = len(data['readmit'])
count_0 = len(data.loc[data['readmit'] == 0])
count_1 = total_rows - count_0
prctg_0 = count_0 / total_rows
prctg_1 = count_1 / total_rows
print("percentage of readmission", prctg_1*100)
print("percentage of no readmission", prctg_0*100)

#### The target variable 'readmit' is imbalanced: its two classes are far from equally represented

In [None]:
# Bar chart of the diabetesMed x readmit contingency table; saved to 'diabetesMed vs readmit'.
pd.crosstab(data.diabetesMed,data.readmit).plot(kind='bar')
plt.title('diabetesMed Vs readmit')
plt.xlabel('diabetesMed')
plt.ylabel('count of readmits')
plt.savefig('diabetesMed vs readmit')

In [None]:
# Line chart of the Age x readmit contingency table; saved to 'Age vs readmit'.
pd.crosstab(data.Age,data.readmit).plot(kind='line')
plt.title('Age Vs readmit')
plt.xlabel('Age')
plt.ylabel('count of readmits')
plt.savefig('Age vs readmit')

In [None]:
# Bar chart of the gender x readmit contingency table; saved to 'gender vs readmit'.
pd.crosstab(data.gender,data.readmit).plot(kind='bar')
plt.title('gender Vs readmit')
plt.xlabel('gender')
plt.ylabel('count of readmits')
plt.savefig('gender vs readmit')

#### Gender doesn't seem to add much value to readmission rate

#### Patients on diabetes medication might be more likely to be readmitted than those who are not under medication

#### Creating dummy encoded columns for categorical variables
#### To avoid the dummy variable trap, i.e. the scenario of highly correlated independent variables, we have to drop one of the created dummy variables. The original column should also be dropped as we have the dummies

In [None]:
# One-hot encode the categorical columns.
# Each entry is (source column, prefix of the new columns, baseline level). The baseline
# level's dummy column is left out, and the source column is dropped once it is encoded.
dummy_specs = [
    ('metformin', 'metformin', 'No'),
    ('repaglinide', 'repaglinide', 'No'),
    ('insulin', 'insulin', 'No'),
    ('nateglinide', 'nateglinide', 'No'),
    ('glimepiride', 'glimepiride', 'No'),
    ('glipizide', 'glipizide', 'No'),
    ('glyburide', 'glyburide', 'No'),
    ('pioglitazone', 'pioglitazone', 'No'),
    ('acarbose', 'acarbose', 'No'),
    ('gender', 'gender', 'Female'),
    ('admission_type', 'admission', 'NA'),
    ('change', 'change', 'Ch'),
    ('Age', 'Age', 'mid'),
    ('diabetesMed', 'diabetesMed', 'No'),
    ('Race', 'Race', 'Other'),
]
for source_column, prefix, baseline in dummy_specs:
    # dtype=int: recent pandas returns bool dummies by default, and a frame mixing bool and
    # numeric columns converts to an object array that statsmodels refuses to fit.
    dummies = pd.get_dummies(data[source_column], prefix=prefix, dtype=int)
    data = data.join(dummies.drop(prefix + "_" + baseline, axis=1))
    data.drop([source_column], axis=1, inplace=True)

In [None]:
# Frequency of each distinct value in the 'discharge' column.
data.discharge.value_counts()

In [None]:
# One-hot encode 'discharge', leaving out the 'NA' level as the baseline.
# dtype=int: recent pandas returns bool dummies by default, and a frame mixing bool and
# numeric columns converts to an object array that statsmodels refuses to fit.
dummy_discharge = pd.get_dummies(data['discharge'], prefix='discharge', dtype=int)
data= data.join(dummy_discharge.drop("discharge_NA", axis=1))
data.drop(['discharge'], axis=1, inplace= True)

# Logistic Regression Model

In [None]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

#### Splitting the data set into Test and Train data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
features = data.drop('readmit', axis=1)
target = data['readmit']
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.2, random_state=12)

#### Adding constant to Train data

In [None]:
# Add a constant (intercept) column to the training design matrix.
X_train = sm.add_constant(X_train)

In [None]:
# (rows, columns) of the test features.
X_test.shape

In [None]:
# (rows, columns) of the training features, now including the constant column.
X_train.shape

#### Fitting Logistic Regression Model to data

In [None]:
# Binomial-family GLM (logistic regression) on all training columns; print its summary table.
binomial_glm = sm.GLM(Y_train, X_train, family=sm.families.Binomial())
model = binomial_glm.fit()
print(model.summary2())

#### STEP WISE REGRESSION - Back ward Elimination

#### Step wise removal of variables that are not significant i.e., in our case variables with P > 0.05

In [None]:
# Backward elimination. Each step removes the listed predictors from X_train, refits the
# binomial GLM on what is left and prints the new summary. Steps run in this exact order;
# `model` ends up holding the fit from the last step.
elimination_steps = [
    ['glyburide_Up', 'glyburide_Down', 'glyburide_Steady'],
    ['acarbose_Down', 'acarbose_Steady', 'acarbose_Up'],
    ['nateglinide_Down', 'nateglinide_Steady', 'nateglinide_Up'],
    ['number_outpatient'],
    ['Age_young'],
    ['glimepiride_Up', 'glimepiride_Down'],
    ['pioglitazone_Up', 'pioglitazone_Steady', 'pioglitazone_Down'],
    ['glipizide_Steady', 'glipizide_Up'],
    ['Race_AfricanAmerican', 'Race_Asian', 'Race_Caucasian', 'Race_Hispanic', 'Race_NA'],
    ['insulin_Steady', 'insulin_Up'],
    ['change_No'],
    ['admission_Elective', 'admission_Non ELective'],
    ['repaglinide_Down', 'repaglinide_Up', 'repaglinide_Steady'],
    ['metformin_Down'],
    ['max_glu_serum'],
    ['gender_Male'],
    ['num_procedures'],
    ['time_in_hospital'],
]
for dropped_columns in elimination_steps:
    X_train.drop(dropped_columns, axis=1, inplace=True)
    model = sm.GLM(Y_train, X_train, family=sm.families.Binomial()).fit()
    print(model.summary2())

#### Now all the variables are significant and also we observe that deviance and aic are improved. This is considered as Final model

#### Testing the accuracy of the model using test data set

In [None]:
# Give the test set exactly the predictors that survived elimination, in training order.
# The list is taken from X_train instead of being typed out by hand, so it cannot drift
# from the elimination steps. 'const' is left out here and re-added by add_constant.
predictor_columns = [name for name in X_train.columns if name != 'const']
# has_constant='add' always creates the intercept column, even if some test predictor
# happens to be constant; the default 'skip' would then silently omit it.
X_test = sm.add_constant(X_test[predictor_columns], has_constant='add')

In [None]:
# Predicted probability of readmit == 1 for every test row, from the final fitted model.
probabilities = model.predict(X_test)

In [None]:
# First few predicted probabilities.
probabilities.head()

#### Setting a cutoff probability of 0.1 to predict, on the test data set, whether a patient is readmitted within 30 days or not.


In [None]:
predicted_classes = probabilities.map(lambda x: 1 if x > 0.1 else 0)

In [None]:
predicted_classes.head()

In [None]:
# Fraction of test rows whose predicted class equals the actual class.
correct_predictions = predicted_classes == Y_test
accuracy = sum(correct_predictions) / len(Y_test)
accuracy

#### Plotting an ROC curve and confusion matrix

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
%matplotlib inline

In [None]:
# Confusion matrix of actual vs. predicted classes, wrapped in a labelled frame for display.
row_labels = ['Actual 0','Actual 1']
column_labels = ['Predicted 0','Predicted 1']
confusion_mat = confusion_matrix(Y_test, predicted_classes)
confusion_df = pd.DataFrame(confusion_mat, index=row_labels, columns=column_labels)
confusion_df

#### Though the accuracy of the model is low, it predicts 1's (readmissions) more accurately than 0's (no readmissions)

In [None]:
# Annotated heatmap of the confusion matrix; the return value is bound to `_` so the cell does not echo it.
_=sns.heatmap(confusion_df, cmap='coolwarm', annot=True)

In [None]:
# Predicted probabilities for the test rows, used for the ROC analysis.
probs = model.predict(X_test)

In [None]:
# Area under the ROC curve for those probabilities.
auc = roc_auc_score(Y_test, probs)
print('AUC',auc)

In [None]:
# False-positive rates, true-positive rates and the score thresholds they correspond to.
fpr, tpr, threshold = roc_curve(Y_test, probs)

In [None]:
# ROC curve (blue) with the AUC in the legend, plus a dashed red diagonal for reference.
plt.title('ROC')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
# Both axes are rates, so clamp them to [0, 1].
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# Write the figure to a file named 'ROC', then display it.
plt.savefig('ROC')
plt.show()

#### precision_score, recall_score, f1_score

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Precision of the 0.1-cutoff predictions on the test set.
precision_score(Y_test, predicted_classes)

In [None]:
# Recall of the same predictions.
recall_score(Y_test, predicted_classes)

In [None]:
# F1 score of the same predictions.
f1_score(Y_test, predicted_classes)

#### finding optimal cutoff probability

In [None]:
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = threshold[optimal_idx]

In [None]:
optimal_threshold

In [None]:
# Classify with the threshold picked from the ROC curve.
# roc_curve computes each (fpr, tpr) point by treating `score >= threshold` as positive,
# so `>=` is needed to reproduce the selected operating point. A strict `>` would turn
# the sample sitting exactly on the threshold into a negative.
new_predictions = np.where(probs >= optimal_threshold, 1, 0)

In [None]:
# Confusion matrix for the predictions made with the optimal threshold, labelled for display.
new_confusion_mat = confusion_matrix(Y_test, new_predictions)
new_confusion_df = pd.DataFrame(new_confusion_mat, index=['Actual 0','Actual 1'], columns=['Predicted 0','Predicted 1'])
new_confusion_df

In [None]:
# Annotated heatmap of that matrix.
_=sns.heatmap(new_confusion_df, cmap='coolwarm', annot=True)

#### New accuracy, precision_score, recall_score, f1_score

In [None]:
accuracy = sum(new_predictions == Y_test) / len(Y_test)
accuracy

In [None]:
precision_score(Y_test, new_predictions)

In [None]:
recall_score(Y_test, new_predictions)

In [None]:
f1_score(Y_test, new_predictions)

# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

#### Fitting Random Forest model to data

In [None]:
rf_model = RandomForestClassifier(n_estimators=10,max_depth=25,min_samples_split=3)

In [None]:
rf_model.fit(X_train, Y_train)

#### Predicting and evaluating the model

In [None]:
# Class predictions of the random forest for the test rows.
predictions = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of test rows the forest classifies correctly.
accuracy_score(Y_test, predictions)

In [None]:
# Precision of the forest's predictions.
precision_score(Y_test, predictions)

In [None]:
# Confusion matrix of actual vs. predicted classes, labelled for display.
# Note: this rebinds confusion_mat / confusion_df, which earlier held the logistic-regression results.
confusion_mat = confusion_matrix(Y_test, predictions)
confusion_df = pd.DataFrame(confusion_mat, index=['Actual 0','Actual 1'],\
                            columns=['Predicted 0','Predicted 1'])
confusion_df

#### Plotting important features and their level of importance

In [None]:
# Horizontal bar chart of the ten largest random-forest feature importances,
# sorted ascending so the most important feature is drawn at the top.
feature_list = X_train.columns
features = rf_model.feature_importances_
importance_table = pd.DataFrame({"Feature": list(feature_list), "Importance": features})
most_imp = importance_table.nlargest(10, "Importance").sort_values(by="Importance")
bar_positions = range(len(most_imp))
plt.figure(figsize=(10,6))
plt.barh(bar_positions, most_imp.Importance, align='center', alpha=0.8)
plt.yticks(bar_positions, most_imp.Feature, fontsize=14)
plt.xlabel('Importance')
plt.title('Most important features')
plt.savefig('Most imp')
plt.show()