In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.metrics import accuracy_score
# Allow pandas to render up to this many rows before it truncates displayed output.
MAX_DISPLAY_ROWS = 4000
pd.options.display.max_rows = MAX_DISPLAY_ROWS

In [None]:
# Read the competition's train and test CSVs into `df` and `test_df`.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './canadian-hospital-re-admittance-challenge/train.csv',
        './canadian-hospital-re-admittance-challenge/test.csv',
    )
)
df

In [None]:
# Percentage of null entries in each column of the training frame.
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)

percentages_df = pd.DataFrame({'percent_missing': percent_missing})

percentages_df

In [None]:
# Count of non-null enc_id values for each max_glu_serum level.
max_glu_serum_group = df.groupby(by=['max_glu_serum'])
max_glu_serum_group['enc_id'].count()

In [None]:
# Count of non-null enc_id values for each A1Cresult level.
a1c_result_group = df.groupby(by=['A1Cresult'])
a1c_result_group['enc_id'].count()

In [None]:
# Count of non-null enc_id values for each medical_specialty.
specialty_group = df.groupby(by=['medical_specialty'])
specialty_group['enc_id'].count()

In [None]:
# Count of non-null enc_id values for each race value.
race_group = df.groupby(by=['race'])
race_group['enc_id'].count()

In [None]:
# Replacement value for missing entries in each categorical column.
# NOTE(review): filling max_glu_serum / A1Cresult with "Norm" makes a missing
# measurement indistinguishable from a normal result — confirm this is intended.
MISSING_VALUE_FILL = {
    "max_glu_serum": "Norm",
    "A1Cresult": "Norm",
    "medical_specialty": "No-Admitting-Physician",
    "race": "Other",
}

# Apply the same imputation to the training and the test frame.
for frame in (df, test_df):
    for column, fill_value in MISSING_VALUE_FILL.items():
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: the in-place form acts on a temporary object and does
        # not reliably update the parent frame (it warns in pandas 2.x and is a
        # no-op under Copy-on-Write).
        frame[column] = frame[column].fillna(fill_value)

In [None]:
# Remove the same four columns from both frames.
dropped_columns = ['enc_id', 'patient_id', 'weight', 'payer_code']
for frame in (df, test_df):
    frame.drop(columns=dropped_columns, inplace=True)

In [None]:
# Recompute the per-column null percentage on the current state of the training frame.
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)

percentages_df = pd.DataFrame({'percent_missing': percent_missing})

percentages_df

In [None]:
# Replace each diagnosis column by a presence marker: the weight below when a
# value is present, 0 when it is missing. The weights are distinct powers of
# two, so their sum identifies exactly which of the three slots were filled.
DIAG_PRESENCE_WEIGHTS = {'diag_1': 4, 'diag_2': 2, 'diag_3': 1}

for frame in (df, test_df):
    for column, weight in DIAG_PRESENCE_WEIGHTS.items():
        # Capture the mask before overwriting any values in the column.
        present = frame[column].notnull()
        frame.loc[present, column] = weight
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which does not reliably write through to the frame (no-op under
        # pandas Copy-on-Write) and would leave NaNs in the summed feature.
        frame[column] = frame[column].fillna(0)

df.loc[:,'diag_1':'diag_3']

In [None]:
# Add 'Number_of_Diagnosis' = diag_1 + diag_2 + diag_3 to both frames.
# The training frame gets it just before its last column (offset 1); the test
# frame gets it appended at the very end (offset 0).
for frame, offset in ((df, 1), (test_df, 0)):
    new_col = frame['diag_1'] + frame['diag_2'] + frame['diag_3']
    frame.insert(loc=len(frame.columns) - offset, column='Number_of_Diagnosis', value=new_col)


In [None]:
# Per-column non-null counts for every distinct Number_of_Diagnosis value.
diag_groups = df.groupby(['Number_of_Diagnosis'])
diag_groups.count()

In [None]:
# The three diag columns are no longer needed once Number_of_Diagnosis exists.
diag_columns = ['diag_1', 'diag_2', 'diag_3']
for frame in (df, test_df):
    frame.drop(columns=diag_columns, inplace=True)

In [None]:
# Consolidated group -> raw admission_type_id values that belong to it.
admission_grouping_dict = {
    1: [1],
    2: [2],
    3: [3],
    4: [4],
    5: [5, 6, 8],
    6: [7],
}


def admission_group(row):
    """Return the consolidated group for ``row['admission_type_id']``.

    The groups in ``admission_grouping_dict`` are scanned in definition order
    and the key of the first one listing the id is returned. An id listed in
    no group yields ``None``.
    """
    raw_id = row['admission_type_id']
    return next(
        (group for group, members in admission_grouping_dict.items() if raw_id in members),
        None,
    )

# Swap admission_type_id for its grouped version in both frames.
# Training frame: new column goes just before the last column (offset 1);
# test frame: new column is appended at the end (offset 0).
for frame, offset in ((df, 1), (test_df, 0)):
    new_col = frame.apply(admission_group, axis=1)
    frame.insert(loc=len(frame.columns) - offset, column='admission_type_id_new', value=new_col)
    frame.drop(columns=['admission_type_id'], inplace=True)

# Per-column non-null counts for each admission group in the training frame.
temp_df = df.groupby(['admission_type_id_new'])
temp_df.count()

In [None]:
# Consolidated group -> raw discharge_disposition_id values that belong to it.
discharge_grouping_dict = {
    1: [11, 19, 20, 21],
    2: [18, 25, 26],
    3: [7],
    4: [1, 6, 8, 13, 14],
    5: [2, 3, 4, 5, 10, 16, 22, 23, 24, 30, 27, 28, 29],
    6: [9, 12, 15, 17],
}


def discharge_group(row):
    """Return the consolidated group for ``row['discharge_disposition_id']``.

    The groups in ``discharge_grouping_dict`` are scanned in definition order
    and the key of the first one listing the id is returned. An id listed in
    no group yields ``None``.
    """
    raw_id = row['discharge_disposition_id']
    return next(
        (group for group, members in discharge_grouping_dict.items() if raw_id in members),
        None,
    )
        
# Swap discharge_disposition_id for its grouped version in both frames.
# Training frame: new column goes just before the last column (offset 1);
# test frame: new column is appended at the end (offset 0).
for frame, offset in ((df, 1), (test_df, 0)):
    new_col = frame.apply(discharge_group, axis=1)
    frame.insert(loc=len(frame.columns) - offset, column='discharge_type_id_new', value=new_col)
    frame.drop(columns=['discharge_disposition_id'], inplace=True)

# Per-column non-null counts for each discharge group in the training frame.
temp_df = df.groupby(['discharge_type_id_new'])
temp_df.count()

In [None]:
# Consolidated group -> raw admission_source_id values that belong to it.
source_grouping_dict = {
    1: [4, 5, 6, 10, 18, 22, 25, 26],
    2: [1, 2, 3],
    3: [11, 12, 13, 14],
    4: [9, 15, 17, 20, 21],
    5: [7],
    6: [8],
}


def source_group(row):
    """Return the consolidated group for ``row['admission_source_id']``.

    The groups in ``source_grouping_dict`` are scanned in definition order and
    the key of the first one listing the id is returned. An id listed in no
    group yields ``None``.
    """
    raw_id = row['admission_source_id']
    return next(
        (group for group, members in source_grouping_dict.items() if raw_id in members),
        None,
    )

# Swap admission_source_id for its grouped version in both frames.
# Training frame: new column goes just before the last column (offset 1);
# test frame: new column is appended at the end (offset 0).
for frame, offset in ((df, 1), (test_df, 0)):
    new_col = frame.apply(source_group, axis=1)
    frame.insert(loc=len(frame.columns) - offset, column='admission_source_id_new', value=new_col)
    frame.drop(columns=['admission_source_id'], inplace=True)

# Per-column non-null counts for each source group in the training frame.
temp_df = df.groupby(['admission_source_id_new'])
temp_df.count()

In [None]:
# For every column from 'metformin' through 'diabetesMed', print how many rows
# fall under each of its values (counted on the first remaining column).
for col in df.loc[:, 'metformin':'diabetesMed'].columns:
    med_groups = df.groupby(by=[col])
    counts_by_value = med_groups.count()
    print(counts_by_value.iloc[:, 0])

In [None]:
# Columns removed from both frames in this step.
columns_to_drop = [
    'chlorpropamide', 'tolbutamide', 'miglitol', 'acarbose', 'tolazamide',
    'acetohexamide', 'troglitazone', 'examide', 'citoglipton',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone', 'glyburide-metformin',
]
for frame in (df, test_df):
    frame.drop(columns=columns_to_drop, inplace=True)
df.columns.size

In [None]:
# Columns whose value is inspected for an 'Up' / 'Down' entry.
DOSE_CHANGE_COLUMNS = (
    'metformin', 'repaglinide', 'nateglinide', 'glimepiride', 'glipizide',
    'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin',
)

# Values of the 'change' column that count as "changed".
# 'Ch' is the encoding used by the public UCI diabetes dataset whose schema this
# file appears to follow — verify against the value counts printed earlier in the
# notebook. The literal 'change' is accepted as well so data using that spelling
# keeps being counted.
CHANGE_FLAG_VALUES = ('Ch', 'change')


def count_changes(row):
    """Count the change indicators present in one row.

    Adds one for every column in ``DOSE_CHANGE_COLUMNS`` whose value is
    ``'Up'`` or ``'Down'``, plus one more when ``row['change']`` is one of
    ``CHANGE_FLAG_VALUES``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide every column in ``DOSE_CHANGE_COLUMNS`` and ``'change'``.

    Returns
    -------
    int
        Number of change indicators found (0 to 10).
    """
    count = sum(1 for col in DOSE_CHANGE_COLUMNS if row[col] in ('Up', 'Down'))
    if row['change'] in CHANGE_FLAG_VALUES:
        count += 1
    return count

# Add the per-row change count as column 'changes' to both frames.
# Training frame: inserted just before the last column (offset 1);
# test frame: appended at the end (offset 0).
for frame, offset in ((df, 1), (test_df, 0)):
    new_col = frame.apply(count_changes, axis=1)
    frame.insert(loc=len(frame.columns) - offset, column='changes', value=new_col)


In [None]:
# These columns have been summarised into 'changes'; remove them from both frames.
counted_columns = [
    'metformin', 'repaglinide', 'nateglinide', 'glimepiride', 'glipizide',
    'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change',
]
for frame in (df, test_df):
    frame.drop(columns=counted_columns, inplace=True)

df.columns.size


In [None]:
# Pairwise correlation of the listed columns (including the target), drawn as an annotated heatmap.
heatmap_columns = [
    "time_in_hospital", "num_lab_procedures", "num_procedures",
    "num_medications", "number_outpatient", "number_emergency",
    "number_inpatient", "number_diagnoses", "changes", "readmission_id",
]
plt.figure(figsize=(20, 20))
corr = df.loc[:, heatmap_columns].corr()
sns.heatmap(corr, annot=True)

In [None]:
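# Disabled: 'metformin' and the other per-drug columns have already been dropped
# from df by this point, so the slice below can no longer be evaluated as written.
# for col in df.loc[:,'metformin':'diabetesMed']:
#     df_grouped = df.groupby(by=col)['readmission_id'].value_counts(normalize=True).unstack('readmission_id')
#     df_grouped.plot.bar(stacked=True)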

In [None]:
# Share of each readmission_id class within every race value, as stacked bars.
df_grouped = (
    df.groupby(by=['race'])['readmission_id']
    .value_counts(normalize=True)
    .unstack('readmission_id')
)
df_grouped.plot.bar(stacked=True)

In [None]:
# Share of each readmission_id class within every gender value, as stacked bars.
df_grouped = (
    df.groupby(by=['gender'])['readmission_id']
    .value_counts(normalize=True)
    .unstack('readmission_id')
)
df_grouped.plot.bar(stacked=True)

In [None]:
# Share of each readmission_id class within every age value, as stacked bars.
df_grouped = (
    df.groupby(by=['age'])['readmission_id']
    .value_counts(normalize=True)
    .unstack('readmission_id')
)
df_grouped.plot.bar(stacked=True)

In [None]:
# Show the columns left in the training frame after the transformations above.
df.columns

In [None]:
# Show the columns left in the test frame, for comparison with the training frame.
test_df.columns

In [None]:
# Target is 'readmission_id'; features are every column from 'race' through 'changes'.
# NOTE: the name `input` shadows the Python builtin of the same name; it is kept
# because the encoding cell that follows reads it.
labels = df.loc[:, "readmission_id"]
input = df.loc[:, "race":"changes"]
input.columns

In [None]:
# One-hot encode the categorical feature columns of the training features.
categorical_columns = [
    'race', 'gender', 'age',
    'medical_specialty', 'max_glu_serum', 'A1Cresult', 'diabetesMed', 'Number_of_Diagnosis',
    'admission_type_id_new', 'discharge_type_id_new', 'admission_source_id_new',
]
input_encoded = pd.get_dummies(input, columns=categorical_columns)

print(input_encoded.columns)

In [None]:
# One-hot encode the same categorical columns in the test frame.
categorical_columns = [
    'race', 'gender', 'age',
    'medical_specialty', 'max_glu_serum', 'A1Cresult', 'diabetesMed', 'Number_of_Diagnosis',
    'admission_type_id_new', 'discharge_type_id_new', 'admission_source_id_new',
]
test_encoded = pd.get_dummies(test_df, columns=categorical_columns)

print(test_encoded.columns)

In [None]:
# Make the test matrix carry exactly the training matrix's columns, in the same order.
train_columns = set(input_encoded.columns)
test_columns = set(test_encoded.columns)

# Dummy columns present only in training become all-zero columns in the test matrix.
for name in input_encoded.columns:
    if name not in test_columns:
        test_encoded[name] = 0

# Columns the training matrix does not have are removed from the test matrix.
unexpected = [name for name in test_encoded.columns if name not in train_columns]
test_encoded.drop(columns=unexpected, inplace=True)

# Sorting both by column label gives them an identical column order.
input_encoded.sort_index(axis=1, inplace=True)
test_encoded.sort_index(axis=1, inplace=True)

In [None]:
# Print both column indexes so their agreement can be checked by eye.
for encoded_frame in (input_encoded, test_encoded):
    print(encoded_frame.columns)

In [None]:
# Second handle on the encoded training features; the scaling cell writes into it.
# NOTE(review): `.loc[:, :]` is not an explicit copy — whether later writes to
# `input_encoded1` also reach `input_encoded` depends on the pandas version.
# Confirm which behaviour is intended.
input_encoded1 = input_encoded.loc[:, :]

In [None]:
# Numeric columns that are standardised; the dummy columns and 'changes' are left as they are.
NUMERIC_FEATURES = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses',
]

scaler = StandardScaler()
scaled_train = scaler.fit_transform(input_encoded1[NUMERIC_FEATURES].to_numpy())
input_encoded1[NUMERIC_FEATURES] = scaled_train

# The train/validation split reads `input_encoded`, not `input_encoded1`, so the
# scaled values are written there explicitly instead of relying on the two
# frames sharing memory.
input_encoded[NUMERIC_FEATURES] = scaled_train

# The test matrix must go through the same fitted transformation; otherwise the
# models are trained on standardised values and asked to predict on raw ones.
test_encoded[NUMERIC_FEATURES] = scaler.transform(test_encoded[NUMERIC_FEATURES].to_numpy())

# NOTE(review): the scaler is fitted on all labelled rows before the
# train/validation split, so validation rows influence the scaling statistics.

In [None]:
# Hold out 25% of the labelled rows for validation, with a fixed seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    input_encoded,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Multinomial logistic regression, scored on the held-out split.
lr = LogisticRegression(multi_class="multinomial", random_state=42)
lr.fit(X_train, Y_train)

y_pred = lr.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
print(accuracy_score(Y_test, y_pred))

In [None]:
# Gaussian naive Bayes, scored on the held-out split.
nb = GaussianNB()
nb.fit(X_train, Y_train)

y_pred = nb.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
print(accuracy_score(Y_test, y_pred))

In [None]:
# Depth-limited decision tree, scored on the held-out split.
tree = DecisionTreeClassifier(random_state=42, max_depth=20)
tree.fit(X_train, Y_train)

y_pred = tree.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
print(accuracy_score(Y_test, y_pred))

In [None]:
# Random forest of 100 depth-limited trees, scored on the held-out split.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
)
rf.fit(X_train, Y_train)

y_pred = rf.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
print(accuracy_score(Y_test, y_pred))

In [None]:
# Gradient boosting (100 stages, depth 4, learning rate 0.1), scored on the held-out split.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
)
gbc.fit(X_train, Y_train)

y_pred = gbc.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
print(accuracy_score(Y_test, y_pred))

In [None]:
# Predict the test rows with the gradient-boosting model and write the submission file.
# presumably the rows of sample_submission.csv are in the same order as test.csv — verify
df_output = pd.read_csv("./canadian-hospital-re-admittance-challenge/sample_submission.csv")

test_Y = gbc.predict(test_encoded)
df_output["readmission_id"] = test_Y

df_output.to_csv("submission.csv", index=False)