In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


## Loading the training dataset

In [56]:
df = pd.read_csv("../train.csv")
df.drop(['payer_code', 'enc_id', 'patient_id'], axis=1, inplace=True)

In [57]:
df.drop_duplicates(inplace=True)

In [58]:
null_value_percentages=(df.isna().sum()/df.shape[0])*100

In [59]:
df.drop(['weight', 'max_glu_serum'], axis=1, inplace=True)

In [60]:
df.shape

(71236, 45)

- We drop `payer_code`, `enc_id`, `patient_id` columns as they have no effect on a patient being readmitted (common-sense).
- We drop `weight`, `max_glu_serum` columns as they have a high percentage of null values.
- DO WE DROP `A1Cresult` ????????????????????????????????????????????????????????
- DO WE DROP `medical_specialty` ????????????????????????????????????????????????????????
---
- We drop rows corresponding to null values for `race`, `diag_1`, `diag_2`, `diag_3` as they have very few null values (less than 2.5%)

In [61]:
# Dropping A1Cresult
df.drop(['A1Cresult'], axis=1, inplace=True)
df.shape

(71236, 44)

In [62]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
str_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="0")
for c in df.columns:
    if df[c].dtype == np.int64 or df[c].dtype == np.float64:
        imputer.fit(df[c].values.reshape(-1, 1))
        df[c] = imputer.transform(df[c].values.reshape(-1, 1)).reshape(-1,)
    else:
        str_imputer.fit(df[c].values.reshape(-1, 1))
        df[c] = str_imputer.transform(df[c].values.reshape(-1, 1)).reshape(-1,)

In [63]:
# plt.figure(figsize=(10,10))
# dataplot = sns.heatmap(df.corr(numeric_only=True), annot=True)
# plt.show()

In [64]:
# Drop row with invalid gender. Dropping 1 row.
i = df[((df.gender == 'Unknown/Invalid'))].index
df.drop(i, inplace=True)

In [65]:
columns_to_encode = df.select_dtypes(exclude=['float']).columns.tolist()

columns_to_encode.remove("age")
columns_to_encode.remove("readmission_id")
columns_to_encode.remove("time_in_hospital")
columns_to_encode.remove("num_lab_procedures")
columns_to_encode.remove("num_procedures")
columns_to_encode.remove("num_medications")
columns_to_encode.remove("number_outpatient")
columns_to_encode.remove("number_emergency")
columns_to_encode.remove("number_inpatient")
columns_to_encode.remove("number_diagnoses")

for i in columns_to_encode:
    print(i," ",df[i].unique().size)

# diag_1, diag_2 and diag_3 have many unique values, hence we are grouping

race   6
gender   2
admission_type_id   8
discharge_disposition_id   26
admission_source_id   17
medical_specialty   69
diag_1   685
diag_2   692
diag_3   746
metformin   4
repaglinide   4
nateglinide   4
chlorpropamide   3
glimepiride   4
acetohexamide   2
glipizide   4
glyburide   4
tolbutamide   2
pioglitazone   4
rosiglitazone   4
acarbose   4
miglitol   4
troglitazone   1
tolazamide   3
examide   1
citoglipton   1
insulin   4
glyburide-metformin   4
glipizide-metformin   2
glimepiride-pioglitazone   2
metformin-rosiglitazone   2
metformin-pioglitazone   2
change   2
diabetesMed   2


In [66]:
def change_diagnosis(value):    
    if value >= 1 and value <= 139:
        return "D1"
    elif value <= 239:
        return "D2"
    elif value <= 279:
        return "D3"
    elif value <= 289:
        return "D4"
    elif value <= 319:
        return "D5"
    elif value <= 389:
        return "D6"
    elif value <= 459:
        return "D7"
    elif value <= 519:
        return "D8"
    elif value <= 579:
        return "D9"
    elif value <= 629:
        return "D9"
    elif value <= 679:
        return "D10"
    elif value <= 709:
        return "D11"
    elif value <= 739:
        return "D12"
    elif value <= 759:
        return "D13"
    elif value <= 779:
        return "D14"
    elif value <= 799:
        return "D15"
    elif value <= 999:
        return "D16"
    elif value == 1000:
        return "D17"
    else:
        return "D0"
    

In [67]:
type(df["admission_source_id"][0])

numpy.int64

In [68]:
def change_admission_source_id(value):
    if value in (1, 2, 3):
        return "Referral"
    elif value in (4, 5, 6, 10, 18, 22, 25, 26):
        return "Transfer"
    elif value in (11, 12, 13, 14, 23, 24):
        return "Pregnancy"
    elif value in (9, 15, 17, 20, 21, 0):
        return "NULL"
    else:
        return "Others"  # readmission (19), emergency (7), court/law enf (8)

In [69]:
def change_admission_type_id(value):
    if value in (1, 2, 7):
        return "Emergency"
    elif value == 3:
        return "Elective"
    elif value == 4:
        return "Newborn"
    elif value in (0, 5, 6, 8):
        return "NULL"
    else:
        return "Others"

In [70]:
def change_discharge_disposition_id(value):
    if value == 1:
        return "Home_No_Treatment"
    elif value in range(2, 6) or range(15, 18) or range(22, 25) or range(27, 31) or range(9, 11):
        return "Transfer"
    elif value in (6, 8):
        return "Home_Treatment"
    elif value in (11, 19, 20, 21):
        return "Expired"
    elif value in (18, 25, 26, 0):
        return "NULL"
    else:
        return "Others"

In [71]:
diag_1_grouping_indices = df[df["diag_1"].str.startswith(('E', 'V'))].index
df.loc[diag_1_grouping_indices, "diag_1"] = "1000"
df['diag_1'] = df['diag_1'].astype(float)
df['diag_1'] = df['diag_1'].apply(change_diagnosis)

In [72]:
diag_2_grouping_indices = df[df["diag_2"].str.startswith(('E', 'V'))].index
df.loc[diag_2_grouping_indices, "diag_2"] = "1000"
df['diag_2'] = df['diag_2'].astype(float)
df['diag_2'] = df['diag_2'].apply(change_diagnosis)

In [73]:
diag_3_grouping_indices = df[df["diag_3"].str.startswith(('E', 'V'))].index
df.loc[diag_3_grouping_indices, "diag_3"] = "1000"
df['diag_3'] = df['diag_3'].astype(float)
df['diag_3'] = df['diag_3'].apply(change_diagnosis)

In [74]:
df['admission_source_id'] = df['admission_source_id'].astype(int)
df['admission_source_id'] = df['admission_source_id'].apply(change_admission_source_id)

In [75]:
df['admission_type_id'] = df['admission_type_id'].astype(int)
df['admission_type_id'] = df['admission_type_id'].apply(change_admission_type_id)

In [76]:
df['discharge_disposition_id'] = df['discharge_disposition_id'].astype(int)
df['discharge_disposition_id'] = df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [77]:
label_encoder = LabelEncoder()
df["age"] = label_encoder.fit_transform(df["age"])

In [78]:
df.set_index(pd.Index(range(0, df.shape[0])), inplace=True)

In [79]:
print(df.shape)
# Converting type of columns to category 
for c in columns_to_encode:
    df[c] = df[c].astype('category')
  
  
# Assigning numerical values and storing it in another columns 
new_cols = []
for c in columns_to_encode:
    df[c + "_new"] = df[c].cat.codes
    new_cols.append(c + "_new")
  
# Create an instance of One-hot-encoder 
enc = OneHotEncoder(handle_unknown='ignore') 
  
# Passing encoded columns 
enc_data = pd.DataFrame(enc.fit_transform(df[new_cols]).toarray()) 
  
# Merge with main 
new_df = enc_data.join(df)
new_df.drop(columns_to_encode, axis=1, inplace=True)

tmp_y = pd.DataFrame(new_df["readmission_id"])
new_df.drop(["readmission_id"], axis=1, inplace=True)
new_df.drop(new_cols, axis=1, inplace=True)
new_df = new_df.join(tmp_y)

df = new_df

(71233, 44)


In [80]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [81]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

In [82]:
knn_model = KNeighborsClassifier(n_neighbors=8)
knn_model.fit(X_train, Y_train)

Y_pred = knn_model.predict(X_test)
print(accuracy_score(Y_pred, Y_test))

0.514424089281954


In [83]:
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, Y_train)
Y_pred = decision_tree_model.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.4702744437425423

In [84]:
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, Y_train)

Y_pred = random_forest_model.predict(X_test)
print(accuracy_score(Y_pred, Y_test))

0.5750684354600969


# Testing

In [85]:
test_df = pd.read_csv("../test.csv")
test_df.drop(['payer_code', 'enc_id', 'patient_id'], axis=1, inplace=True)

In [86]:
null_value_percentages=(test_df.isna().sum()/test_df.shape[0])*100
null_value_percentages.sort_values(ascending=False)

weight                      96.898133
max_glu_serum               94.677367
A1Cresult                   83.170652
medical_specialty           49.194235
race                         2.135604
diag_3                       1.421553
diag_2                       0.373403
diag_1                       0.019653
troglitazone                 0.000000
tolbutamide                  0.000000
pioglitazone                 0.000000
rosiglitazone                0.000000
acarbose                     0.000000
miglitol                     0.000000
tolazamide                   0.000000
glipizide                    0.000000
examide                      0.000000
citoglipton                  0.000000
insulin                      0.000000
glyburide-metformin          0.000000
glipizide-metformin          0.000000
glimepiride-pioglitazone     0.000000
metformin-rosiglitazone      0.000000
metformin-pioglitazone       0.000000
change                       0.000000
glyburide                    0.000000
nateglinide 

In [87]:
test_df.drop(['weight', 'max_glu_serum', 'A1Cresult'], axis=1, inplace=True)

In [88]:
for c in test_df.columns:
    if test_df[c].dtype == np.int64 or test_df[c].dtype == np.float64:
        imputer.fit(test_df[c].values.reshape(-1, 1))
        test_df[c] = imputer.transform(test_df[c].values.reshape(-1, 1)).reshape(-1,)
    else:
        str_imputer.fit(test_df[c].values.reshape(-1, 1))
        test_df[c] = str_imputer.transform(test_df[c].values.reshape(-1, 1)).reshape(-1,)

In [89]:
diag_1_grouping_indices = test_df[test_df["diag_1"].str.startswith(('E', 'V'))].index
test_df.loc[diag_1_grouping_indices, "diag_1"] = "1000"

test_df['diag_1'] = test_df['diag_1'].astype(float)
test_df['diag_1'] = test_df['diag_1'].apply(change_diagnosis)

In [90]:
diag_2_grouping_indices = test_df[test_df["diag_2"].str.startswith(('E', 'V'))].index
test_df.loc[diag_2_grouping_indices, "diag_2"] = "1000"

test_df['diag_2'] = test_df['diag_2'].astype(float)
test_df['diag_2'] = test_df['diag_2'].apply(change_diagnosis)

In [91]:
diag_3_grouping_indices = test_df[test_df["diag_3"].str.startswith(('E', 'V'))].index
test_df.loc[diag_3_grouping_indices, "diag_3"] = "1000"

test_df['diag_3'] = test_df['diag_3'].astype(float)
test_df['diag_3'] = test_df['diag_3'].apply(change_diagnosis)

In [92]:
test_df['admission_source_id'] = test_df['admission_source_id'].astype(int)
test_df['admission_source_id'] = test_df['admission_source_id'].apply(change_admission_source_id)

In [93]:
test_df['admission_type_id'] = test_df['admission_type_id'].astype(int)
test_df['admission_type_id'] = test_df['admission_type_id'].apply(change_admission_type_id)

In [94]:
test_df['discharge_disposition_id'] = test_df['discharge_disposition_id'].astype(int)
test_df['discharge_disposition_id'] = test_df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [95]:
test_df["age"] = label_encoder.transform(test_df["age"])


In [96]:
# Converting type of columns to category 
for c in columns_to_encode:
    test_df[c] = test_df[c].astype('category')
  
  
# Assigning numerical values and storing it in another columns 
new_cols = []
for c in columns_to_encode:
    test_df[c + "_new"] = test_df[c].cat.codes
    new_cols.append(c + "_new")
  
  
# Passing encoded columns 
enc_data = pd.DataFrame(enc.transform(test_df[new_cols]).toarray()) 
  
# Merge with main 
new_df_1 = enc_data.join(test_df)
new_df_1.drop(columns_to_encode, axis=1, inplace=True)

new_df_1.drop(new_cols, axis=1, inplace=True)

test_df = new_df_1

In [97]:
X_test_data = test_df.iloc[:, :].values

In [98]:
Y_pred = random_forest_model.predict(X_test_data)
Y_pred


array([1, 2, 2, ..., 1, 1, 2], dtype=int64)

In [99]:
submit = pd.read_csv("sample_submission.csv")
submit['readmission_id'] = Y_pred
submit.head()

Unnamed: 0,enc_id,readmission_id
0,86305392,1
1,394919696,2
2,164917446,2
3,178319040,1
4,253585416,2


In [100]:
submit.to_csv("submit_tmp.csv", index = False)

In [101]:
submit["readmission_id"].value_counts()

2    20991
1     9494
0       45
Name: readmission_id, dtype: int64

In [102]:
df["readmission_id"].value_counts()

2    38402
1    24881
0     7950
Name: readmission_id, dtype: int64