# Canadian Hospital Re-admittance Challenge

- Vidhish Trivedi (IMT2021055)
- Barath S Narayan (IMT2021524)
- Vikas Kalyanapuram (IMT2021040)

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

## Loading the Training Dataset

In [2]:
df = pd.read_csv("./train.csv")

In [3]:
df.shape

(71236, 50)

In [4]:
# Percentage of missing values per column, sorted ascending.
# The result is only stored; this cell does not display it.
null_value_percentages=(df.isna().sum()/df.shape[0]).sort_values()*100

In [5]:
df.drop_duplicates(inplace=True)

Dropping columns with large number of null values, or columns which have no significant effect on the validation score.

In [6]:
# Remove these columns from the training frame in place.
# The same set of columns is dropped from the test frame in the Testing section,
# so both frames keep an identical feature layout.
df.drop(['weight', 'max_glu_serum', 'payer_code', 'diag_1', 'diag_2', 'diag_3', 'A1Cresult'], axis=1, inplace=True)

In [7]:
df.shape

(71236, 43)

Replacing the drug dosage columns with the number of `Up`, `Down`, `No`, and `Steady` values for each encounter (row)

In [8]:
# Medication columns whose per-encounter dosage status is tallied by the
# count_* helpers below.
drugs_cols = [
    "metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride",
    "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone",
    "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide",
    "examide", "citoglipton", "insulin", "glyburide-metformin",
    "glipizide-metformin", "glimepiride-pioglitazone",
    "metformin-rosiglitazone", "metformin-pioglitazone",
]


def _count_status(row, status):
    """Return how many of the ``drugs_cols`` entries of ``row`` equal ``status``."""
    return sum(1 for drug in drugs_cols if row[drug] == status)


def count_up(row):
    """Number of drug columns in ``row`` whose value is 'Up'."""
    return _count_status(row, 'Up')


def count_down(row):
    """Number of drug columns in ``row`` whose value is 'Down'."""
    return _count_status(row, 'Down')


def count_steady(row):
    """Number of drug columns in ``row`` whose value is 'Steady'."""
    return _count_status(row, 'Steady')


def count_no(row):
    """Number of drug columns in ``row`` whose value is 'No'."""
    return _count_status(row, 'No')

# Tally, for every encounter (row), how many drug columns carry each dosage
# status. This is the vectorised equivalent of applying count_up / count_down /
# count_steady / count_no row by row: compare the whole drug block against the
# label once and sum the matches across columns, instead of four Python-level
# passes over every row.
drug_status = df[drugs_cols]
df['count_up'] = drug_status.eq('Up').sum(axis=1)
df['count_down'] = drug_status.eq('Down').sum(axis=1)
df['count_steady'] = drug_status.eq('Steady').sum(axis=1)
df['count_no'] = drug_status.eq('No').sum(axis=1)
# The raw per-drug columns are replaced by the four counts above.
df.drop(drugs_cols, axis=1, inplace=True)

In [9]:
df.shape

(71236, 24)

### Outlier Detection and Removal

In [10]:
# outlier_removal_rows = ['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_diagnoses']

# # Create a copy of the DataFrame to avoid modifying the original
# df_copy = df.copy()

# for attr in outlier_removal_rows:
#     Q1 = df_copy[attr].quantile(0.25)
#     Q3 = df_copy[attr].quantile(0.75)
#     IQR = Q3 - Q1
#     lower = Q1 - 1.5 * IQR
#     upper = Q3 + 1.5 * IQR
    
#     # Create a boolean mask to identify outliers
#     outlier_mask = (df_copy[attr] < lower) | (df_copy[attr] > upper)
    
#     # Remove the rows with outliers
#     df_copy = df_copy[~outlier_mask]
 
# print("New Shape: ", df_copy.shape)

In [11]:
# df = df_copy

In [12]:
# Drop row with invalid gender. Dropping 1 row.
# Drop the rows whose gender is recorded as 'Unknown/Invalid'.
# NOTE(review): the recorded outputs show 71236 rows before this cell and 71233
# after it, so this appears to remove 3 rows rather than 1 — confirm.
i = df[((df.gender == 'Unknown/Invalid'))].index
df.drop(i, inplace=True)

Imputing missing values: numeric columns are filled with `0`, and categorical columns are filled with a new category, denoted by `"0"`.

In [13]:
from sklearn.impute import SimpleImputer

# Constant-fill imputers: numeric gaps become 0, everything else receives the
# string "0", which then behaves as its own category. Both objects stay at
# module level because the test-data cells reuse them.
imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
str_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="0")

for c in df.columns:
    # int64/float64 columns use the numeric imputer, all other dtypes the string one.
    is_numeric = df[c].dtype == np.int64 or df[c].dtype == np.float64
    chosen = imputer if is_numeric else str_imputer
    # The column is passed as an (n_rows, 1) array and flattened back afterwards.
    as_column = df[c].values.reshape(-1, 1)
    chosen.fit(as_column)
    df[c] = chosen.transform(as_column).reshape(-1,)

In [14]:
# plt.figure(figsize=(10,10))
# dataplot = sns.heatmap(df.corr(numeric_only=True), annot=True)
# plt.show()

### One-Hot Encoding
Finding the categorical columns to be one-hot encoded.

In [15]:
# Start from every non-float column, then strike out the ones that must not be
# one-hot encoded: identifiers, the target, the ordinal age bucket, and the
# numeric / count features.
columns_to_encode = df.select_dtypes(exclude=['float']).columns.tolist()

for i in (
    "age",
    "readmission_id",
    "enc_id",
    "count_up",
    "count_down",
    "count_steady",
    "count_no",
    "patient_id",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
):
    # list.remove raises ValueError if a name is unexpectedly absent.
    columns_to_encode.remove(i)

# Report the cardinality of each remaining (categorical) column.
for i in columns_to_encode:
    print(i," ",df[i].unique().size)

# diag_1, diag_2 and diag_3 were dropped earlier in the notebook, so they do not
# appear here; the grouping helpers defined further down are for those columns.

race   6
gender   2
admission_type_id   8
discharge_disposition_id   26
admission_source_id   17
medical_specialty   69
change   2
diabetesMed   2


Adding a new column to count the number of encounters for each patient id

In [16]:
df['f_patient_id'] = df['patient_id'].copy(deep=True)

In [19]:
cnt_dict = df['patient_id'].value_counts()

Assigning the frequency values in the new column

In [20]:
# Encounter frequency per row: look each patient_id up in the value_counts
# Series (cnt_dict). A single vectorised map replaces the previous per-row loop,
# which re-filtered the whole frame for every row and, because it matched on the
# column it was overwriting, could re-map a freshly written count that happened
# to equal another patient's id.
df['f_patient_id'] = df['patient_id'].map(cnt_dict)

In [22]:
df.drop(['patient_id'], axis=1, inplace=True)

### Grouping Functions for diag_1, diag_2, diag_3, admission_type_id, admission_source_id, discharge_disposition_id

In [23]:
def change_diagnosis(value):
    """Map a numeric diagnosis code to a coarse group label ("D0" .. "D17").

    Parameters
    ----------
    value : int or float
        Numeric diagnosis code. The (commented-out) callers in this notebook
        convert codes starting with 'E' or 'V' to 1000 before calling this.

    Returns
    -------
    str
        "D1" for 1-139, "D2" up to 239, "D3" up to 279, "D4" up to 289,
        "D5" up to 319, "D6" up to 389, "D7" up to 459, "D8" up to 519,
        "D9" up to 629, "D10" up to 679, "D11" up to 709, "D12" up to 739,
        "D13" up to 759, "D14" up to 779, "D15" up to 799, "D16" up to 999,
        "D17" for exactly 1000, and "D0" for anything else (values below 1,
        values above 999 other than 1000, and NaN).
    """
    # Values below 1 (for example a missing code imputed as 0) and NaN belong to
    # no group. Without this guard they would fall through to the "<= 239"
    # check and be labelled "D2". NaN fails every comparison, so `not (NaN >= 1)`
    # is True and it is caught here as well.
    if not value >= 1:
        return "D0"

    # (inclusive upper bound, label), checked in ascending order.
    # NOTE(review): the bounds 579 and 629 both map to "D9", exactly as in the
    # original code; confirm whether the second range was meant to be a group
    # of its own.
    upper_bounds = (
        (139, "D1"),
        (239, "D2"),
        (279, "D3"),
        (289, "D4"),
        (319, "D5"),
        (389, "D6"),
        (459, "D7"),
        (519, "D8"),
        (579, "D9"),
        (629, "D9"),
        (679, "D10"),
        (709, "D11"),
        (739, "D12"),
        (759, "D13"),
        (779, "D14"),
        (799, "D15"),
        (999, "D16"),
    )
    for upper, label in upper_bounds:
        if value <= upper:
            return label

    # 1000 is the sentinel used for the 'E'/'V' codes.
    if value == 1000:
        return "D17"
    return "D0"

In [24]:
def change_admission_source_id(value):
    """Collapse an admission-source id into one of five coarse labels.

    Returns "Referral", "Transfer", "Pregnancy" or "NULL" when ``value`` is one
    of the ids listed for that label, and "Others" for every other value.
    """
    grouped_ids = (
        ("Referral", (1, 2, 3)),
        ("Transfer", (4, 5, 6, 10, 18, 22, 25, 26)),
        ("Pregnancy", (11, 12, 13, 14, 23, 24)),
        ("NULL", (9, 15, 17, 20, 21, 0)),
    )
    for label, ids in grouped_ids:
        if value in ids:
            return label
    # Everything not listed above: readmission (19), emergency (7), court/law enf (8)
    return "Others"

In [25]:
def change_admission_type_id(value):
    """Collapse an admission-type id into a coarse label.

    1, 2 and 7 give "Emergency"; 3 gives "Elective"; 4 gives "Newborn";
    0, 5, 6 and 8 give "NULL"; any other value gives "Others".
    """
    if value == 3:
        return "Elective"
    if value == 4:
        return "Newborn"
    if value in (1, 2, 7):
        return "Emergency"
    if value in (0, 5, 6, 8):
        return "NULL"
    return "Others"

In [26]:
def change_discharge_disposition_id(value):
    """Collapse a discharge-disposition id into a coarse label.

    Returns
    -------
    str
        "Home_No_Treatment" for 1;
        "Transfer" for 2-5, 9-10, 15-17, 22-24 and 27-30;
        "Home_Treatment" for 6 and 8;
        "Expired" for 11, 19, 20 and 21;
        "NULL" for 0, 18, 25 and 26;
        "Others" for any other value.
    """
    # Each range must be tested for membership individually. The previous
    # `value in range(2, 6) or range(15, 18) or ...` evaluated the bare range
    # objects for truthiness, so every value other than 1 came back as
    # "Transfer" and the branches below were unreachable.
    transfer_ids = (
        *range(2, 6),
        *range(9, 11),
        *range(15, 18),
        *range(22, 25),
        *range(27, 31),
    )
    if value == 1:
        return "Home_No_Treatment"
    elif value in transfer_ids:
        return "Transfer"
    elif value in (6, 8):
        return "Home_Treatment"
    elif value in (11, 19, 20, 21):
        return "Expired"
    elif value in (18, 25, 26, 0):
        return "NULL"
    else:
        return "Others"

In [27]:
# diag_1_grouping_indices = df[df["diag_1"].str.startswith(('E', 'V'))].index
# df.loc[diag_1_grouping_indices, "diag_1"] = "1000"
# df['diag_1'] = df['diag_1'].astype(float)
# df['diag_1'] = df['diag_1'].apply(change_diagnosis)

In [28]:
# diag_2_grouping_indices = df[df["diag_2"].str.startswith(('E', 'V'))].index
# df.loc[diag_2_grouping_indices, "diag_2"] = "1000"
# df['diag_2'] = df['diag_2'].astype(float)
# df['diag_2'] = df['diag_2'].apply(change_diagnosis)

In [29]:
# diag_3_grouping_indices = df[df["diag_3"].str.startswith(('E', 'V'))].index
# df.loc[diag_3_grouping_indices, "diag_3"] = "1000"
# df['diag_3'] = df['diag_3'].astype(float)
# df['diag_3'] = df['diag_3'].apply(change_diagnosis)

In [30]:
# df['admission_source_id'] = df['admission_source_id'].astype(int)
# df['admission_source_id'] = df['admission_source_id'].apply(change_admission_source_id)

In [31]:
# df['admission_type_id'] = df['admission_type_id'].astype(int)
# df['admission_type_id'] = df['admission_type_id'].apply(change_admission_type_id)

In [32]:
# df['discharge_disposition_id'] = df['discharge_disposition_id'].astype(int)
# df['discharge_disposition_id'] = df['discharge_disposition_id'].apply(change_discharge_disposition_id)

Label encoding the age field

In [33]:
# Encode the age column as integer labels. The fitted encoder is kept at module
# level because the Testing section reuses it to transform the test frame.
label_encoder = LabelEncoder()
df["age"] = label_encoder.fit_transform(df["age"])

Trying out an average-based approach for converting age categories

In [None]:
# def change_age(value):
#     if(value == '[0-10)'):
#         return 5
#     elif(value == '[10-20)'):
#         return 15
#     elif(value == '[20-30)'):
#         return 25
#     elif(value == '[30-40)'):
#         return 35
#     elif(value == '[40-50)'):
#         return 45
#     elif(value == '[50-60)'):
#         return 55
#     elif(value == '[60-70)'):
#         return 65
#     elif(value == '[70-80)'):
#         return 75
#     elif(value == '[80-90)'):
#         return 85
#     elif(value == '[90-100)'):
#         return 95

# df['age'] = df['age'].apply(change_age)

In [34]:
# Re-number the rows 0..n-1. Rows were dropped earlier, and the one-hot cell
# below joins a freshly built frame (default 0..n-1 index) to df on the index,
# so the two indexes have to line up.
df.set_index(pd.Index(range(0, df.shape[0])), inplace=True)

Applying One-Hot Encoding on columns_to_encode

In [35]:
print(df.shape)

# For each categorical column: cast to pandas 'category' and store its integer
# category codes in a sibling "<name>_new" column.
# NOTE(review): the codes depend on the set of values present in this frame.
# The Testing section derives codes from the test frame independently, so the
# same code may stand for different values there — confirm.
new_cols = [c + "_new" for c in columns_to_encode]
for c, code_col in zip(columns_to_encode, new_cols):
    df[c] = df[c].astype('category')
    df[code_col] = df[c].cat.codes

# One-hot encode the code columns; the fitted encoder is reused on the test data.
enc = OneHotEncoder(handle_unknown='ignore')
one_hot = enc.fit_transform(df[new_cols]).toarray()
enc_data = pd.DataFrame(one_hot)

# Put the one-hot block in front of the remaining features (index join), drop
# the raw categorical columns and their code columns, and move the target
# 'readmission_id' to the last position.
new_df = enc_data.join(df)
tmp_y = pd.DataFrame(new_df["readmission_id"])
new_df.drop(columns_to_encode + ["readmission_id"] + new_cols, axis=1, inplace=True)
new_df = new_df.join(tmp_y)

df = new_df

(71233, 24)


### Train-Test Split

In [36]:
# Features are every column except the last; the target is the last column
# ('readmission_id', moved there by the one-hot cell).
# NOTE(review): 'enc_id' is never dropped from df, so it is part of X — confirm
# that an encounter identifier is intended to be a model feature.
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

### Training Various Models
Trying out KNN classifier, Decision Tree Classifier, Random Forest Classifier, XGBoost Classifier, LGBM Classifier, CatBoost Classifier.

In [38]:
# knn_model = KNeighborsClassifier(n_neighbors=8)
# knn_model.fit(X_train, Y_train)

# Y_pred = knn_model.predict(X_test)
# print(accuracy_score(Y_pred, Y_test))

In [39]:
# decision_tree_model = DecisionTreeClassifier(random_state=42)
# decision_tree_model.fit(X_train, Y_train)
# Y_pred = decision_tree_model.predict(X_test)
# accuracy_score(Y_test, Y_pred)

In [40]:
# a = 0.000991
# random_forest_model = RandomForestClassifier(max_leaf_nodes = int(a * 1e6), random_state=0)
# random_forest_model.fit(X_train, Y_train)

# Y_pred = random_forest_model.predict(X_test)
# print(accuracy_score(Y_pred, Y_test))

Randomised Cross Validation Search

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# # Create the random grid
# random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
#                 'max_features': ['log2', 'sqrt', 'none'],
#                 'max_depth': [int(x) for x in range(1, 13)],
#                 'min_samples_split': [x for x in range(2, 100, 5)],
#                 'min_samples_leaf': [x for x in range(3, 15)],
#                 'bootstrap': [True, False],
#                 'criterion': ['gini', 'entropy', 'log_loss'],
#                 'oob_score': [True, False],
#                 'class_weight': ['balanced', 'balanced_subsample']}

# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1, scoring='accuracy')
# # Fit the random search model
# rf_random.fit(X_train, Y_train)
# print(rf_random.best_params_)

Grid Search Cross Validation

In [None]:
# from sklearn.model_selection import GridSearchCV
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }
# # Create a based model
# rf = RandomForestClassifier()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv=3, n_jobs=-1, verbose = 2)

# # # Fit the grid search to the data
# # grid_search.fit(X_train, Y_train)
# print(grid_search.best_params_)
# best_grid = grid_search.best_estimator_

In [41]:
# import xgboost as xgb

# # Train a model using the scikit-learn API
# xgb_classifier = xgb.XGBClassifier(n_estimators=100, objective='weighted:logistic', tree_method='hist', eta=0.1, max_depth=3)
# xgb_classifier.fit(X_train, Y_train)

# # Convert the model to a native API model
# model = xgb_classifier.get_booster()

In [None]:
# from catboost import CatBoostClassifier
# model = CatBoostClassifier(
#     iterations=5,
#     learning_rate=0.1,
#     # loss_function='CrossEntropy'
# )
# model.fit(
#     X_train, Y_train,
#     cat_features=cat_features, # Need to create additional list
#     eval_set=(X_test, Y_test),
#     verbose=False
# )
# print('Model is fitted: ' + str(model.is_fitted()))
# print('Model params:')
# print(model.get_params())

In [42]:
from lightgbm import LGBMClassifier

# Train a multiclass LightGBM model (DART boosting, 300 estimators, no depth
# limit) through its scikit-learn style API; random_state is fixed to 0.
lgb_classifier = LGBMClassifier(n_estimators=300, objective='multiclass', max_depth=-1, random_state=0, boosting_type='dart')
lgb_classifier.fit(X_train, Y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 771
[LightGBM] [Info] Number of data points in the train set: 56986, number of used features: 101
[LightGBM] [Info] Start training from score -2.192777
[LightGBM] [Info] Start training from score -1.051835
[LightGBM] [Info] Start training from score -0.617859


### Validation

In [43]:
# f1_score is imported here but this cell only reports accuracy.
from sklearn.metrics import f1_score
# Accuracy of the LightGBM model on the held-out 20% split.
Y_pred = lgb_classifier.predict(X_test)
print(accuracy_score(Y_test, Y_pred))

0.6948129430757353


In [44]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix(Y_pred, Y_test)

# Testing

In [45]:
test_df = pd.read_csv("./test.csv")

In [46]:
# Drop the same set of columns that was removed from the training frame.
test_df.drop(['weight', 'max_glu_serum', 'A1Cresult', 'payer_code', 'diag_1', 'diag_2', 'diag_3'], axis=1, inplace=True)

In [47]:
# Apply the function row-wise
# Per-encounter dosage-status counts for the test frame. This is the vectorised
# equivalent of applying count_up / count_down / count_steady / count_no row by
# row: compare the whole drug block against each label and sum across columns.
test_drug_status = test_df[drugs_cols]
test_df['count_up'] = test_drug_status.eq('Up').sum(axis=1)
test_df['count_down'] = test_drug_status.eq('Down').sum(axis=1)
test_df['count_steady'] = test_drug_status.eq('Steady').sum(axis=1)
test_df['count_no'] = test_drug_status.eq('No').sum(axis=1)
# The raw per-drug columns are replaced by the four counts above.
test_df.drop(drugs_cols, axis=1, inplace=True)

In [48]:
# Fill missing values in the test frame with the same constants as the training
# frame: 0 for int64/float64 columns, "0" for everything else. The imputers
# created for the training data are re-fitted on each test column here.
for c in test_df.columns:
    is_numeric = test_df[c].dtype == np.int64 or test_df[c].dtype == np.float64
    chosen = imputer if is_numeric else str_imputer
    # The column is passed as an (n_rows, 1) array and flattened back afterwards.
    as_column = test_df[c].values.reshape(-1, 1)
    chosen.fit(as_column)
    test_df[c] = chosen.transform(as_column).reshape(-1,)

In [49]:
# Reuse the encoder fitted on the training ages so both frames share one mapping.
test_df["age"] = label_encoder.transform(test_df["age"])


In [50]:
# test_df.set_index(test_df['patient_id'], inplace=True)
cnt_dict_1 = test_df['patient_id'].value_counts()

### Here, to calculate the final values in the f_patient_id column, we add the encounters for a given patient_id from the train and the test data.
This is intuitive, since a patient who had an encounter count of x in the train data should be assigned an encounter count of x + n (where n is the encounter count of the same patient in the test data), and not just n.

In [51]:
# Encounter count for every test row: the patient's encounters in the test data
# (cnt_dict_1) plus, when the patient also appears in the training data, their
# training encounters (cnt_dict). Patients absent from the training counts map
# to NaN, which is treated as 0. Vectorised lookups replace the previous
# per-row loop that re-filtered the whole frame on every iteration.
test_encounters = test_df['patient_id'].map(cnt_dict_1)
train_encounters = test_df['patient_id'].map(cnt_dict).fillna(0).astype('int64')
test_df['f_patient_id'] = test_encounters + train_encounters

In [52]:
test_df.drop(['patient_id'], axis=1, inplace=True)

In [53]:
# Converting type of columns to category 
for c in columns_to_encode:
    test_df[c] = test_df[c].astype('category')

# Assigning numerical values and storing it in another columns 
new_cols = []
for c in columns_to_encode:
    test_df[c + "_new"] = test_df[c].cat.codes
    new_cols.append(c + "_new")
  
  
# Passing encoded columns 
enc_data = pd.DataFrame(enc.transform(test_df[new_cols]).toarray()) 
  
# Merge with main 
new_df_1 = enc_data.join(test_df)
new_df_1.drop(columns_to_encode, axis=1, inplace=True)

new_df_1.drop(new_cols, axis=1, inplace=True)

test_df = new_df_1

In [54]:
X_test_data = test_df.iloc[:, :].values

In [55]:
Y_pred = lgb_classifier.predict(X_test_data)
Y_pred


array([1, 2, 1, ..., 1, 1, 1], dtype=int64)

In [56]:
# Fill the sample submission with the predictions.
# NOTE(review): Y_pred is assigned positionally, so this assumes the rows of
# sample_submission.csv are in the same order as test.csv — confirm.
submit = pd.read_csv("./sample_submission.csv")
submit['readmission_id'] = Y_pred
submit.head()

Unnamed: 0,enc_id,readmission_id
0,86305392,1
1,394919696,2
2,164917446,1
3,178319040,1
4,253585416,1


In [57]:
submit.to_csv("submit_tmp.csv", index = False)

In [58]:
submit["readmission_id"].value_counts()

2    18125
1    12120
0      285
Name: readmission_id, dtype: int64

In [59]:
df["readmission_id"].value_counts()

2    38402
1    24881
0     7950
Name: readmission_id, dtype: int64