In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from pprint import pprint

## Loading the training dataset

In [60]:
# Read the training split and immediately discard the payer-code and
# diagnosis-code columns, which are not used anywhere below.
df = pd.read_csv("../../train.csv").drop(
    columns=['payer_code', 'diag_1', 'diag_2', 'diag_3']
)

In [61]:
df.drop_duplicates(inplace=True)

In [62]:
null_value_percentages=(df.isna().sum()/df.shape[0])*100
null_value_percentages.sort_values()

enc_id                       0.000000
acetohexamide                0.000000
glipizide                    0.000000
glyburide                    0.000000
tolbutamide                  0.000000
pioglitazone                 0.000000
rosiglitazone                0.000000
acarbose                     0.000000
miglitol                     0.000000
troglitazone                 0.000000
tolazamide                   0.000000
examide                      0.000000
citoglipton                  0.000000
insulin                      0.000000
glyburide-metformin          0.000000
glipizide-metformin          0.000000
glimepiride-pioglitazone     0.000000
metformin-rosiglitazone      0.000000
metformin-pioglitazone       0.000000
change                       0.000000
glimepiride                  0.000000
chlorpropamide               0.000000
nateglinide                  0.000000
repaglinide                  0.000000
patient_id                   0.000000
gender                       0.000000
age         

In [63]:
df.drop(['weight', 'max_glu_serum'], axis=1, inplace=True)

In [64]:
# Medication columns; each one is inspected for the values 'Up' / 'Down'.
drugs_cols = [
    "metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride",
    "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone",
    "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide",
    "examide", "citoglipton", "insulin", "glyburide-metformin",
    "glipizide-metformin", "glimepiride-pioglitazone",
    "metformin-rosiglitazone", "metformin-pioglitazone",
]


def count_changes(row):
    """Return how many `drugs_cols` entries of `row` equal 'Up' or 'Down'.

    `row` must be indexable by column name (for example a single DataFrame
    row). Kept as a per-row helper for callers that use `DataFrame.apply`.
    """
    return sum(1 for col in drugs_cols if row[col] in ('Up', 'Down'))


# Vectorised equivalent of `df.apply(count_changes, axis=1)`: flag every
# 'Up'/'Down' cell and add the flags up per row. This gives the same integer
# counts without a Python-level call per row, and without first creating a
# placeholder column of zeros that is overwritten straight away.
df['count_changes'] = df[drugs_cols].isin(['Up', 'Down']).sum(axis=1)

# The individual medication columns are replaced by the aggregate count.
df = df.drop(columns=drugs_cols)

In [65]:
df['count_changes'].value_counts()

0    51867
1    18385
2      905
3       75
4        4
Name: count_changes, dtype: int64

In [66]:
df.shape

(71236, 22)

In [67]:
# Dropping A1Cresult
df.drop(['A1Cresult'], axis=1, inplace=True)
df.shape

(71236, 21)

In [68]:
# Columns screened with the 1.5 * IQR rule.
outlier_removal_rows = ['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_diagnoses']

# All filtering happens on a copy; `df` itself is not changed by this cell.
df_copy = df.copy()

for attr in outlier_removal_rows:
    # Quartiles are taken from the rows that survived the previous columns.
    first_q, third_q = (df_copy[attr].quantile(q) for q in (0.25, 0.75))
    spread = third_q - first_q
    lower, upper = first_q - 1.5 * spread, third_q + 1.5 * spread

    # Keep only rows that are neither below the lower nor above the upper fence.
    too_low = df_copy[attr] < lower
    too_high = df_copy[attr] > upper
    df_copy = df_copy[~(too_low | too_high)]

print("New Shape: ", df_copy.shape)

New Shape:  (64467, 21)


In [69]:
# df = df_copy

In [70]:
df.iloc[1101, :]

enc_id                            111133884
patient_id                         35624655
race                        AfricanAmerican
gender                               Female
age                                 [40-50)
admission_type_id                         1
discharge_disposition_id                  3
admission_source_id                       7
time_in_hospital                         12
medical_specialty                Nephrology
num_lab_procedures                       63
num_procedures                            3
num_medications                          23
number_outpatient                         0
number_emergency                          0
number_inpatient                          6
number_diagnoses                          9
change                                   No
diabetesMed                             Yes
readmission_id                            1
count_changes                             0
Name: 1101, dtype: object

In [71]:
from sklearn.impute import SimpleImputer

# Constant-fill imputers: missing numeric values become 0, missing values in
# any other column become the string "0".
imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
str_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="0")

for c in df.columns:
    # scikit-learn expects a 2-D array, so each column is reshaped to (n, 1)
    # for the imputer and flattened again before it is written back.
    column_2d = df[c].values.reshape(-1, 1)
    is_numeric = df[c].dtype == np.int64 or df[c].dtype == np.float64
    filler = imputer if is_numeric else str_imputer
    filler.fit(column_2d)
    df[c] = filler.transform(column_2d).reshape(-1,)

In [72]:
# plt.figure(figsize=(10,10))
# dataplot = sns.heatmap(df.corr(numeric_only=True), annot=True)
# plt.show()

In [73]:
# Drop every row whose gender is recorded as 'Unknown/Invalid'
# (3 rows in the recorded run: the frame goes from 71236 to 71233 rows).
invalid_gender_rows = df.loc[df.gender == 'Unknown/Invalid'].index
df.drop(index=invalid_gender_rows, inplace=True)

In [74]:
# Start from every non-float column, then take out the ones that are not
# one-hot encoded. `remove` raises ValueError if a name is unexpectedly absent.
columns_to_encode = list(df.select_dtypes(exclude=['float']).columns)

for excluded in (
    "age",
    "readmission_id",
    "number_outpatient",
    "enc_id",
    "patient_id",
    "number_emergency",
    "number_inpatient",
    "count_changes",
):
    columns_to_encode.remove(excluded)

# time_in_hospital, num_lab_procedures, num_procedures, num_medications and
# number_diagnoses are not removed, so they are one-hot encoded as well.

# Print the number of distinct values of each column that will be encoded.
for col in columns_to_encode:
    print(col, " ", df[col].unique().size)

# diag_1, diag_2 and diag_3 are dropped right after loading, so they do not
# appear in this list.

race   6
gender   2
admission_type_id   8
discharge_disposition_id   26
admission_source_id   17
time_in_hospital   14
medical_specialty   69
num_lab_procedures   116
num_procedures   7
num_medications   74
number_diagnoses   16
change   2
diabetesMed   2


In [75]:
# (inclusive upper bound, group label) pairs, in ascending order of the bound.
# NOTE(review): the 520-579 and 580-629 bands both map to "D9", so the label
# sequence effectively skips one group -- confirm this merge is intended.
_DIAGNOSIS_GROUPS = (
    (139, "D1"), (239, "D2"), (279, "D3"), (289, "D4"), (319, "D5"),
    (389, "D6"), (459, "D7"), (519, "D8"), (579, "D9"), (629, "D9"),
    (679, "D10"), (709, "D11"), (739, "D12"), (759, "D13"), (779, "D14"),
    (799, "D15"), (999, "D16"),
)


def change_diagnosis(value):
    """Map a numeric diagnosis code to a coarse group label.

    Parameters
    ----------
    value : int or float
        Numeric diagnosis code (presumably an ICD-9 code number -- confirm).
        The commented-out callers in this notebook map codes starting with
        'E' or 'V' to 1000 before calling this function.

    Returns
    -------
    str
        "D1" .. "D16" for codes from 1 up to 999, "D17" for exactly 1000,
        and "D0" for everything else (values below 1, NaN, values strictly
        between 999 and 1000, and values above 1000).
    """
    # Values below 1 used to fail the first range test and then match the
    # bare `value <= 239` test, so an imputed 0 was labelled "D2". Reject
    # them up front. `not value >= 1` is also true for NaN, which therefore
    # keeps mapping to "D0".
    if not value >= 1:
        return "D0"

    for upper, label in _DIAGNOSIS_GROUPS:
        if value <= upper:
            return label

    return "D17" if value == 1000 else "D0"
    

In [76]:
type(df["admission_source_id"][0])

numpy.int64

In [77]:
def change_admission_source_id(value):
    """Collapse a numeric admission-source id into one of five labels.

    Returns "Referral", "Transfer", "Pregnancy" or "NULL" for the ids listed
    below, and "Others" for any id that is not listed.
    """
    groups = (
        ("Referral", (1, 2, 3)),
        ("Transfer", (4, 5, 6, 10, 18, 22, 25, 26)),
        ("Pregnancy", (11, 12, 13, 14, 23, 24)),
        ("NULL", (9, 15, 17, 20, 21, 0)),
    )
    for label, ids in groups:
        if value in ids:
            return label
    # Everything else, e.g. readmission (19), emergency (7), court/law enf (8).
    return "Others"

In [78]:
def change_admission_type_id(value):
    """Collapse a numeric admission-type id into one of five labels.

    1, 2 and 7 become "Emergency", 3 becomes "Elective", 4 becomes "Newborn",
    0, 5, 6 and 8 become "NULL"; any other id becomes "Others".
    """
    # The id sets are disjoint, so the order of these guards does not matter.
    if value == 3:
        return "Elective"
    if value == 4:
        return "Newborn"
    if value in (1, 2, 7):
        return "Emergency"
    if value in (0, 5, 6, 8):
        return "NULL"
    return "Others"

In [79]:
def change_discharge_disposition_id(value):
    """Collapse a numeric discharge-disposition id into a coarse label.

    Returns
    -------
    str
        "Home_No_Treatment" for 1;
        "Transfer" for 2-5, 9-10, 15-17, 22-24 and 27-30;
        "Home_Treatment" for 6 and 8;
        "Expired" for 11, 19, 20 and 21;
        "NULL" for 0, 18, 25 and 26;
        "Others" for any other id.
    """
    # The previous test was `value in range(2, 6) or range(15, 18) or ...`.
    # Only the first range was actually tested for membership; the remaining
    # operands were bare non-empty range objects, which are always truthy.
    # Every id other than 1 was therefore labelled "Transfer" and the
    # branches below were unreachable. List the ids explicitly instead.
    transfer_ids = (
        2, 3, 4, 5,
        9, 10,
        15, 16, 17,
        22, 23, 24,
        27, 28, 29, 30,
    )

    if value == 1:
        return "Home_No_Treatment"
    elif value in transfer_ids:
        return "Transfer"
    elif value in (6, 8):
        return "Home_Treatment"
    elif value in (11, 19, 20, 21):
        return "Expired"
    elif value in (18, 25, 26, 0):
        return "NULL"
    else:
        return "Others"

In [80]:
# diag_1_grouping_indices = df[df["diag_1"].str.startswith(('E', 'V'))].index
# df.loc[diag_1_grouping_indices, "diag_1"] = "1000"
# df['diag_1'] = df['diag_1'].astype(float)
# df['diag_1'] = df['diag_1'].apply(change_diagnosis)

In [81]:
# diag_2_grouping_indices = df[df["diag_2"].str.startswith(('E', 'V'))].index
# df.loc[diag_2_grouping_indices, "diag_2"] = "1000"
# df['diag_2'] = df['diag_2'].astype(float)
# df['diag_2'] = df['diag_2'].apply(change_diagnosis)

In [82]:
# diag_3_grouping_indices = df[df["diag_3"].str.startswith(('E', 'V'))].index
# df.loc[diag_3_grouping_indices, "diag_3"] = "1000"
# df['diag_3'] = df['diag_3'].astype(float)
# df['diag_3'] = df['diag_3'].apply(change_diagnosis)

In [83]:
# df['admission_source_id'] = df['admission_source_id'].astype(int)
# df['admission_source_id'] = df['admission_source_id'].apply(change_admission_source_id)

In [84]:
# df['admission_type_id'] = df['admission_type_id'].astype(int)
# df['admission_type_id'] = df['admission_type_id'].apply(change_admission_type_id)

In [85]:
# df['discharge_disposition_id'] = df['discharge_disposition_id'].astype(int)
# df['discharge_disposition_id'] = df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [86]:
label_encoder = LabelEncoder()
df["age"] = label_encoder.fit_transform(df["age"])

In [87]:
df.set_index(pd.Index(range(0, df.shape[0])), inplace=True)

In [88]:
# One-hot encode the training frame: every column in `columns_to_encode` is
# turned into integer category codes, the codes are one-hot encoded with
# `enc`, and the result replaces `df` with the target as the last column.
print(df.shape)
# Convert each column to pandas' category dtype so that `.cat.codes` is available.
for c in columns_to_encode:
    df[c] = df[c].astype('category')
  
  
# Store the integer category codes in "<column>_new" helper columns.
# NOTE(review): the codes are derived from the values present in this frame;
# the value -> code mapping is not kept once the original columns are dropped.
new_cols = []
for c in columns_to_encode:
    df[c + "_new"] = df[c].cat.codes
    new_cols.append(c + "_new")
  
# Encoder fitted below; codes it has not seen at fit time are ignored at
# transform time (handle_unknown='ignore') instead of raising.
enc = OneHotEncoder(handle_unknown='ignore') 
  
# Fit on the code columns and densify; the resulting frame has default
# integer column labels.
enc_data = pd.DataFrame(enc.fit_transform(df[new_cols]).toarray()) 
  
# Join on the index (df was re-indexed to 0..n-1 in the previous cell), then
# drop the original categorical columns.
new_df = enc_data.join(df)
new_df.drop(columns_to_encode, axis=1, inplace=True)

# Move the target to the end and drop the "<column>_new" helper columns, so
# the later `iloc[:, :-1]` / `iloc[:, -1]` split yields features / target.
# NOTE(review): enc_id and patient_id are not in columns_to_encode and are
# not dropped here, so they remain among the features -- confirm intended.
tmp_y = pd.DataFrame(new_df["readmission_id"])
new_df.drop(["readmission_id"], axis=1, inplace=True)
new_df.drop(new_cols, axis=1, inplace=True)
new_df = new_df.join(tmp_y)

df = new_df

(71233, 21)


In [89]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [90]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

In [91]:
from sklearn.metrics import f1_score
# random_forest_model = RandomForestClassifier(max_leaf_nodes=900, random_state=0)
# random_forest_model.fit(X_train, Y_train)

# Y_pred = random_forest_model.predict(X_test)
# print(accuracy_score(Y_pred, Y_test))

In [92]:
import xgboost as xgb

# Wrap the train and hold-out splits in XGBoost's native DMatrix container.
dtrain_clf, dtest_clf = (
    xgb.DMatrix(data=features, label=labels, enable_categorical=True)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)

In [93]:
import xgboost as xgb

# Train a model using the scikit-learn API.
#
# 'weighted:logistic' is not an objective XGBoost defines. It only appeared to
# work because the scikit-learn wrapper substitutes a multi-class objective
# when the target has more than two classes (readmission_id takes the values
# 0, 1 and 2). The multi-class objective is now named explicitly, so the
# configuration says what is actually trained. `learning_rate` is the
# wrapper's documented name for the booster parameter `eta`.
xgb_classifier = xgb.XGBClassifier(
    n_estimators=100,
    objective='multi:softprob',
    tree_method='hist',
    learning_rate=0.1,
    max_depth=3,
)
xgb_classifier.fit(X_train, Y_train)

# Handle to the underlying native Booster of the fitted classifier.
model = xgb_classifier.get_booster()

In [94]:
from sklearn.metrics import f1_score
Y_pred = xgb_classifier.predict(X_test)
print(accuracy_score(Y_test, Y_pred))

0.5996350108794835


In [95]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix(Y_pred, Y_test)

# Testing

In [96]:
# Read the test split and discard the same payer-code and diagnosis-code
# columns that are dropped from the training data.
test_df = pd.read_csv("../../test.csv").drop(
    columns=['payer_code', 'diag_1', 'diag_2', 'diag_3']
)

In [97]:
null_value_percentages=(test_df.isna().sum()/test_df.shape[0])*100
null_value_percentages.sort_values(ascending=False)

weight                      96.898133
max_glu_serum               94.677367
A1Cresult                   83.170652
medical_specialty           49.194235
race                         2.135604
enc_id                       0.000000
tolazamide                   0.000000
glyburide                    0.000000
tolbutamide                  0.000000
pioglitazone                 0.000000
rosiglitazone                0.000000
acarbose                     0.000000
miglitol                     0.000000
troglitazone                 0.000000
citoglipton                  0.000000
examide                      0.000000
acetohexamide                0.000000
insulin                      0.000000
glyburide-metformin          0.000000
glipizide-metformin          0.000000
glimepiride-pioglitazone     0.000000
metformin-rosiglitazone      0.000000
metformin-pioglitazone       0.000000
change                       0.000000
glipizide                    0.000000
nateglinide                  0.000000
glimepiride 

In [98]:
test_df.drop(['weight', 'max_glu_serum', 'A1Cresult'], axis=1, inplace=True)

In [99]:
# Per row, count how many medication columns hold 'Up' or 'Down'. This is the
# vectorised equivalent of `test_df.apply(count_changes, axis=1)`: it yields
# the same integer counts without a Python-level call per row, and without
# first creating a placeholder column of zeros that is overwritten at once.
test_df['count_changes'] = test_df[drugs_cols].isin(['Up', 'Down']).sum(axis=1)

# The individual medication columns are replaced by the aggregate count.
test_df = test_df.drop(columns=drugs_cols)

In [100]:
test_df['count_changes'].value_counts()

0    22196
1     7887
2      413
3       33
4        1
Name: count_changes, dtype: int64

In [101]:
# Fill missing values in the test frame with the imputers created for the
# training frame. Each imputer is re-fitted per column; with the "constant"
# strategy the fill value does not depend on the data it is fitted on.
for c in test_df.columns:
    # (n, 1) view for scikit-learn; flattened again before writing back.
    raw = test_df[c].values.reshape(-1, 1)
    if test_df[c].dtype == np.int64 or test_df[c].dtype == np.float64:
        test_df[c] = imputer.fit(raw).transform(raw).reshape(-1,)
    else:
        test_df[c] = str_imputer.fit(raw).transform(raw).reshape(-1,)

In [102]:
# diag_1_grouping_indices = test_df[test_df["diag_1"].str.startswith(('E', 'V'))].index
# test_df.loc[diag_1_grouping_indices, "diag_1"] = "1000"

# test_df['diag_1'] = test_df['diag_1'].astype(float)
# test_df['diag_1'] = test_df['diag_1'].apply(change_diagnosis)

In [103]:
# diag_2_grouping_indices = test_df[test_df["diag_2"].str.startswith(('E', 'V'))].index
# test_df.loc[diag_2_grouping_indices, "diag_2"] = "1000"

# test_df['diag_2'] = test_df['diag_2'].astype(float)
# test_df['diag_2'] = test_df['diag_2'].apply(change_diagnosis)

In [104]:
# diag_3_grouping_indices = test_df[test_df["diag_3"].str.startswith(('E', 'V'))].index
# test_df.loc[diag_3_grouping_indices, "diag_3"] = "1000"

# test_df['diag_3'] = test_df['diag_3'].astype(float)
# test_df['diag_3'] = test_df['diag_3'].apply(change_diagnosis)

In [105]:
# test_df['admission_source_id'] = test_df['admission_source_id'].astype(int)
# test_df['admission_source_id'] = test_df['admission_source_id'].apply(change_admission_source_id)

In [106]:
# test_df['admission_type_id'] = test_df['admission_type_id'].astype(int)
# test_df['admission_type_id'] = test_df['admission_type_id'].apply(change_admission_type_id)

In [107]:
# test_df['discharge_disposition_id'] = test_df['discharge_disposition_id'].astype(int)
# test_df['discharge_disposition_id'] = test_df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [108]:
test_df["age"] = label_encoder.transform(test_df["age"])


In [109]:
# Apply the one-hot encoding to the test frame, reusing the encoder `enc`
# that was fitted on the training code columns.
# Convert each column to pandas' category dtype so that `.cat.codes` is available.
for c in columns_to_encode:
    test_df[c] = test_df[c].astype('category')

# Store the integer category codes in "<column>_new" helper columns.
# NOTE(review): these codes are derived from the values present in test_df,
# independently of the training frame. They only line up with the codes `enc`
# was fitted on if every column holds exactly the same set of values in both
# frames -- verify, otherwise the same code can denote different categories.
new_cols = []
for c in columns_to_encode:
    test_df[c + "_new"] = test_df[c].cat.codes
    new_cols.append(c + "_new")
  
  
# Transform only (no re-fit); codes unseen during fitting are ignored because
# `enc` was created with handle_unknown='ignore'.
enc_data = pd.DataFrame(enc.transform(test_df[new_cols]).toarray()) 
  
# Join on the index, then drop the original categorical columns.
new_df_1 = enc_data.join(test_df)
new_df_1.drop(columns_to_encode, axis=1, inplace=True)

# Drop the "<column>_new" helper columns as well.
new_df_1.drop(new_cols, axis=1, inplace=True)

test_df = new_df_1

In [110]:
X_test_data = test_df.iloc[:, :].values

In [111]:
Y_pred = xgb_classifier.predict(X_test_data)
Y_pred


array([1, 2, 2, ..., 1, 1, 2], dtype=int64)

In [112]:
submit = pd.read_csv("../sample_submission.csv")
submit['readmission_id'] = Y_pred
submit.head()

Unnamed: 0,enc_id,readmission_id
0,86305392,1
1,394919696,2
2,164917446,2
3,178319040,1
4,253585416,1


In [113]:
submit.to_csv("submit_tmp.csv", index = False)

In [114]:
submit["readmission_id"].value_counts()

2    22375
1     8040
0      115
Name: readmission_id, dtype: int64

In [115]:
df["readmission_id"].value_counts()

2    38402
1    24881
0     7950
Name: readmission_id, dtype: int64