In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from pprint import pprint

## Loading the training dataset

In [72]:
df = pd.read_csv("./train.csv")
df.drop_duplicates(inplace=True)
df.drop(['payer_code', 'diag_1', 'diag_2', 'diag_3'], axis=1, inplace=True)
df.drop(['weight', 'max_glu_serum'], axis=1, inplace=True)

In [73]:
drugs_cols = ["metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton", "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone"]

def count_up(row):
    return sum([1 for col in drugs_cols if row[col] in ['Up']])

def count_down(row):
    return sum([1 for col in drugs_cols if row[col] in ['Down']])

def count_steady(row):
    return sum([1 for col in drugs_cols if row[col] in ['Steady']])

def count_no(row):
    return sum([1 for col in drugs_cols if row[col] in ['No']])

# Apply the function row-wise
df['count_up'] = df.apply(count_up, axis=1)
df['count_down'] = df.apply(count_down, axis=1)
df['count_steady'] = df.apply(count_steady, axis=1)
df['count_no'] = df.apply(count_no, axis=1)
df.drop(drugs_cols, axis=1, inplace=True)

In [74]:
def change_diagnosis(value):    
    if value >= 1 and value <= 139:
        return "D1"
    elif value <= 239:
        return "D2"
    elif value <= 279:
        return "D3"
    elif value <= 289:
        return "D4"
    elif value <= 319:
        return "D5"
    elif value <= 389:
        return "D6"
    elif value <= 459:
        return "D7"
    elif value <= 519:
        return "D8"
    elif value <= 579:
        return "D9"
    elif value <= 629:
        return "D9"
    elif value <= 679:
        return "D10"
    elif value <= 709:
        return "D11"
    elif value <= 739:
        return "D12"
    elif value <= 759:
        return "D13"
    elif value <= 779:
        return "D14"
    elif value <= 799:
        return "D15"
    elif value <= 999:
        return "D16"
    elif value == 1000:
        return "D17"
    else:
        return "D0"

In [75]:
# null_value_percentages=(df.isna().sum()/df.shape[0])*100
# null_value_percentages.sort_values()

In [76]:
df.shape

(71236, 25)

In [77]:
# Dropping A1Cresult
df.drop(['A1Cresult'], axis=1, inplace=True)
df.shape

(71236, 24)

In [78]:
# df = df_copy

In [79]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
str_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="0")
for c in df.columns:
    if df[c].dtype == np.int64 or df[c].dtype == np.float64:
        imputer.fit(df[c].values.reshape(-1, 1))
        df[c] = imputer.transform(df[c].values.reshape(-1, 1)).reshape(-1,)
    else:
        str_imputer.fit(df[c].values.reshape(-1, 1))
        df[c] = str_imputer.transform(df[c].values.reshape(-1, 1)).reshape(-1,)

In [80]:
# plt.figure(figsize=(10,10))
# dataplot = sns.heatmap(df.corr(numeric_only=True), annot=True)
# plt.show()

In [81]:
# Drop row with invalid gender. Dropping 1 row.
i = df[((df.gender == 'Unknown/Invalid'))].index
df.drop(i, inplace=True)

In [82]:
columns_to_encode = df.select_dtypes(exclude=['float']).columns.tolist()

columns_to_encode.remove("age")
columns_to_encode.remove("readmission_id")
columns_to_encode.remove("enc_id")
columns_to_encode.remove("count_up")
columns_to_encode.remove("count_down")
columns_to_encode.remove("count_steady")
columns_to_encode.remove("count_no")
columns_to_encode.remove("patient_id")
columns_to_encode.remove("number_emergency")
columns_to_encode.remove("number_inpatient")
columns_to_encode.remove("number_diagnoses")
columns_to_encode.remove("time_in_hospital")
columns_to_encode.remove("num_lab_procedures")
columns_to_encode.remove("num_procedures")
columns_to_encode.remove("num_medications")
columns_to_encode.remove("number_outpatient")

for i in columns_to_encode:
    print(i," ",df[i].unique().size)

# diag_1, diag_2 and diag_3 have many unique values, hence we are grouping

race   6
gender   2
admission_type_id   8
discharge_disposition_id   26
admission_source_id   17
medical_specialty   69
change   2
diabetesMed   2


In [83]:
df['f_patient_id'] = df['patient_id'].copy(deep=True)
cnt_dict = df['patient_id'].value_counts()
for i in df['patient_id']:
    idx = df[df['f_patient_id'] == i].index
    df.loc[idx, 'f_patient_id'] = cnt_dict[i]
df.drop(['patient_id'], axis=1, inplace=True)

In [84]:
def change_admission_source_id(value):
    if value in (1, 2, 3):
        return "Referral"
    elif value in (4, 5, 6, 10, 18, 22, 25, 26):
        return "Transfer"
    elif value in (11, 12, 13, 14, 23, 24):
        return "Pregnancy"
    elif value in (9, 15, 17, 20, 21, 0):
        return "NULL"
    else:
        return "Others"  # readmission (19), emergency (7), court/law enf (8)

# def change_admission_source_id(value):
#     if value == 1 or value == 7:
#         return "Cat1"
#     else:
#         return "Others"

In [85]:
def change_admission_type_id(value):
    if value in (1, 2, 7):
        return "Emergency"
    elif value == 3:
        return "Elective"
    elif value == 4:
        return "Newborn"
    elif value in (0, 5, 6, 8):
        return "NULL"
    else:
        return "Others"

# def change_admission_type_id(value):
#     if value == 1:
#         return "Emergency"
#     else:
#         return "Others"

In [86]:
def change_discharge_disposition_id(value):
    if value in (11, 19, 20, 21):
        return "Expired"
    else:
        return "Others"
    
# def change_discharge_disposition_id(value):
#     if value == 1:
#         return "Home_No_Treatment"
#     else:
#         return "Others"

In [87]:
# diag_1_grouping_indices = df[df["diag_1"].str.startswith(('E', 'V'))].index
# df.loc[diag_1_grouping_indices, "diag_1"] = "1000"
# df['diag_1'] = df['diag_1'].astype(float)
# df['diag_1'] = df['diag_1'].apply(change_diagnosis)

# diag_2_grouping_indices = df[df["diag_2"].str.startswith(('E', 'V'))].index
# df.loc[diag_2_grouping_indices, "diag_2"] = "1000"
# df['diag_2'] = df['diag_2'].astype(float)
# df['diag_2'] = df['diag_2'].apply(change_diagnosis)

# diag_3_grouping_indices = df[df["diag_3"].str.startswith(('E', 'V'))].index
# df.loc[diag_3_grouping_indices, "diag_3"] = "1000"
# df['diag_3'] = df['diag_3'].astype(float)
# df['diag_3'] = df['diag_3'].apply(change_diagnosis)

In [88]:
# df['admission_source_id'] = df['admission_source_id'].astype(int)
# df['admission_source_id'] = df['admission_source_id'].apply(change_admission_source_id)

In [89]:
# df['admission_type_id'] = df['admission_type_id'].astype(int)
# df['admission_type_id'] = df['admission_type_id'].apply(change_admission_type_id)

In [90]:
# df['discharge_disposition_id'] = df['discharge_disposition_id'].astype(int)
# df['discharge_disposition_id'] = df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [91]:
label_encoder = LabelEncoder()
df["age"] = label_encoder.fit_transform(df["age"])

In [92]:
df.set_index(pd.Index(range(0, df.shape[0])), inplace=True)

In [93]:
print(df.shape)
# Converting type of columns to category 
for c in columns_to_encode:
    df[c] = df[c].astype('category')
  
  
# Assigning numerical values and storing it in another columns 
new_cols = []
for c in columns_to_encode:
    df[c + "_new"] = df[c].cat.codes
    new_cols.append(c + "_new")
  
# Create an instance of One-hot-encoder 
enc = OneHotEncoder(handle_unknown='ignore') 
  
# Passing encoded columns 
enc_data = pd.DataFrame(enc.fit_transform(df[new_cols]).toarray()) 
  
# Merge with main 
new_df = enc_data.join(df)
new_df.drop(columns_to_encode, axis=1, inplace=True)

tmp_y = pd.DataFrame(new_df["readmission_id"])
new_df.drop(["readmission_id"], axis=1, inplace=True)
new_df.drop(new_cols, axis=1, inplace=True)
new_df = new_df.join(tmp_y)

df = new_df

(71233, 24)


In [94]:
df['readmission_id'].value_counts()

2    38402
1    24881
0     7950
Name: readmission_id, dtype: int64

In [95]:
df_0 = df.copy(deep=True)
df_0['readmission_id'] = np.where(df_0['readmission_id'] == 2, 1, df_0['readmission_id']) # 0/not 0 --> 0, 1

df_1 = df.copy(deep=True)
df_1['readmission_id'] = np.where(df_1['readmission_id'] == 2, 0, df_1['readmission_id']) # not 1/1 <--- 0/1

df_2 = df.copy(deep=True)
df_2['readmission_id'] = np.where(df_2['readmission_id'] == 1, 0, df_2['readmission_id']) # not 2/2 <--- 0/1
df_2['readmission_id'] = np.where(df_2['readmission_id'] == 2, 1, df_2['readmission_id']) # not 2/2 <--- 0/1

In [96]:
df_0['readmission_id'].value_counts()

1    63283
0     7950
Name: readmission_id, dtype: int64

In [97]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

X0 = df_0.iloc[:, :-1].values
Y0 = df_0.iloc[:, -1].values

X1 = df_1.iloc[:, :-1].values
Y1 = df_1.iloc[:, -1].values

X2 = df_2.iloc[:, :-1].values
Y2 = df_2.iloc[:, -1].values

In [98]:
X_train0, X_test0, Y_train0, Y_test0 = train_test_split(X0, Y0, test_size=0.2, random_state=0, stratify=Y0)
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, test_size=0.2, random_state=0, stratify=Y1)
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, test_size=0.2, random_state=0, stratify=Y2)

In [99]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

In [100]:
from lightgbm import LGBMClassifier

# Train a model using the scikit-learn API
lgb_classifier_0 = LGBMClassifier(n_estimators=300, max_depth=-1, random_state=0,  boosting_type='dart', class_weight={0: 30, 1: 20})
lgb_classifier_0.fit(X_train0, Y_train0)

[LightGBM] [Info] Number of positive: 50626, number of negative: 6360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 56986, number of used features: 99
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.841439 -> initscore=1.668972
[LightGBM] [Info] Start training from score 1.668972


In [101]:
# Train a model using the scikit-learn API
lgb_classifier_1 = LGBMClassifier(n_estimators=300, max_depth=-1, random_state=0, boosting_type='dart', class_weight={0: 20, 1: 40})
lgb_classifier_1.fit(X_train1, Y_train1)

[LightGBM] [Info] Number of positive: 19905, number of negative: 37081
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 767
[LightGBM] [Info] Number of data points in the train set: 56986, number of used features: 99
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.517746 -> initscore=0.071013
[LightGBM] [Info] Start training from score 0.071013


In [102]:
# Train a model using the scikit-learn API
lgb_classifier_2 = LGBMClassifier(n_estimators=300, max_depth=-1, random_state=0, boosting_type='dart', class_weight={0: 25, 1: 30})
lgb_classifier_2.fit(X_train2, Y_train2)

[LightGBM] [Info] Number of positive: 30721, number of negative: 26265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002858 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 767
[LightGBM] [Info] Number of data points in the train set: 56986, number of used features: 99
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.583955 -> initscore=0.339031
[LightGBM] [Info] Start training from score 0.339031


In [103]:
from sklearn.metrics import f1_score

In [104]:
Y_pred0 = lgb_classifier_0.predict(X_test0)
print(accuracy_score(Y_test0, Y_pred0), f1_score(Y_test0, Y_pred0))

0.8889590791043729 0.9408686551543693


In [105]:
Y_pred1 = lgb_classifier_1.predict(X_test1)
print(accuracy_score(Y_test1, Y_pred1), f1_score(Y_test1, Y_pred1))

0.7066750894925248 0.6252354048964218


In [106]:
Y_pred2 = lgb_classifier_2.predict(X_test2)
print(accuracy_score(Y_test2, Y_pred2), f1_score(Y_test2, Y_pred2))

0.7744086474345476 0.8109189316390164


In [107]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test0, Y_pred0)

array([[   79,  1511],
       [   71, 12586]])

In [108]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test1, Y_pred1)

array([[6582, 2689],
       [1490, 3486]])

In [109]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test2, Y_pred2)

array([[4141, 2425],
       [ 789, 6892]])

In [110]:
pr_0 = lgb_classifier_0.predict_proba(X_test0)
pr_1 = lgb_classifier_1.predict_proba(X_test1)
pr_2 = lgb_classifier_2.predict_proba(X_test2)

In [111]:
pr_0

array([[0.0558529 , 0.9441471 ],
       [0.58913401, 0.41086599],
       [0.06775426, 0.93224574],
       ...,
       [0.10112597, 0.89887403],
       [0.46048961, 0.53951039],
       [0.07178875, 0.92821125]])

In [112]:
pr_1

array([[0.62957205, 0.37042795],
       [0.9940533 , 0.0059467 ],
       [0.58535752, 0.41464248],
       ...,
       [0.79191073, 0.20808927],
       [0.57405652, 0.42594348],
       [0.78302012, 0.21697988]])

In [113]:
pr_2

array([[0.19780621, 0.80219379],
       [0.2753954 , 0.7246046 ],
       [0.1564956 , 0.8435044 ],
       ...,
       [0.27506597, 0.72493403],
       [0.39919268, 0.60080732],
       [0.2078547 , 0.7921453 ]])

In [114]:
Y_pred = np.copy(Y_test)
for i in range(len(pr_0)):
    a = pr_0[i][0]
    b = pr_1[i][1]
    c = pr_2[i][1]
    # b = 1 - a - c
    if a == max(a, b, c):
        Y_pred[i] = 0
    elif b == max(a, b, c):
        Y_pred[i] = 1
    else:
        Y_pred[i] = 2

In [115]:
print(accuracy_score(Y_test, Y_pred), f1_score(Y_test, Y_pred, average='weighted'))
confusion_matrix(Y_test, Y_pred)

0.4555344984909104 0.43531421024200867


array([[  51,  573,  966],
       [ 117, 1814, 3045],
       [ 186, 2870, 4625]])

# Testing

In [116]:
test_df = pd.read_csv("./test.csv")
test_df.drop(['payer_code', 'diag_1', 'diag_2', 'diag_3'], axis=1, inplace=True)

In [117]:
test_df.drop(['weight', 'max_glu_serum', 'A1Cresult'], axis=1, inplace=True)

In [118]:
# Apply the function row-wise
test_df['count_up'] = test_df.apply(count_up, axis=1)
test_df['count_down'] = test_df.apply(count_down, axis=1)
test_df['count_steady'] = test_df.apply(count_steady, axis=1)
test_df['count_no'] = test_df.apply(count_no, axis=1)
test_df.drop(drugs_cols, axis=1, inplace=True)

In [119]:
for c in test_df.columns:
    if test_df[c].dtype == np.int64 or test_df[c].dtype == np.float64:
        imputer.fit(test_df[c].values.reshape(-1, 1))
        test_df[c] = imputer.transform(test_df[c].values.reshape(-1, 1)).reshape(-1,)
    else:
        str_imputer.fit(test_df[c].values.reshape(-1, 1))
        test_df[c] = str_imputer.transform(test_df[c].values.reshape(-1, 1)).reshape(-1,)

In [120]:
# test_df['admission_source_id'] = test_df['admission_source_id'].astype(int)
# test_df['admission_source_id'] = test_df['admission_source_id'].apply(change_admission_source_id)

In [121]:
# test_df['admission_type_id'] = test_df['admission_type_id'].astype(int)
# test_df['admission_type_id'] = test_df['admission_type_id'].apply(change_admission_type_id)

In [122]:
# test_df['discharge_disposition_id'] = test_df['discharge_disposition_id'].astype(int)
# test_df['discharge_disposition_id'] = test_df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [123]:
# diag_1_grouping_indices = test_df[test_df["diag_1"].str.startswith(('E', 'V'))].index
# test_df.loc[diag_1_grouping_indices, "diag_1"] = "1000"
# test_df['diag_1'] = test_df['diag_1'].astype(float)
# test_df['diag_1'] = test_df['diag_1'].apply(change_diagnosis)

# diag_2_grouping_indices = test_df[test_df["diag_2"].str.startswith(('E', 'V'))].index
# test_df.loc[diag_2_grouping_indices, "diag_2"] = "1000"
# test_df['diag_2'] = test_df['diag_2'].astype(float)
# test_df['diag_2'] = test_df['diag_2'].apply(change_diagnosis)

# diag_3_grouping_indices = test_df[test_df["diag_3"].str.startswith(('E', 'V'))].index
# test_df.loc[diag_3_grouping_indices, "diag_3"] = "1000"
# test_df['diag_3'] = test_df['diag_3'].astype(float)
# test_df['diag_3'] = test_df['diag_3'].apply(change_diagnosis)

In [124]:
test_df["age"] = label_encoder.transform(test_df["age"])


In [125]:
cnt_dict_1 = test_df['patient_id'].value_counts()

In [126]:
test_df['f_patient_id'] = test_df['patient_id'].copy(deep=True)
for i in test_df['patient_id']:
    idx = test_df[test_df['f_patient_id'] == i].index
    if cnt_dict.get(i) != None and cnt_dict[i] != 0:
        test_df.loc[idx, 'f_patient_id'] = cnt_dict_1[i] + cnt_dict[i]
    else:
        test_df.loc[idx, 'f_patient_id'] = cnt_dict_1[i]

In [127]:
test_df.drop(['patient_id'], axis=1, inplace=True)

In [128]:
# Converting type of columns to category 
for c in columns_to_encode:
    test_df[c] = test_df[c].astype('category')

# Assigning numerical values and storing it in another columns 
new_cols = []
for c in columns_to_encode:
    test_df[c + "_new"] = test_df[c].cat.codes
    new_cols.append(c + "_new")
  
  
# Passing encoded columns 
enc_data = pd.DataFrame(enc.transform(test_df[new_cols]).toarray()) 
  
# Merge with main 
new_df_1 = enc_data.join(test_df)
new_df_1.drop(columns_to_encode, axis=1, inplace=True)

new_df_1.drop(new_cols, axis=1, inplace=True)

test_df = new_df_1

In [129]:
X_test_data = test_df.iloc[:, :].values

In [130]:
Y_pred0 = lgb_classifier_0.predict(X_test_data)
Y_pred0


array([1, 1, 1, ..., 1, 1, 1])

In [131]:
Y_pred1 = lgb_classifier_1.predict(X_test_data)
Y_pred1


array([1, 0, 1, ..., 1, 1, 1])

In [132]:
Y_pred2 = lgb_classifier_2.predict(X_test_data)
Y_pred2


array([0, 1, 0, ..., 0, 0, 0])

In [133]:
pr_0 = lgb_classifier_0.predict_proba(X_test_data)
pr_1 = lgb_classifier_1.predict_proba(X_test_data)
pr_2 = lgb_classifier_2.predict_proba(X_test_data)

In [134]:
submit = pd.read_csv("./sample_submission.csv")
# submit['readmission_id'] = Y_pred
submit.head()

Unnamed: 0,enc_id,readmission_id
0,86305392,2.0
1,394919696,1.0
2,164917446,0.0
3,178319040,2.0
4,253585416,0.0


In [135]:
Y_pred = submit['readmission_id']
for i in range(len(pr_0)):
    a = pr_0[i][0]
    b = pr_1[i][1]
    c = pr_2[i][1]
    # b = 1 - a - c
    if a == max(a, b, c):
        Y_pred[i] = 0
    elif b == max(a, b, c):
        Y_pred[i] = 1
    else:
        Y_pred[i] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_pred[i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_pred[i] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_pred[i] = 0


In [136]:
submit['readmission_id'] = Y_pred

In [137]:
submit.to_csv("submit_tmp.csv", index = False)

In [138]:
submit["readmission_id"].value_counts()

2.0    16868
1.0    13527
0.0      135
Name: readmission_id, dtype: int64

In [139]:
df["readmission_id"].value_counts()

2    38402
1    24881
0     7950
Name: readmission_id, dtype: int64