In [165]:
!pip3 install lightgbm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from pprint import pprint



## Loading the training dataset

In [166]:
# Read the training table and discard, at load time, the payer-code and
# diagnosis-code columns that the rest of the notebook does not use.
dropped_at_load = ['payer_code', 'diag_1', 'diag_2', 'diag_3']
df = pd.read_csv("../train.csv").drop(columns=dropped_at_load)

In [167]:
df.drop_duplicates(inplace=True)

In [168]:
df.drop(['weight', 'max_glu_serum'], axis=1, inplace=True)

In [169]:
df['count_changes'] = 0

# Per-drug columns whose values are inspected for dosage changes.
drugs_cols = [
    "metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride",
    "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone",
    "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide",
    "examide", "citoglipton", "insulin", "glyburide-metformin",
    "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone",
    "metformin-pioglitazone",
]

def count_changes(row):
    """Return how many of the ``drugs_cols`` entries in ``row`` are 'Up' or 'Down'.

    ``row`` is anything indexable by column name, e.g. the Series that
    ``DataFrame.apply(..., axis=1)`` passes for each record.
    """
    changed = 0
    for col in drugs_cols:
        if row[col] in ['Up', 'Down']:
            changed += 1
    return changed

# Count, for every row, how many drug columns hold 'Up' or 'Down'.
# This is the vectorised equivalent of df.apply(count_changes, axis=1): isin()
# marks the matching cells and the row-wise sum counts them, without calling a
# Python function once per row.
df['count_changes'] = df[drugs_cols].isin(['Up', 'Down']).sum(axis=1)
# The per-drug columns are replaced by the aggregate count.
df.drop(drugs_cols, axis=1, inplace=True)

In [170]:
df['count_changes'].value_counts()

0    51867
1    18385
2      905
3       75
4        4
Name: count_changes, dtype: int64

In [171]:
df.shape

(71236, 22)

In [172]:
# Dropping A1Cresult
df.drop(['A1Cresult'], axis=1, inplace=True)
df.shape

(71236, 21)

In [173]:
# outlier_removal_rows = ['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_diagnoses']

# # Create a copy of the DataFrame to avoid modifying the original
# df_copy = df.copy()

# for attr in outlier_removal_rows:
#     Q1 = df_copy[attr].quantile(0.25)
#     Q3 = df_copy[attr].quantile(0.75)
#     IQR = Q3 - Q1
#     lower = Q1 - 1.5 * IQR
#     upper = Q3 + 1.5 * IQR
    
#     # Create a boolean mask to identify outliers
#     outlier_mask = (df_copy[attr] < lower) | (df_copy[attr] > upper)
    
#     # Remove the rows with outliers
#     df_copy = df_copy[~outlier_mask]
 
# print("New Shape: ", df_copy.shape)

In [174]:
# df = df_copy

In [175]:
df.iloc[1101, :]

enc_id                            111133884
patient_id                         35624655
race                        AfricanAmerican
gender                               Female
age                                 [40-50)
admission_type_id                         1
discharge_disposition_id                  3
admission_source_id                       7
time_in_hospital                         12
medical_specialty                Nephrology
num_lab_procedures                       63
num_procedures                            3
num_medications                          23
number_outpatient                         0
number_emergency                          0
number_inpatient                          6
number_diagnoses                          9
change                                   No
diabetesMed                             Yes
readmission_id                            1
count_changes                             0
Name: 1101, dtype: object

In [176]:
from sklearn.impute import SimpleImputer

# Two constant-fill imputers: NaNs in int64/float64 columns become 0, NaNs in
# every other column become the string "0".
imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
str_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="0")

for col in df.columns:
    # SimpleImputer works on 2-D input, so present the column as a one-feature matrix.
    column_2d = df[col].values.reshape(-1, 1)
    is_numeric = df[col].dtype == np.int64 or df[col].dtype == np.float64
    chosen = imputer if is_numeric else str_imputer
    chosen.fit(column_2d)
    # Flatten the (n, 1) result back to one value per row.
    df[col] = chosen.transform(column_2d).reshape(-1,)

In [177]:
# plt.figure(figsize=(10,10))
# dataplot = sns.heatmap(df.corr(numeric_only=True), annot=True)
# plt.show()

In [178]:
# Drop the rows whose gender is recorded as 'Unknown/Invalid'.
# In this run that removes 3 rows (71236 -> 71233, per the row counts displayed
# before and after this cell), not 1.
# `i` holds the index labels of the matching rows; dropping them leaves gaps in
# df's index.
i = df[((df.gender == 'Unknown/Invalid'))].index
df.drop(i, inplace=True)

In [179]:
# Start from every non-float column, then take out the ones that are not
# one-hot encoded: the target, the identifiers, age and the count features.
columns_to_encode = df.select_dtypes(exclude=['float']).columns.tolist()

not_one_hot_encoded = (
    "age",
    "readmission_id",
    "enc_id",
    "count_changes",
    "patient_id",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
)
for name in not_one_hot_encoded:
    # list.remove raises ValueError when the name is absent, so a missing or
    # float-typed column is reported here rather than silently ignored.
    columns_to_encode.remove(name)

# Print the number of distinct values in each column that will be encoded.
for name in columns_to_encode:
    print(name, " ", df[name].unique().size)

# diag_1, diag_2 and diag_3 were dropped when the data was loaded, so they do not appear in this list.

race   6
gender   2
admission_type_id   8
discharge_disposition_id   26
admission_source_id   17
medical_specialty   69
change   2
diabetesMed   2


In [180]:
df['f_patient_id'] = df['patient_id'].copy(deep=True)

In [181]:
df['f_patient_id']

0          2488608
1         52133202
2         40945509
3         38850777
4         72738225
           ...    
71231     85063725
71232     86244345
71233      5131368
71234     85969035
71235    112239351
Name: f_patient_id, Length: 71233, dtype: int64

In [182]:
df[df['f_patient_id'] == 43140906].index

Int64Index([ 7468, 12247, 12273, 18598, 21621, 29956, 32767, 33751, 35637,
            36301, 36646, 45948, 48062, 50741, 52466, 55884, 58997, 60464,
            65103, 68773, 70842],
           dtype='int64')

In [183]:
# Number of rows per patient_id in the training frame. Despite the name this is
# a pandas Series (index: patient_id, value: row count), not a dict.
cnt_dict = df['patient_id'].value_counts()

In [184]:
# Frequency-encode the patient: f_patient_id becomes the number of training rows
# that share the row's patient_id.
# Series.map looks every id up in cnt_dict in a single vectorised pass. The
# previous loop rescanned the whole frame once per row (quadratic) and matched on
# f_patient_id while overwriting it, so an already-written count could be
# mistaken for a patient id.
df['f_patient_id'] = df['patient_id'].map(cnt_dict)

In [185]:
df['f_patient_id']

0        1
1        2
2        8
3        1
4        1
        ..
71231    3
71232    1
71233    1
71234    1
71235    1
Name: f_patient_id, Length: 71233, dtype: int64

In [186]:
df.drop(['patient_id'], axis=1, inplace=True)

In [187]:
def change_admission_source_id(value):
    """Collapse a numeric admission-source id into a coarse text label.

    Returns "Referral", "Transfer", "Pregnancy" or "NULL" for the ids listed
    below, and "Others" for every id that is not listed.
    """
    groups = (
        ("Referral", (1, 2, 3)),
        ("Transfer", (4, 5, 6, 10, 18, 22, 25, 26)),
        ("Pregnancy", (11, 12, 13, 14, 23, 24)),
        ("NULL", (9, 15, 17, 20, 21, 0)),
    )
    for label, ids in groups:
        if value in ids:
            return label
    return "Others"  # readmission (19), emergency (7), court/law enf (8)

In [188]:
def change_admission_type_id(value):
    """Collapse a numeric admission-type id into a coarse text label.

    Returns "Emergency", "Elective", "Newborn" or "NULL" for the ids listed
    below, and "Others" for every id that is not listed.
    """
    groups = (
        ("Emergency", (1, 2, 7)),
        ("Elective", (3,)),
        ("Newborn", (4,)),
        ("NULL", (0, 5, 6, 8)),
    )
    for label, ids in groups:
        if value in ids:
            return label
    return "Others"

In [189]:
# Disposition ids grouped as "Transfer": 2-5, 9-10, 15-17, 22-24 and 27-30.
_TRANSFER_DISPOSITION_IDS = (
    *range(2, 6),
    *range(9, 11),
    *range(15, 18),
    *range(22, 25),
    *range(27, 31),
)


def change_discharge_disposition_id(value):
    """Collapse a numeric discharge-disposition id into a coarse text label.

    Returns "Home_No_Treatment", "Transfer", "Home_Treatment", "Expired" or
    "NULL" for the ids listed below, and "Others" for every id that is not listed.

    The membership test is done against one combined collection of ids.
    Chaining ranges with ``or`` (``value in range(2, 6) or range(15, 18) ...``)
    is always truthy because a non-empty range is truthy, which made every id
    other than 1 come back as "Transfer".
    """
    if value == 1:
        return "Home_No_Treatment"
    elif value in _TRANSFER_DISPOSITION_IDS:
        return "Transfer"
    elif value in (6, 8):
        return "Home_Treatment"
    elif value in (11, 19, 20, 21):
        return "Expired"
    elif value in (18, 25, 26, 0):
        return "NULL"
    else:
        return "Others"

In [190]:
# df['admission_source_id'] = df['admission_source_id'].astype(int)
# df['admission_source_id'] = df['admission_source_id'].apply(change_admission_source_id)

In [191]:
# df['admission_type_id'] = df['admission_type_id'].astype(int)
# df['admission_type_id'] = df['admission_type_id'].apply(change_admission_type_id)

In [192]:
# df['discharge_disposition_id'] = df['discharge_disposition_id'].astype(int)
# df['discharge_disposition_id'] = df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [193]:
# Turn the age values into integer codes. The fitted encoder stays in
# `label_encoder` so the same mapping can be applied to other frames.
label_encoder = LabelEncoder()
label_encoder.fit(df["age"])
df["age"] = label_encoder.transform(df["age"])

In [194]:
# Renumber the rows 0..n-1, discarding the old index labels.
df.reset_index(drop=True, inplace=True)

In [195]:
# One-hot encode the columns listed in columns_to_encode and rebuild df as
# [one-hot columns | remaining feature columns | readmission_id].
print(df.shape)
# Converting type of columns to category
for c in columns_to_encode:
    df[c] = df[c].astype('category')


# Assigning numerical values and storing it in another columns
# (.cat.codes are integer positions within each column's own category list)
new_cols = []
for c in columns_to_encode:
    df[c + "_new"] = df[c].cat.codes
    new_cols.append(c + "_new")

# Create an instance of One-hot-encoder
# The encoder is fitted on the integer codes, not on the original labels.
# NOTE(review): the label -> code mapping of each column is not stored anywhere
# (the categorical columns are dropped below). Another frame encoded through its
# own .cat.codes only lines up with `enc` if it has exactly the same set of
# distinct values in every encoded column.
enc = OneHotEncoder(handle_unknown='ignore')

# Passing encoded columns
enc_data = pd.DataFrame(enc.fit_transform(df[new_cols]).toarray())

# Merge with main
# join() aligns on the index: enc_data is created with a default index here, so
# df needs matching row labels for the rows to pair up.
new_df = enc_data.join(df)
new_df.drop(columns_to_encode, axis=1, inplace=True)

# Pull readmission_id out and re-attach it after the helper "_new" columns are
# removed, which leaves it as the last column of the frame.
tmp_y = pd.DataFrame(new_df["readmission_id"])
new_df.drop(["readmission_id"], axis=1, inplace=True)
new_df.drop(new_cols, axis=1, inplace=True)
new_df = new_df.join(tmp_y)

# From here on `df` is the fully encoded training frame.
df = new_df

(71233, 21)


In [196]:
# Every column except the last is a feature; the last column is the target.
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features].values
Y = df.iloc[:, n_features].values

In [197]:
# Hold out 20% of the rows for evaluation, stratified on Y, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)

In [198]:
from lightgbm import LGBMClassifier

# Fit a multiclass LightGBM model (DART boosting, 300 trees, fixed seed) through
# its scikit-learn style interface.
lgb_classifier = LGBMClassifier(
    objective='multiclass',
    boosting_type='dart',
    n_estimators=300,
    max_depth=-1,
    random_state=0,
)
lgb_classifier.fit(X_train, Y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 753
[LightGBM] [Info] Number of data points in the train set: 56986, number of used features: 98
[LightGBM] [Info] Start training from score -2.192777
[LightGBM] [Info] Start training from score -1.051835
[LightGBM] [Info] Start training from score -0.617859


In [199]:
from sklearn.metrics import f1_score
# Score the held-out split: share of rows whose predicted class equals Y_test.
Y_pred = lgb_classifier.predict(X_test)
holdout_accuracy = accuracy_score(Y_test, Y_pred)
print(holdout_accuracy)

0.6949533235067031


In [200]:
# df

In [201]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix(Y_pred, Y_test)

# Testing

In [202]:
# Read the test table and discard the same payer-code and diagnosis-code columns
# that were removed from the training table.
test_dropped_at_load = ['payer_code', 'diag_1', 'diag_2', 'diag_3']
test_df = pd.read_csv("../test.csv").drop(columns=test_dropped_at_load)

In [203]:
test_df.drop(['weight', 'max_glu_serum', 'A1Cresult'], axis=1, inplace=True)

In [204]:
# Count, for every test row, how many drug columns hold 'Up' or 'Down'.
# This gives the same values as test_df.apply(count_changes, axis=1), computed in
# one vectorised pass. The former zero-initialisation is dropped because the
# column was overwritten immediately afterwards.
test_df['count_changes'] = test_df[drugs_cols].isin(['Up', 'Down']).sum(axis=1)
# The per-drug columns are replaced by the aggregate count.
test_df.drop(drugs_cols, axis=1, inplace=True)

In [205]:
test_df['count_changes'].value_counts()

0    22196
1     7887
2      413
3       33
4        1
Name: count_changes, dtype: int64

In [206]:
# Fill missing values in the test frame with the imputers created for the
# training frame: `imputer` for int64/float64 columns, `str_imputer` for the rest.
for col in test_df.columns:
    # SimpleImputer works on 2-D input, so present the column as a one-feature matrix.
    column_2d = test_df[col].values.reshape(-1, 1)
    is_numeric = test_df[col].dtype == np.int64 or test_df[col].dtype == np.float64
    chosen = imputer if is_numeric else str_imputer
    chosen.fit(column_2d)
    # Flatten the (n, 1) result back to one value per row.
    test_df[col] = chosen.transform(column_2d).reshape(-1,)

In [207]:
test_df["age"] = label_encoder.transform(test_df["age"])


In [208]:
# test_df.set_index(test_df['patient_id'], inplace=True)
cnt_dict_1 = test_df['patient_id'].value_counts()

In [209]:
# Frequency-encode the patient for the test rows: the number of test rows with
# the same patient_id plus, when the patient also occurs in the training data,
# the number of training rows (cnt_dict). Patients absent from cnt_dict map to
# NaN and therefore contribute 0.
# Done with vectorised lookups instead of rescanning the whole frame once per
# row, which also avoids matching on a column while it is being overwritten.
# NOTE(review): the training feature counts training rows only, while this one
# adds train + test rows, so the two distributions differ — confirm this is intended.
test_occurrences = test_df['patient_id'].map(cnt_dict_1)
train_occurrences = test_df['patient_id'].map(cnt_dict).fillna(0).astype('int64')
test_df['f_patient_id'] = test_occurrences + train_occurrences

In [210]:
test_df.drop(['patient_id'], axis=1, inplace=True)

In [211]:
# Converting type of columns to category 
for c in columns_to_encode:
    test_df[c] = test_df[c].astype('category')

# Assigning numerical values and storing it in another columns 
new_cols = []
for c in columns_to_encode:
    test_df[c + "_new"] = test_df[c].cat.codes
    new_cols.append(c + "_new")
  
  
# Passing encoded columns 
enc_data = pd.DataFrame(enc.transform(test_df[new_cols]).toarray()) 
  
# Merge with main 
new_df_1 = enc_data.join(test_df)
new_df_1.drop(columns_to_encode, axis=1, inplace=True)

new_df_1.drop(new_cols, axis=1, inplace=True)

test_df = new_df_1

In [212]:
X_test_data = test_df.iloc[:, :].values

In [213]:
Y_pred = lgb_classifier.predict(X_test_data)
Y_pred


array([1, 2, 1, ..., 1, 1, 1])

In [214]:
# Use the sample submission as a template and overwrite its prediction column.
submit = pd.read_csv("../sample_submission.csv")
# Y_pred is a plain array, so this assignment is positional.
# NOTE(review): assumes sample_submission.csv lists its rows in the same order as
# test.csv — confirm against enc_id.
submit['readmission_id'] = Y_pred
submit.head()

Unnamed: 0,enc_id,readmission_id
0,86305392,1
1,394919696,2
2,164917446,1
3,178319040,1
4,253585416,1


In [215]:
submit.to_csv("submit_tmp.csv", index = False)

In [216]:
submit["readmission_id"].value_counts()

2    18103
1    12144
0      283
Name: readmission_id, dtype: int64

In [217]:
df["readmission_id"].value_counts()

2    38402
1    24881
0     7950
Name: readmission_id, dtype: int64