In [274]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from pprint import pprint

In [275]:
class outlierremoval:
	def __init__(self, col):
		q1 = col.quantile(0.25)
		q3 = col.quantile(0.75)
		inter_quartile_range=q3-q1
		self.upper_whisker=q3+inter_quartile_range*1.5
		self.lower_whisker=q1-inter_quartile_range*1.5
  
	def remove(self, row):
		if(row<=self.upper_whisker and row>=self.lower_whisker):
			return row
		elif row < self.lower_whisker:
			return None

## Loading the training dataset

In [276]:
df = pd.read_csv("../../train.csv")
df.drop(['payer_code', 'enc_id', 'patient_id'], axis=1, inplace=True)

In [277]:
df.drop_duplicates(inplace=True)

In [278]:
null_value_percentages=(df.isna().sum()/df.shape[0])*100

In [279]:
df.drop(['weight', 'max_glu_serum'], axis=1, inplace=True)

In [280]:
df['count_changes'] = 0

drugs_cols = ["metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton", "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone"]

def count_changes(row):
    return sum([1 for col in drugs_cols if row[col] in ['Up', 'Down']])

# Apply the function row-wise
df['count_changes'] = df.apply(count_changes, axis=1)
df.drop(drugs_cols, axis=1, inplace=True)

In [281]:
df['count_changes'].value_counts()

0    51867
1    18385
2      905
3       75
4        4
Name: count_changes, dtype: int64

In [282]:
df.shape

(71236, 23)

In [283]:
# Dropping A1Cresult
df.drop(['A1Cresult'], axis=1, inplace=True)
df.shape

(71236, 22)

In [284]:
outlier_removal_rows = ['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_diagnoses']

# Create a copy of the DataFrame to avoid modifying the original
df_copy = df.copy()

for attr in outlier_removal_rows:
    Q1 = df_copy[attr].quantile(0.25)
    Q3 = df_copy[attr].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
    # Create a boolean mask to identify outliers
    outlier_mask = (df_copy[attr] < lower) | (df_copy[attr] > upper)
    
    # Remove the rows with outliers
    df_copy = df_copy[~outlier_mask]
 
print("New Shape: ", df_copy.shape)

New Shape:  (64467, 22)


In [285]:
df = df_copy

In [286]:
df.iloc[1101, :]

race                        Caucasian
gender                           Male
age                           [80-90)
admission_type_id                   1
discharge_disposition_id            1
admission_source_id                 7
time_in_hospital                    3
medical_specialty                 NaN
num_lab_procedures                 12
num_procedures                      0
num_medications                    13
number_outpatient                   0
number_emergency                    2
number_inpatient                    3
diag_1                            428
diag_2                            427
diag_3                            401
number_diagnoses                    8
change                             Ch
diabetesMed                       Yes
readmission_id                      1
count_changes                       1
Name: 1211, dtype: object

In [287]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
str_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="0")
for c in df.columns:
    if df[c].dtype == np.int64 or df[c].dtype == np.float64:
        imputer.fit(df[c].values.reshape(-1, 1))
        df[c] = imputer.transform(df[c].values.reshape(-1, 1)).reshape(-1,)
    else:
        str_imputer.fit(df[c].values.reshape(-1, 1))
        df[c] = str_imputer.transform(df[c].values.reshape(-1, 1)).reshape(-1,)

In [288]:
# plt.figure(figsize=(10,10))
# dataplot = sns.heatmap(df.corr(numeric_only=True), annot=True)
# plt.show()

In [289]:
# Drop row with invalid gender. Dropping 1 row.
i = df[((df.gender == 'Unknown/Invalid'))].index
df.drop(i, inplace=True)

In [290]:
columns_to_encode = df.select_dtypes(exclude=['float']).columns.tolist()

columns_to_encode.remove("age")
columns_to_encode.remove("readmission_id")
# columns_to_encode.remove("time_in_hospital")
# columns_to_encode.remove("num_lab_procedures")
# columns_to_encode.remove("num_procedures")
# columns_to_encode.remove("num_medications")
columns_to_encode.remove("number_outpatient")
columns_to_encode.remove("number_emergency")
columns_to_encode.remove("number_inpatient")
# columns_to_encode.remove("number_diagnoses")
columns_to_encode.remove("count_changes")

for i in columns_to_encode:
    print(i," ",df[i].unique().size)

# diag_1, diag_2 and diag_3 have many unique values, hence we are grouping

race   6
gender   2
admission_type_id   8
discharge_disposition_id   26
admission_source_id   17
time_in_hospital   12
medical_specialty   68
num_lab_procedures   96
num_procedures   6
num_medications   32
diag_1   678
diag_2   685
diag_3   738
number_diagnoses   12
change   2
diabetesMed   2


In [291]:
def change_diagnosis(value):    
    if value >= 1 and value <= 139:
        return "D1"
    elif value <= 239:
        return "D2"
    elif value <= 279:
        return "D3"
    elif value <= 289:
        return "D4"
    elif value <= 319:
        return "D5"
    elif value <= 389:
        return "D6"
    elif value <= 459:
        return "D7"
    elif value <= 519:
        return "D8"
    elif value <= 579:
        return "D9"
    elif value <= 629:
        return "D9"
    elif value <= 679:
        return "D10"
    elif value <= 709:
        return "D11"
    elif value <= 739:
        return "D12"
    elif value <= 759:
        return "D13"
    elif value <= 779:
        return "D14"
    elif value <= 799:
        return "D15"
    elif value <= 999:
        return "D16"
    elif value == 1000:
        return "D17"
    else:
        return "D0"
    

In [292]:
type(df["admission_source_id"][0])

numpy.int64

In [293]:
def change_admission_source_id(value):
    if value in (1, 2, 3):
        return "Referral"
    elif value in (4, 5, 6, 10, 18, 22, 25, 26):
        return "Transfer"
    elif value in (11, 12, 13, 14, 23, 24):
        return "Pregnancy"
    elif value in (9, 15, 17, 20, 21, 0):
        return "NULL"
    else:
        return "Others"  # readmission (19), emergency (7), court/law enf (8)

In [294]:
def change_admission_type_id(value):
    if value in (1, 2, 7):
        return "Emergency"
    elif value == 3:
        return "Elective"
    elif value == 4:
        return "Newborn"
    elif value in (0, 5, 6, 8):
        return "NULL"
    else:
        return "Others"

In [295]:
def change_discharge_disposition_id(value):
    if value == 1:
        return "Home_No_Treatment"
    elif value in range(2, 6) or range(15, 18) or range(22, 25) or range(27, 31) or range(9, 11):
        return "Transfer"
    elif value in (6, 8):
        return "Home_Treatment"
    elif value in (11, 19, 20, 21):
        return "Expired"
    elif value in (18, 25, 26, 0):
        return "NULL"
    else:
        return "Others"

In [296]:
diag_1_grouping_indices = df[df["diag_1"].str.startswith(('E', 'V'))].index
df.loc[diag_1_grouping_indices, "diag_1"] = "1000"
df['diag_1'] = df['diag_1'].astype(float)
df['diag_1'] = df['diag_1'].apply(change_diagnosis)

In [297]:
diag_2_grouping_indices = df[df["diag_2"].str.startswith(('E', 'V'))].index
df.loc[diag_2_grouping_indices, "diag_2"] = "1000"
df['diag_2'] = df['diag_2'].astype(float)
df['diag_2'] = df['diag_2'].apply(change_diagnosis)

In [298]:
diag_3_grouping_indices = df[df["diag_3"].str.startswith(('E', 'V'))].index
df.loc[diag_3_grouping_indices, "diag_3"] = "1000"
df['diag_3'] = df['diag_3'].astype(float)
df['diag_3'] = df['diag_3'].apply(change_diagnosis)

In [299]:
df['admission_source_id'] = df['admission_source_id'].astype(int)
df['admission_source_id'] = df['admission_source_id'].apply(change_admission_source_id)

In [300]:
df['admission_type_id'] = df['admission_type_id'].astype(int)
df['admission_type_id'] = df['admission_type_id'].apply(change_admission_type_id)

In [301]:
df['discharge_disposition_id'] = df['discharge_disposition_id'].astype(int)
df['discharge_disposition_id'] = df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [302]:
label_encoder = LabelEncoder()
df["age"] = label_encoder.fit_transform(df["age"])

In [303]:
df.set_index(pd.Index(range(0, df.shape[0])), inplace=True)

In [304]:
print(df.shape)
# Converting type of columns to category 
for c in columns_to_encode:
    df[c] = df[c].astype('category')
  
  
# Assigning numerical values and storing it in another columns 
new_cols = []
for c in columns_to_encode:
    df[c + "_new"] = df[c].cat.codes
    new_cols.append(c + "_new")
  
# Create an instance of One-hot-encoder 
enc = OneHotEncoder(handle_unknown='ignore') 
  
# Passing encoded columns 
enc_data = pd.DataFrame(enc.fit_transform(df[new_cols]).toarray()) 
  
# Merge with main 
new_df = enc_data.join(df)
new_df.drop(columns_to_encode, axis=1, inplace=True)

tmp_y = pd.DataFrame(new_df["readmission_id"])
new_df.drop(["readmission_id"], axis=1, inplace=True)
new_df.drop(new_cols, axis=1, inplace=True)
new_df = new_df.join(tmp_y)

df = new_df

(64464, 22)


In [305]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [306]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

In [307]:
# knn_model = KNeighborsClassifier(n_neighbors=8)
# knn_model.fit(X_train, Y_train)

# Y_pred = knn_model.predict(X_test)
# print(accuracy_score(Y_pred, Y_test))

In [308]:
# decision_tree_model = DecisionTreeClassifier(random_state=42)
# decision_tree_model.fit(X_train, Y_train)
# Y_pred = decision_tree_model.predict(X_test)
# accuracy_score(Y_test, Y_pred)

In [309]:
# from sklearn.model_selection import RandomizedSearchCV
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['log2', 'sqrt', 'none']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in range(1, 13)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [x for x in range(2, 100, 5)]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [x for x in range(3, 15)]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]

# criterion = ['gini', 'entropy', 'log_loss']
# oob_score = [True, False]
# class_weight = ['balanced', 'balanced_subsample']
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap,
#                'criterion': criterion,
#                'oob_score': oob_score,
#                'class_weight': class_weight}
# pprint(random_grid)

In [310]:
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1, scoring='accuracy')
# # Fit the random search model
# rf_random.fit(X_train, Y_train)

In [311]:
# rf_random.best_params_

In [312]:
# def evaluate(model, test_features, test_labels):
#     predictions = model.predict(test_features)
#     errors = abs(predictions - test_labels)
#     mape = 100 * np.mean(errors / test_labels)
#     accuracy = 100 - mape
#     print('Model Performance')
#     print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
#     print('Accuracy = {:0.2f}%.'.format(accuracy))
    
#     return accuracy
# base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
# base_model.fit(X_train, Y_train)
# base_accuracy = evaluate(base_model, X_test, Y_test)

# best_random = rf_random.best_estimator_
# random_accuracy = evaluate(best_random, X_test, Y_test)

# print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [313]:
# from sklearn.model_selection import GridSearchCV
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }
# # Create a based model
# rf = RandomForestClassifier()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)

In [314]:
# # Fit the grid search to the data
# grid_search.fit(X_train, Y_train)
# grid_search.best_params_
# {'bootstrap': True,
#  'max_depth': 80,
#  'max_features': 3,
#  'min_samples_leaf': 5,
#  'min_samples_split': 12,
#  'n_estimators': 100}
# best_grid = grid_search.best_estimator_
# grid_accuracy = evaluate(best_grid, X_test, Y_test)

# print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

In [315]:
random_forest_model = RandomForestClassifier(oob_score=False,
 n_estimators=600,
 criterion='gini',
 class_weight='balanced',
 random_state=0)
random_forest_model.fit(X_train, Y_train)

Y_pred = random_forest_model.predict(X_test)
print(accuracy_score(Y_pred, Y_test))

0.5699992243853254


In [316]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_pred, Y_test)

array([[  14,   10,    5],
       [ 488, 1417,  985],
       [ 942, 3114, 5918]], dtype=int64)

# Testing

In [317]:
test_df = pd.read_csv("../../test.csv")
test_df.drop(['payer_code', 'enc_id', 'patient_id'], axis=1, inplace=True)

In [318]:
null_value_percentages=(test_df.isna().sum()/test_df.shape[0])*100
null_value_percentages.sort_values(ascending=False)

weight                      96.898133
max_glu_serum               94.677367
A1Cresult                   83.170652
medical_specialty           49.194235
race                         2.135604
diag_3                       1.421553
diag_2                       0.373403
diag_1                       0.019653
troglitazone                 0.000000
tolbutamide                  0.000000
pioglitazone                 0.000000
rosiglitazone                0.000000
acarbose                     0.000000
miglitol                     0.000000
tolazamide                   0.000000
glipizide                    0.000000
examide                      0.000000
citoglipton                  0.000000
insulin                      0.000000
glyburide-metformin          0.000000
glipizide-metformin          0.000000
glimepiride-pioglitazone     0.000000
metformin-rosiglitazone      0.000000
metformin-pioglitazone       0.000000
change                       0.000000
glyburide                    0.000000
nateglinide 

In [319]:
test_df.drop(['weight', 'max_glu_serum', 'A1Cresult'], axis=1, inplace=True)

In [320]:
test_df['count_changes'] = 0

# Apply the function row-wise
test_df['count_changes'] = test_df.apply(count_changes, axis=1)
test_df.drop(drugs_cols, axis=1, inplace=True)

In [321]:
test_df['count_changes'].value_counts()

0    22196
1     7887
2      413
3       33
4        1
Name: count_changes, dtype: int64

In [322]:
for c in test_df.columns:
    if test_df[c].dtype == np.int64 or test_df[c].dtype == np.float64:
        imputer.fit(test_df[c].values.reshape(-1, 1))
        test_df[c] = imputer.transform(test_df[c].values.reshape(-1, 1)).reshape(-1,)
    else:
        str_imputer.fit(test_df[c].values.reshape(-1, 1))
        test_df[c] = str_imputer.transform(test_df[c].values.reshape(-1, 1)).reshape(-1,)

In [323]:
diag_1_grouping_indices = test_df[test_df["diag_1"].str.startswith(('E', 'V'))].index
test_df.loc[diag_1_grouping_indices, "diag_1"] = "1000"

test_df['diag_1'] = test_df['diag_1'].astype(float)
test_df['diag_1'] = test_df['diag_1'].apply(change_diagnosis)

In [324]:
diag_2_grouping_indices = test_df[test_df["diag_2"].str.startswith(('E', 'V'))].index
test_df.loc[diag_2_grouping_indices, "diag_2"] = "1000"

test_df['diag_2'] = test_df['diag_2'].astype(float)
test_df['diag_2'] = test_df['diag_2'].apply(change_diagnosis)

In [325]:
diag_3_grouping_indices = test_df[test_df["diag_3"].str.startswith(('E', 'V'))].index
test_df.loc[diag_3_grouping_indices, "diag_3"] = "1000"

test_df['diag_3'] = test_df['diag_3'].astype(float)
test_df['diag_3'] = test_df['diag_3'].apply(change_diagnosis)

In [326]:
test_df['admission_source_id'] = test_df['admission_source_id'].astype(int)
test_df['admission_source_id'] = test_df['admission_source_id'].apply(change_admission_source_id)

In [327]:
test_df['admission_type_id'] = test_df['admission_type_id'].astype(int)
test_df['admission_type_id'] = test_df['admission_type_id'].apply(change_admission_type_id)

In [328]:
test_df['discharge_disposition_id'] = test_df['discharge_disposition_id'].astype(int)
test_df['discharge_disposition_id'] = test_df['discharge_disposition_id'].apply(change_discharge_disposition_id)

In [329]:
test_df["age"] = label_encoder.transform(test_df["age"])


In [330]:
# Converting type of columns to category 
for c in columns_to_encode:
    test_df[c] = test_df[c].astype('category')

# Assigning numerical values and storing it in another columns 
new_cols = []
for c in columns_to_encode:
    test_df[c + "_new"] = test_df[c].cat.codes
    new_cols.append(c + "_new")
  
  
# Passing encoded columns 
enc_data = pd.DataFrame(enc.transform(test_df[new_cols]).toarray()) 
  
# Merge with main 
new_df_1 = enc_data.join(test_df)
new_df_1.drop(columns_to_encode, axis=1, inplace=True)

new_df_1.drop(new_cols, axis=1, inplace=True)

test_df = new_df_1

In [331]:
X_test_data = test_df.iloc[:, :].values

In [332]:
Y_pred = random_forest_model.predict(X_test_data)
Y_pred


array([1, 2, 2, ..., 1, 1, 2], dtype=int64)

In [333]:
submit = pd.read_csv("../sample_submission.csv")
submit['readmission_id'] = Y_pred
submit.head()

Unnamed: 0,enc_id,readmission_id
0,86305392,1
1,394919696,2
2,164917446,2
3,178319040,2
4,253585416,2


In [334]:
submit.to_csv("submit_tmp.csv", index = False)

In [335]:
submit["readmission_id"].value_counts()

2    23977
1     6514
0       39
Name: readmission_id, dtype: int64

In [336]:
df["readmission_id"].value_counts()

2    34540
1    22702
0     7222
Name: readmission_id, dtype: int64