In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of the training CSV inside the Kaggle input mount.
TRAIN_CSV_PATH = '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv'
# Load the training table into `df`.
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct labels in the 'Stay' column.
df['Stay'].nunique()

In [None]:
# The distinct 'Stay' labels themselves.
df['Stay'].unique()

In [None]:
# Distribution of the 'Stay' buckets.
# The buckets are ordinal, so the bars are laid out in a fixed order instead of
# the order in which labels happen to appear in the data. A plain lexicographic
# sort is enough here because it coincides with the bucket order
# ('0-10' < '11-20' < ... < '91-100' < 'More than 100 Days').
plt.figure(figsize=(15,6))
stay_order = sorted(df['Stay'].dropna().unique())
sns.countplot(x='Stay', data=df, order=stay_order)

# DROPPING SOME UNNECESSARY COLUMNS

In [None]:
# Remove the case identifier and the hospital code from the frame, in place.
df.drop(columns=['case_id', 'Hospital_code'], inplace=True)

# Write a function that creates another column, 'Stay_cat', numbering the stay buckets from 0 to 10 as integers; then use df.corr()

In [None]:
def stay_cat_encoder(x):
    """Map a 'Stay' bucket label to its ordinal position.

    Returns 0 for '0-10' up to 10 for 'More than 100 Days'. A value that
    matches none of the known labels yields None.
    """
    ordered_labels = (
        '0-10', '11-20', '21-30', '31-40', '41-50', '51-60',
        '61-70', '71-80', '81-90', '91-100', 'More than 100 Days',
    )
    # Plain equality scan: the position in the tuple is the encoded value.
    for code, label in enumerate(ordered_labels):
        if x == label:
            return code
    return None

In [None]:
# Encode each 'Stay' label element-wise into the new 'Stay_cat' column.
df['Stay_cat'] = df['Stay'].map(stay_cat_encoder)

In [None]:
# Correlation of every numeric column with the encoded target, ascending.
# - Restricting to numeric dtypes first keeps this working on pandas >= 2.0,
#   where DataFrame.corr() raises on object columns instead of skipping them.
# - 'Stay_cat' is removed by label rather than by slicing off the last sorted
#   entry, so its self-correlation is dropped even if another entry (e.g. a NaN
#   correlation) sorts after it.
df.select_dtypes(include='number').corr()['Stay_cat'].drop('Stay_cat').sort_values()

In [None]:
# Preview the frame, which now carries the 'Stay_cat' column.
df.head()

# Let's get rid of unnecessary columns:
# City_Code_Patient, City_Code_Hospital, patientid

In [None]:
# Remove the two city codes and the patient identifier from the frame, in place.
df.drop(columns=['City_Code_Hospital', 'patientid', 'City_Code_Patient'], inplace=True)

In [None]:
# Preview the frame after the column drops.
df.head()

In [None]:
# Number of distinct hospital type codes.
df['Hospital_type_code'].nunique()

In [None]:
# The distinct hospital type codes.
df['Hospital_type_code'].unique()

In [None]:
# The distinct hospital region codes.
df['Hospital_region_code'].unique()

In [None]:
# Missing-value count per column.
df.isnull().sum()

# Bed Grade has some missing values, so let's fill them. Since less than 0.4% of the rows are missing, imputing with the median or mean should not affect accuracy much.

In [None]:
# Percentage of rows with a missing 'Bed Grade', computed from the data itself:
# the mean of the boolean null-mask is the missing fraction over ALL rows
# (no hard-coded missing count, and the denominator is the total row count).
df['Bed Grade'].isnull().mean() * 100

In [None]:
# Median of 'Bed Grade' (pandas skips NaN values by default).
df['Bed Grade'].median()

# we'll go with median

In [None]:
def imputer_Bed_Grade(x, fill_value=3):
    """Return `x` unchanged, or `fill_value` when `x` is missing.

    Parameters
    ----------
    x : scalar
        A single 'Bed Grade' value; missing means anything `pd.isnull`
        reports as null (NaN, None, NaT).
    fill_value : scalar, default 3
        Replacement for a missing `x`. The default keeps the value the
        notebook originally hard-coded, so one-argument calls behave as before.

    Returns
    -------
    scalar
        `fill_value` if `x` is null, otherwise `x` itself.
    """
    return fill_value if pd.isnull(x) else x

In [None]:
# Fill missing 'Bed Grade' values with the column median computed from the data,
# so the imputed value cannot drift from the actual median if the input changes.
bed_grade_median = df['Bed Grade'].median()
df['Bed Grade'] = df['Bed Grade'].fillna(bed_grade_median)

In [None]:
# Re-check the missing-value count per column.
df.isnull().sum()

In [None]:
# Distinct 'Bed Grade' values now present.
df['Bed Grade'].unique()

# MISSING DATA HAS BEEN TAKEN CARE OF

In [None]:
# Number of distinct 'Age' labels.
df['Age'].nunique()

In [None]:
# The distinct 'Age' labels themselves.
df['Age'].unique()

In [None]:
def Age_cat_encoder(x):
    """Map an 'Age' bucket label to its 1-based ordinal position.

    Returns 1 for '0-10' up to 10 for '91-100'. A value that matches none of
    the known labels yields None.
    """
    ordered_labels = (
        '0-10', '11-20', '21-30', '31-40', '41-50',
        '51-60', '61-70', '71-80', '81-90', '91-100',
    )
    # Plain equality scan; codes start at 1, unlike the 0-based stay encoder.
    for code, label in enumerate(ordered_labels, start=1):
        if x == label:
            return code
    return None

In [None]:
# Encode each 'Age' label element-wise into the new 'Age_cat' column.
df['Age_cat'] = df['Age'].map(Age_cat_encoder)

In [None]:
# Correlations with the encoded target again, now including 'Age_cat'.
# Only numeric columns are correlated (pandas >= 2.0 raises on object columns
# in DataFrame.corr()), and the target's self-correlation is removed by label
# instead of by position in the sorted result.
df.select_dtypes(include='number').corr()['Stay_cat'].drop('Stay_cat').sort_values()

In [None]:
# Names of the columns stored as object dtype.
df.select_dtypes(include=['object']).columns

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Instantiate a LabelEncoder.
le = LabelEncoder()

In [None]:
# Work on a copy so later changes to `df3` do not alter `df`.
df3 = df.copy()

In [None]:
# Remove the raw 'Age' label column from the working copy, in place.
df3.drop(columns=['Age'], inplace=True)

In [None]:
# Names of the columns in `df3` stored as object dtype.
df3.select_dtypes(include=['object']).columns

# NOW LET'S DO LABEL ENCODING OF ALL THESE OBJECT COLUMNS

In [None]:
# Label-encode the categorical feature columns of `df3`.
# Each column gets its OWN fitted encoder, kept in `encoders`, so the
# label <-> integer mapping of every column stays available afterwards
# (e.g. encoders['Department'].classes_ or .inverse_transform). Re-fitting one
# shared encoder would retain only the mapping of the last column.
# NOTE(review): LabelEncoder assigns codes in sorted label order, which may not
# match the natural order of an ordinal column such as 'Severity of Illness';
# confirm whether an explicit ordering is wanted there.
categorical_cols = [
    'Hospital_type_code',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Type of Admission',
    'Severity of Illness',
]
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df3[col] = encoders[col].fit_transform(df3[col])

# Keep `le` bound to the encoder fitted on the last column, as it was before.
le = encoders[categorical_cols[-1]]

In [None]:
# Column dtypes and non-null counts of the working copy.
df3.info()

In [None]:
# Correlations of the numeric columns of `df3` with the encoded target.
# `df3` still holds the string 'Stay' column, which makes DataFrame.corr()
# raise on pandas >= 2.0, so only numeric columns are correlated. The target's
# self-correlation is removed by label instead of by sorted position.
df3.select_dtypes(include='number').corr()['Stay_cat'].drop('Stay_cat').sort_values()

In [None]:
# Features: every column except the raw target and its ordinal encoding.
target_related = ['Stay_cat', 'Stay']
X = df3.drop(columns=target_related)
# Target: the original string 'Stay' labels.
y = df3['Stay']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Scaler used for the feature matrices below.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Shape of the scaled training matrix.
X_train.shape

# 1) LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with an intercept term; n_jobs=-1 requests all CPU cores.
lr = LogisticRegression(fit_intercept=True,n_jobs=-1)

In [None]:
# Train on the scaled training split.
lr.fit(X_train,y_train)

In [None]:
# Predicted 'Stay' labels for the test split.
lr_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Confusion matrix first, then the per-class precision/recall/F1 report.
lr_confusion = confusion_matrix(y_test, lr_pred)
print(lr_confusion)
print('\n')
lr_report = classification_report(y_test, lr_pred)
print(lr_report)

# LR accuracy = 38%; we can't use this model, as the F1 score could not be calculated properly for it

# 2) RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 100-tree random forest.
# random_state pins the bootstrap/feature sampling so the fitted model and its
# score are reproducible across runs (same seed as the train/test split);
# n_jobs=-1 uses all CPU cores, as the other estimators in this notebook do.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

In [None]:
# Train the forest on the scaled training split.
rf.fit(X_train,y_train)

In [None]:
# Predicted 'Stay' labels for the test split.
rf_pred = rf.predict(X_test)

In [None]:
# Per-class precision/recall/F1 report first, then the confusion matrix.
rf_report = classification_report(y_test, rf_pred)
print(rf_report)
print('\n')
rf_confusion = confusion_matrix(y_test, rf_pred)
print(rf_confusion)

# RF accuracy = 35% with n_estimators=100 (it stays the same even with n_estimators=500)

# 3) KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier with k=100; n_jobs=-1 requests all CPU cores.
knn = KNeighborsClassifier(n_neighbors=100,n_jobs=-1)

In [None]:
# Fit on the scaled training split.
knn.fit(X_train,y_train)

In [None]:
# Predict the test split, then print the per-class report followed by the
# confusion matrix.
knn_pred = knn.predict(X_test)
knn_report = classification_report(y_test, knn_pred)
print(knn_report)
print('\n')
knn_confusion = confusion_matrix(y_test, knn_pred)
print(knn_confusion)

# KNN ACCURACY = 38% with k=100

# 4) CATBOOST

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# With use_best_model=True CatBoost keeps the iteration that scores best on
# eval_set. Using the test split for that would tune the model on the very data
# it is scored on afterwards and make the reported test metrics optimistic, so
# a separate validation split is carved out of the training data instead.
# The test split stays untouched until prediction.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

cb = CatBoostClassifier(random_state=42, use_best_model=True, iterations=1000)
# verbose=100 logs progress every 100 iterations.
cb.fit(X_fit, y_fit, use_best_model=True, verbose=100, eval_set=(X_val, y_val))

In [None]:
# Predict the test split, then print the per-class report followed by the
# confusion matrix.
cb_pred = cb.predict(X_test)
cb_report = classification_report(y_test, cb_pred)
print(cb_report)
print('\n')
cb_confusion = confusion_matrix(y_test, cb_pred)
print(cb_confusion)

# CatBoost = 41% ACCURACY
# So far, CatBoost seems to be the best model