## Load Libraries & Data

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

In [51]:
import os

# Recursively walk the Kaggle input directory and print the full path of
# every file found, so the available datasets are visible in the output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    for full_path in full_paths:
        print(full_path)

In [52]:
# Read the four competition CSV files in a fixed order: training data,
# test data, the data dictionary, and the sample submission.
csv_paths = [
    '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data.csv',
    '/kaggle/input/av-healthcare-analytics-ii/healthcare/test_data.csv',
    '/kaggle/input/av-healthcare-analytics-ii/healthcare/train_data_dictionary.csv',
    '/kaggle/input/av-healthcare-analytics-ii/healthcare/sample_sub.csv',
]
train, test, dictionary, sample = map(pd.read_csv, csv_paths)

## Data Dictionary

In [53]:
# Display the data dictionary loaded from train_data_dictionary.csv.
dictionary

In [54]:
# Preview the first five rows of the raw training data.
train.head(5)

In [55]:
# Count how many training rows fall into each `Stay` category (the target).
train['Stay'].value_counts()

In [56]:
# Preview the first five rows of the training data.
# NOTE(review): `train` has not been modified since the earlier `train.head(5)`
# cell, so this output repeats it — consider removing one of the two.
train.head(5)

In [57]:
# Distribution of target feature
# Use an explicit figure/axes pair so the bar chart can be labelled; a plot
# with no title or axis labels cannot be read when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(10, 7))
train.Stay.value_counts().plot(kind="bar", color=['Salmon'], ax=ax)
# The trailing semicolon suppresses the echoed return value of `ax.set`.
ax.set(title="Distribution of Stay (target)", xlabel="Stay (days)", ylabel="Number of rows");

**Insight:** Most patients stay in hospital for **21-30 days**, followed by **11-20 days** and **31-40 days**.

In [58]:
# Print the distinct values of every training column, one block per column,
# each block closed by a separator line and a blank line.
separator = '======================================'
for column in train.columns:
    distinct_values = train[column].unique()
    print('Unique Values for {}'.format(column))
    print(distinct_values)
    print(separator)
    print()

**Insights**:
1. Categorical features which need to be encoded:
- `Hospital_region_code`
- `Department`
- `Ward_Type`
- `Type of Admission`
- `Severity of Illness`

2. Bins (range) which need to be encoded:
- `Age`
- `Stay` (Target)


In [59]:
# Number of missing values in each training column.
train.isnull().sum()

`Bed Grade` and `City_Code_Patient` contain null values that would need to be filled.
- But do we need these features at all? (Both are dropped in the next section.)

# 2. Data Processing & Feature engineering

In [60]:
# Remove the same four columns from both frames so that train and test keep
# an identical set of feature columns.
unused_columns = ['Hospital_region_code', 'Bed Grade', 'patientid', 'City_Code_Patient']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [61]:
# Combine test and train dataset for processing
# The list holds references to the two frames (index 0 = train, index 1 = test);
# column assignments made through `combined` in later cells modify these same
# objects rather than copies.
combined = [train, test]

### 2a. Encoding categorical features

In [62]:
from sklearn.preprocessing import LabelEncoder

# Learn the Department -> integer mapping from the training frame only and
# reuse it for the test frame. Fitting a fresh encoder on each frame (as a
# per-dataset `fit_transform` does) can give the same department different
# codes in train and test whenever the two frames do not contain exactly the
# same set of categories, which silently corrupts the model's inputs.
train_frame, test_frame = combined
department_encoder = LabelEncoder()
train_frame['Department'] = department_encoder.fit_transform(train_frame['Department'])
# `transform` raises ValueError on a department that never appeared in train,
# so a mismatch surfaces here instead of being silently mis-coded.
test_frame['Department'] = department_encoder.transform(test_frame['Department'])

In [63]:
# Distinct Department codes present in the test frame after encoding.
combined[1]['Department'].unique()

In [64]:
# Label-encode the remaining categorical columns (hospital type, ward
# facility, ward type, admission type and illness severity).
# Each column gets its own encoder, fitted on the training frame and then
# applied unchanged to the test frame, so a given category always maps to the
# same integer in both frames. Refitting on test could assign different codes
# if the two frames do not share exactly the same categories.
categorical_columns = [
    'Hospital_type_code',
    'Ward_Facility_Code',
    'Ward_Type',
    'Type of Admission',
    'Severity of Illness',
]
train_frame, test_frame = combined
for column in categorical_columns:
    label = LabelEncoder()
    train_frame[column] = label.fit_transform(train_frame[column])
    # Raises ValueError if test contains a category unseen in train.
    test_frame[column] = label.transform(test_frame[column])

In [65]:
# Display the training frame after the categorical columns have been encoded.
combined[0]

### 2b. Binning features

In [66]:
# Display the test frame after the categorical columns have been encoded.
combined[1]

In [67]:
# Histogram of the training `Age` column (still bracket labels at this point),
# saved to disk as a 100-dpi PNG.
train_age = combined[0]['Age']
train_age.hist()
plt.savefig('100dpi5.png', dpi=100)

In [68]:
# Distinct age-bracket labels in the training frame.
combined[0]['Age'].unique()

In [69]:
# Ordinal code for each age bracket: the brackets are listed from lowest to
# highest and numbered 0-9 in that order.
age_brackets = [
    '0-10', '11-20', '21-30', '31-40', '41-50',
    '51-60', '61-70', '71-80', '81-90', '91-100',
]
age_dict = {bracket: code for code, bracket in enumerate(age_brackets)}

In [70]:
# Swap every age-bracket label for its ordinal code in both train and test.
for frame in combined:
    frame['Age'] = frame['Age'].replace(age_dict)

In [71]:
# Distinct `Stay` labels in the training frame (the target categories).
combined[0]['Stay'].unique()

In [72]:
# Ordinal code for each length-of-stay category: listed from shortest to
# longest and numbered 0-10 in that order.
stay_categories = [
    '0-10', '11-20', '21-30', '31-40', '41-50', '51-60',
    '61-70', '71-80', '81-90', '91-100', 'More than 100 Days',
]
stay_dict = {category: code for code, category in enumerate(stay_categories)}

In [73]:
# Only the training frame is converted here: replace each `Stay` label with
# its ordinal code from `stay_dict`.
train_frame = combined[0]
train_frame['Stay'] = train_frame['Stay'].replace(stay_dict)

In [74]:
# Histogram of the training `Age` column after it has been converted to
# ordinal codes, saved to disk as a 100-dpi PNG.
encoded_age = combined[0]['Age']
encoded_age.hist()
plt.savefig('100dpi6.png', dpi=100)

In [75]:
# Print the (rows, columns) shape of the train frame and then the test frame,
# one per line.
print(*(frame.shape for frame in combined), sep='\n')

In [76]:
# Print the column dtypes and non-null counts of the test frame.
combined[1].info()

### 2c. Scaling numerical data

In [77]:
# Columns that the next step passes through StandardScaler.
# NOTE(review): 'Type of Admission' was label-encoded earlier in this notebook,
# so it holds category codes rather than a measured quantity — confirm that
# standardising it is intended.
columns_list = [
    'Type of Admission',
    'Available Extra Rooms in Hospital',
    'Visitors with Patient',
    'Admission_Deposit',
]

In [78]:
# Number of columns selected for scaling.
len(columns_list)

In [79]:
from sklearn.preprocessing import StandardScaler

# Standardise the columns in `columns_list` using statistics learned from the
# training frame only. Calling `fit_transform` on each frame separately would
# scale train and test with different means and variances, so the same raw
# value would reach the model as different numbers at fit and predict time.
ss = StandardScaler()

train_frame, test_frame = combined
train_frame[columns_list] = ss.fit_transform(train_frame[columns_list].values)
# Reuse the training mean/variance for the test frame (transform, not fit).
test_frame[columns_list] = ss.transform(test_frame[columns_list].values)


In [80]:
# Display the training frame after scaling the columns in `columns_list`.
combined[0]

In [81]:
# Annotated heatmap of the pairwise correlations between the training
# columns, saved to disk as a 100-dpi PNG.
correlations = combined[0].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.savefig('100dpi7.png', dpi=100)

# 3. Data Modelling

In [82]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier


In [83]:
# Rebind the processed frames to their usual names (index 0 = train, 1 = test).
train, test = combined

In [84]:
# Display the sample submission loaded from sample_sub.csv.
sample

In [85]:
# Training features are every column except `case_id` and the `Stay` target;
# test features are every column except `case_id`.
Y_train = train["Stay"]
X_train = train.drop(columns=['case_id', 'Stay'])
X_test = test.drop(columns="case_id").copy()
X_train.shape, Y_train.shape, X_test.shape

In [86]:
# Shape of the sample submission frame.
sample.shape

In [87]:
# Feature columns that will be passed to the model at prediction time.
X_test.columns

In [88]:
# Display the training target (the `Stay` column of `train`).
Y_train

In [89]:
# Decision Tree
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows so accuracy is measured on data the tree
# has not seen. Scoring on the rows used for fitting only shows how well an
# unpruned tree memorised them and says nothing about generalisation.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)
validation_tree = DecisionTreeClassifier(random_state=42)
validation_tree.fit(X_fit, Y_fit)
# Validation accuracy in percent, rounded to two decimals.
acc_decision_tree = round(validation_tree.score(X_val, Y_val) * 100, 2)

# Refit on all labelled rows before predicting the test set, so the submitted
# model uses every available example; the fixed seed makes it reproducible.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree

# 4. Submission

In [90]:
# Display the sample submission again as a reference for the expected format.
# NOTE(review): `sample` is never modified in this notebook, so this repeats an
# earlier output.
sample

In [91]:
# Pair each test `case_id` with the model's predicted `Stay` code.
submission = test[["case_id"]].assign(Stay=Y_pred)

In [92]:
# Convert the predicted ordinal codes back to the original `Stay` labels by
# inverting `stay_dict`.
code_to_label = {code: label for label, code in stay_dict.items()}
submission['Stay'] = submission['Stay'].replace(code_to_label)

In [93]:
# Write the predictions to submission.csv without the DataFrame index column.
submission.to_csv('submission.csv', index = False)

In [94]:
# Display the sample submission once more.
# NOTE(review): this shows `sample`, not the `submission` frame that was just
# written — confirm which one was meant to be inspected here.
sample