In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from pprint import pprint

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import gc
import time

device = torch.device('cpu')
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [62]:
df = pd.read_csv("./processed_data.csv")

In [63]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [64]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
X_train = pd.DataFrame(X_train)
Y_train = pd.DataFrame(Y_train)
X_test = pd.DataFrame(X_test)
Y_test = pd.DataFrame(Y_test)
X_train[150] = Y_train[0]
X_test[150] = Y_test[0]
print(X_train)

           0    1    2    3    4    5    6    7    8    9    ...  139  140  \
0      47903.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0   
1      38912.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...  0.0  0.0   
2      21032.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  1.0  ...  0.0  0.0   
3      69762.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0   
4      11955.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  1.0   
...        ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
56981  21822.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  2.0   
56982  39095.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0   
56983  49046.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  ...  0.0  0.0   
56984  34998.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  ...  0.0  0.0   
56985  14295.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  ...  0.0  0.0   

       141  142  143  144  145   146  147  150  
0      2.0  8.

In [65]:
class ANN(nn.Module):
    def __init__(
        self,
        in_dim: int,
        hidden_dim_1: int,
        hidden_dim_2: int,
        hidden_dim_3: int,
        hidden_dim_4: int,
        n_classes:int = 3,
        dropout: float = 0.3
    ):
        super().__init__()

        self.layer1 = nn.Sequential(
            nn.Linear(in_features=in_dim, out_features=hidden_dim_1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim_1),
            nn.Dropout(dropout),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(in_features=hidden_dim_1, out_features=hidden_dim_2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim_2),
            nn.Dropout(dropout),
        )
        self.layer3 = nn.Sequential(
            nn.Linear(in_features=hidden_dim_2, out_features=hidden_dim_3),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim_3),
            nn.Dropout(dropout),
        )
        self.layer4 = nn.Sequential(
            nn.Linear(in_features=hidden_dim_3, out_features=hidden_dim_4),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim_4),
            nn.Dropout(dropout),
        )
        self.output_layer = nn.Linear(in_features=hidden_dim_4, out_features=n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
            Args:
                x (torch.Tensor): (batch_size, in_dim) the input

            Output:
                (torch.Tensor): (batch_size, n_classes) the output
        """
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.output_layer(x)

        return x

In [66]:
class Data(Dataset):
    def __init__(
        self,
        data
    ):
        self.features = torch.tensor(data.iloc[:, 1:-1].values, dtype=torch.float32)
        self.labels = torch.tensor(data.iloc[:, -1].values, dtype=torch.int64)
  
    def __getitem__(self, index):
        return self.features[index], self.labels[index]

    def __len__(self):
        return len(self.features)

In [67]:
train_dataset = Data(data=X_train)
test_dataset = Data(data=X_test)


In [68]:
train_batchsize = 512
val_batchsize = 512

In [69]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=train_batchsize, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=val_batchsize, shuffle=True)

In [70]:
n_epochs = 5

In [71]:
model = ANN(
    in_dim=147,
    hidden_dim_1=147//2,
    hidden_dim_2=147//4,
    hidden_dim_3=147//8,
    hidden_dim_4=147//16
).to(device)

In [72]:
lr = 1e-3
optimiser = torch.optim.Adam(model.parameters(), lr=lr)

loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor([8.96, 2.86, 1.85]))

In [73]:
sanity_check=False

In [74]:
def train_epoch(
    model,
    dataloader,
    optimiser
):
    model.train()

    for batch in tqdm(dataloader):
        x, y = batch[0], batch[1]

        output = model(x)
        output = nn.Softmax(dim=-1)(output)
        loss = loss_fn(output, y)

        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        if sanity_check:
            break

def validate(
    model,
    dataloader
):
    model.eval()
    total_loss = 0
    predictions = []
    truths = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            x, y = batch[0], batch[1]

            output = model(x)
            output = nn.Softmax(dim=-1)(output)
            loss = loss_fn(output, y)
            total_loss += loss.detach().cpu().item()/len(dataloader)

            preds = torch.argmax(output, dim=-1)
            predictions.extend(preds.cpu())
            truths.extend(y.cpu())

            if sanity_check:
                break

    acc = accuracy_score(y_true=truths, y_pred=predictions)
    f1 = f1_score(y_true=truths, y_pred=predictions, average='macro')

    return total_loss, acc, f1

In [75]:
def train_model(
    model,
    train_dataloader,
    test_dataloader,
    optimiser,
):
    for epoch in range(1, n_epochs+1):
        start_time = time.time()

        print(f"========= EPOCH {epoch} STARTED =========")
        train_epoch(model=model, dataloader=train_dataloader, optimiser=optimiser)

        print(f"========= TRAIN EVALUATION STARTED =========")
        train_val_op = validate(model=model, dataloader=train_dataloader)

        print(f"========= TEST EVALUATION STARTED =========")
        test_val_op = validate(model=model, dataloader=test_dataloader)

        print(f"END OF {epoch} EPOCH")
        print(f"| Time taken: {time.time() - start_time: 7.3f} |")
        print(f"| Train Loss: {train_val_op[0]: 7.3f} | Train acc: {train_val_op[1]: 1.5f} | Train f1: {train_val_op[2]: 1.5f} |")
        print(f"| Test Loss: {test_val_op[0]: 7.3f}  | Test acc: {test_val_op[1]: 1.5f}  | Test f1: {test_val_op[2]: 1.5f}  |")

        if sanity_check:
            break

In [76]:
train_model(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimiser=optimiser,
)



100%|██████████| 112/112 [00:01<00:00, 64.97it/s]




100%|██████████| 112/112 [00:01<00:00, 84.53it/s]




100%|██████████| 28/28 [00:00<00:00, 59.43it/s]


END OF 1 EPOCH
| Time taken:  12.197 |
| Train Loss:   1.097 | Train acc:  0.38074 | Train f1:  0.31540 |
| Test Loss:   1.097  | Test acc:  0.38570  | Test f1:  0.32026  |


100%|██████████| 112/112 [00:01<00:00, 92.71it/s]




100%|██████████| 112/112 [00:01<00:00, 105.21it/s]




100%|██████████| 28/28 [00:00<00:00, 107.69it/s]


END OF 2 EPOCH
| Time taken:   9.911 |
| Train Loss:   1.096 | Train acc:  0.43948 | Train f1:  0.30827 |
| Test Loss:   1.097  | Test acc:  0.43918  | Test f1:  0.30831  |


100%|██████████| 112/112 [00:01<00:00, 88.73it/s]




100%|██████████| 112/112 [00:01<00:00, 107.28it/s]




100%|██████████| 28/28 [00:00<00:00, 74.98it/s] 


END OF 3 EPOCH
| Time taken:   9.920 |
| Train Loss:   1.096 | Train acc:  0.45230 | Train f1:  0.31836 |
| Test Loss:   1.096  | Test acc:  0.44978  | Test f1:  0.31683  |


100%|██████████| 112/112 [00:01<00:00, 89.09it/s]




100%|██████████| 112/112 [00:01<00:00, 95.71it/s] 




100%|██████████| 28/28 [00:00<00:00, 116.18it/s]


END OF 4 EPOCH
| Time taken:  10.128 |
| Train Loss:   1.095 | Train acc:  0.46596 | Train f1:  0.33109 |
| Test Loss:   1.096  | Test acc:  0.46438  | Test f1:  0.32902  |


100%|██████████| 112/112 [00:01<00:00, 82.66it/s]




100%|██████████| 112/112 [00:01<00:00, 99.64it/s] 




100%|██████████| 28/28 [00:00<00:00, 112.45it/s]


END OF 5 EPOCH
| Time taken:  10.129 |
| Train Loss:   1.095 | Train acc:  0.47445 | Train f1:  0.34207 |
| Test Loss:   1.095  | Test acc:  0.47617  | Test f1:  0.34293  |


In [77]:
test_data = torch.tensor(X_test.iloc[:10, 1:-1].values, dtype=torch.float32)
# print(model.forward(test_data))


In [78]:
# Y_test.iloc[:10,:]

In [79]:
output_probs = nn.Softmax(dim=-1)(model(test_data))
# print("Probabilities:", output_probs)

In [80]:
output_probs

tensor([[0.3362, 0.3952, 0.2686],
        [0.3347, 0.3950, 0.2703],
        [0.3061, 0.3847, 0.3092],
        [0.3135, 0.2262, 0.4603],
        [0.3362, 0.3941, 0.2697],
        [0.3069, 0.3814, 0.3117],
        [0.3337, 0.3008, 0.3655],
        [0.3261, 0.3942, 0.2797],
        [0.3247, 0.3294, 0.3459],
        [0.3168, 0.3500, 0.3332]], grad_fn=<SoftmaxBackward0>)

In [81]:
df = pd.read_csv("./processed_inference.csv")
X_inference = df.iloc[:, 1:].values

In [82]:
X_inference.shape

(30530, 147)

In [83]:
test_data = torch.tensor(X_inference, dtype=torch.float32)

In [84]:
output_probs = nn.Softmax(dim=-1)(model(test_data))

In [85]:
output_probs

tensor([[0.3336, 0.3009, 0.3656],
        [0.3026, 0.1615, 0.5359],
        [0.3361, 0.3975, 0.2664],
        ...,
        [0.3232, 0.3584, 0.3185],
        [0.3273, 0.3956, 0.2771],
        [0.3197, 0.2682, 0.4121]], grad_fn=<SoftmaxBackward0>)

In [86]:
def get_max_idx(l):
    return list(l).index(max(l))

In [87]:
output_predictions = [get_max_idx(x) for x in output_probs]

In [88]:
print(output_predictions.count(0), output_predictions.count(1), output_predictions.count(2))

392 16937 13201


In [89]:
submit = pd.read_csv("./sample_submission.csv")
submit['readmission_id'] = output_predictions
submit.head()

Unnamed: 0,enc_id,readmission_id
0,86305392,2
1,394919696,2
2,164917446,1
3,178319040,1
4,253585416,1


In [90]:
submit.to_csv("submit_tmp.csv", index = False)
submit['readmission_id'].value_counts()

1    16937
2    13201
0      392
Name: readmission_id, dtype: int64