In [1]:
import numpy as np
#import optuna
import pandas as pd
import matplotlib.pyplot as plt
import urllib
import sklearn
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
# from imblearn.over_sampling import RandomOverSampler, SMOTE
# from imblearn.under_sampling import RandomUnderSampler
from keras.utils import to_categorical




In [3]:
test_df = pd.read_csv("./test.csv")
df = pd.read_csv("./train.csv")

In [4]:
train_frequency = df['patient_id'].value_counts().to_dict()
test_frequency = test_df['patient_id'].value_counts().to_dict()

# Combine train and test frequencies
combined_frequency = {key: train_frequency.get(key, 0) + test_frequency.get(key, 0) for key in set(train_frequency) | set(test_frequency)}

# Add 'frequency' column to both DataFrames
df['frequency_pid'] = df['patient_id'].map(combined_frequency)
test_df['frequency_pid'] = test_df['patient_id'].map(combined_frequency)


In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [6]:
# Extract features and labels
X = df.drop(['patient_id', 'enc_id', 'readmission_id'], axis=1)
# Y = to_categorical(df['readmission_id'])
Y = df['readmission_id']
test_df = test_df.drop(['enc_id', 'patient_id'], axis=1)


numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

In [55]:
# Impute missing values for numerical columns
numerical_imputer = SimpleImputer(strategy='most_frequent')
X[numerical_columns] = numerical_imputer.fit_transform(X[numerical_columns])
test_df[numerical_columns] = numerical_imputer.transform(test_df[numerical_columns])

# Impute missing values for categorical columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_columns] = categorical_imputer.fit_transform(X[categorical_columns])
test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

# One-hot encode categorical columns
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_encoded = categorical_encoder.fit_transform(X[categorical_columns])
test_data_processed = categorical_encoder.transform(test_df[categorical_columns])

# Combine numerical and encoded categorical features
X_final = pd.concat([X[numerical_columns], pd.DataFrame(X_encoded, columns=categorical_encoder.get_feature_names_out(categorical_columns))], axis=1)
test_df_encoded = pd.concat([test_df[numerical_columns], pd.DataFrame(test_data_processed, columns=categorical_encoder.get_feature_names_out(categorical_columns))], axis=1)




In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from pprint import pprint

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import optuna

import gc
import time

device = torch.device('cpu')
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
class ANN(nn.Module):
    def __init__(
        self,
        in_dim: int,
        hidden_dim_1: int,
        hidden_dim_2: int,
        hidden_dim_3: int,
        hidden_dim_4: int,
        n_classes:int = 3,
        dropout: float = 0.3
    ):
        super().__init__()

        self.layer1 = nn.Sequential(
            nn.Linear(in_features=in_dim, out_features=hidden_dim_1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim_1),
            nn.Dropout(dropout),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(in_features=hidden_dim_1, out_features=hidden_dim_2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim_2),
            nn.Dropout(dropout),
        )
        self.layer3 = nn.Sequential(
            nn.Linear(in_features=hidden_dim_2, out_features=hidden_dim_3),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim_3),
            nn.Dropout(dropout),
        )
        self.layer4 = nn.Sequential(
            nn.Linear(in_features=hidden_dim_3, out_features=hidden_dim_4),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim_4),
            nn.Dropout(dropout),
        )
        self.output_layer = nn.Linear(in_features=hidden_dim_4, out_features=n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
            Args:
                x (torch.Tensor): (batch_size, in_dim) the input

            Output:
                (torch.Tensor): (batch_size, n_classes) the output
        """
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.output_layer(x)

        return x

In [27]:
class Data(Dataset):
    def __init__(
        self,
        data
    ):
        n = data.shape[1]
        self.features = torch.tensor(data.iloc[:, 0:n-1].values.astype(np.int64), dtype=torch.float32)
        self.labels = torch.tensor(data.iloc[:, -1].values.astype(np.int64), dtype=torch.int64)

    def __getitem__(self, index):
        return self.features[index], self.labels[index]

    def __len__(self):
        return len(self.features)

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y, test_size=0.2, random_state=0, stratify=Y)
X_train = pd.DataFrame(X_train)
Y_train = pd.DataFrame(Y_train)
X_test = pd.DataFrame(X_test)
Y_test = pd.DataFrame(Y_test)

X_train = pd.concat([X_train, Y_train], axis=1)
X_test = pd.concat([X_test, Y_test], axis=1)

In [29]:
print(X_train.dtypes)

admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
num_lab_procedures            int64
                             ...   
change_Ch                   float64
change_No                   float64
diabetesMed_No              float64
diabetesMed_Yes             float64
readmission_id                int64
Length: 2324, dtype: object


In [30]:
train_dataset = Data(data=X_train)
test_dataset = Data(data=X_test)

In [31]:
train_batchsize = 512
val_batchsize = 512

In [32]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=train_batchsize, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=val_batchsize, shuffle=True)

In [52]:
n_epochs = 15

In [34]:
in_dim = X_final.shape[1]
in_dim

2323

In [46]:
model = ANN(
    in_dim=in_dim,
    hidden_dim_1=in_dim//3,
    hidden_dim_2=in_dim//9,
    hidden_dim_3=in_dim//81,
    hidden_dim_4=3
).to(device)

In [47]:
lr = 1e-3
optimiser = torch.optim.Adam(model.parameters(), lr=lr)

loss_fn = torch.nn.CrossEntropyLoss()

In [48]:
sanity_check=False

In [49]:
def train_epoch(
    model,
    dataloader,
    optimiser
):
    model.train()

    for batch in tqdm(dataloader):
        x, y = batch[0], batch[1]

        output = model(x)
        output = nn.Softmax(dim=-1)(output)
        loss = loss_fn(output, y)

        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        if sanity_check:
            break

def validate(
    model,
    dataloader
):
    model.eval()
    total_loss = 0
    predictions = []
    truths = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            x, y = batch[0], batch[1]

            output = model(x)
            output = nn.Softmax(dim=-1)(output)
            loss = loss_fn(output, y)
            total_loss += loss.detach().cpu().item()/len(dataloader)

            preds = torch.argmax(output, dim=-1)
            predictions.extend(preds.cpu())
            truths.extend(y.cpu())

            if sanity_check:
                break

    acc = accuracy_score(y_true=truths, y_pred=predictions)
    f1 = f1_score(y_true=truths, y_pred=predictions, average='macro')

    return total_loss, acc, f1

In [50]:
def train_model(
    model,
    train_dataloader,
    test_dataloader,
    optimiser
):
    for epoch in range(1, n_epochs+1):
        start_time = time.time()

        print(f"========= EPOCH {epoch} STARTED =========")
        train_epoch(model=model, dataloader=train_dataloader, optimiser=optimiser)

        print(f"========= TRAIN EVALUATION STARTED =========")
        train_val_op = validate(model=model, dataloader=train_dataloader)

        print(f"========= TEST EVALUATION STARTED =========")
        test_val_op = validate(model=model, dataloader=test_dataloader)

        print(f"END OF {epoch} EPOCH")
        print(f"| Time taken: {time.time() - start_time: 7.3f} |")
        print(f"| Train Loss: {train_val_op[0]: 7.3f} | Train acc: {train_val_op[1]: 1.5f} | Train f1: {train_val_op[2]: 1.5f} |")
        print(f"| Test Loss: {test_val_op[0]: 7.3f}  | Test acc: {test_val_op[1]: 1.5f}  | Test f1: {test_val_op[2]: 1.5f}  |")

        if sanity_check:
            break

In [53]:
train_model(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimiser=optimiser
)



100%|██████████| 112/112 [00:07<00:00, 15.52it/s]




100%|██████████| 112/112 [00:04<00:00, 24.29it/s]




100%|██████████| 28/28 [00:00<00:00, 29.28it/s]


END OF 1 EPOCH
| Time taken:  19.073 |
| Train Loss:   0.878 | Train acc:  0.71175 | Train f1:  0.49916 |
| Test Loss:   0.882  | Test acc:  0.70403  | Test f1:  0.49370  |


100%|██████████| 112/112 [00:06<00:00, 16.76it/s]




100%|██████████| 112/112 [00:04<00:00, 25.56it/s]




100%|██████████| 28/28 [00:00<00:00, 30.10it/s]


END OF 2 EPOCH
| Time taken:  18.368 |
| Train Loss:   0.845 | Train acc:  0.72171 | Train f1:  0.49328 |
| Test Loss:   0.851  | Test acc:  0.71371  | Test f1:  0.48664  |


100%|██████████| 112/112 [00:06<00:00, 16.49it/s]




100%|██████████| 112/112 [00:04<00:00, 26.64it/s]




100%|██████████| 28/28 [00:01<00:00, 24.79it/s]


END OF 3 EPOCH
| Time taken:  18.509 |
| Train Loss:   0.838 | Train acc:  0.72480 | Train f1:  0.49750 |
| Test Loss:   0.849  | Test acc:  0.71385  | Test f1:  0.48890  |


100%|██████████| 112/112 [00:09<00:00, 11.80it/s]




100%|██████████| 112/112 [00:06<00:00, 17.10it/s]




100%|██████████| 28/28 [00:01<00:00, 20.01it/s]


END OF 4 EPOCH
| Time taken:  27.535 |
| Train Loss:   0.838 | Train acc:  0.72203 | Train f1:  0.49574 |
| Test Loss:   0.847  | Test acc:  0.71245  | Test f1:  0.48797  |


100%|██████████| 112/112 [00:10<00:00, 10.70it/s]




100%|██████████| 112/112 [00:06<00:00, 16.83it/s]




100%|██████████| 28/28 [00:01<00:00, 19.67it/s]


END OF 5 EPOCH
| Time taken:  28.809 |
| Train Loss:   0.831 | Train acc:  0.72220 | Train f1:  0.49296 |
| Test Loss:   0.841  | Test acc:  0.71266  | Test f1:  0.48464  |


100%|██████████| 112/112 [00:07<00:00, 15.15it/s]




100%|██████████| 112/112 [00:04<00:00, 25.56it/s]




100%|██████████| 28/28 [00:00<00:00, 30.13it/s]


END OF 6 EPOCH
| Time taken:  19.210 |
| Train Loss:   0.833 | Train acc:  0.72103 | Train f1:  0.49378 |
| Test Loss:   0.846  | Test acc:  0.70789  | Test f1:  0.48337  |


100%|██████████| 112/112 [00:07<00:00, 15.32it/s]




100%|██████████| 112/112 [00:04<00:00, 26.32it/s]




100%|██████████| 28/28 [00:00<00:00, 29.71it/s]


END OF 7 EPOCH
| Time taken:  19.437 |
| Train Loss:   0.828 | Train acc:  0.72466 | Train f1:  0.49573 |
| Test Loss:   0.840  | Test acc:  0.71266  | Test f1:  0.48577  |


100%|██████████| 112/112 [00:07<00:00, 14.93it/s]




100%|██████████| 112/112 [00:04<00:00, 26.09it/s]




100%|██████████| 28/28 [00:00<00:00, 30.19it/s]


END OF 8 EPOCH
| Time taken:  19.251 |
| Train Loss:   0.822 | Train acc:  0.73147 | Train f1:  0.50304 |
| Test Loss:   0.839  | Test acc:  0.71322  | Test f1:  0.48905  |


100%|██████████| 112/112 [00:07<00:00, 15.05it/s]




100%|██████████| 112/112 [00:04<00:00, 23.41it/s]




100%|██████████| 28/28 [00:01<00:00, 21.29it/s]


END OF 9 EPOCH
| Time taken:  20.048 |
| Train Loss:   0.824 | Train acc:  0.72866 | Train f1:  0.50180 |
| Test Loss:   0.839  | Test acc:  0.71399  | Test f1:  0.49042  |


100%|██████████| 112/112 [00:07<00:00, 14.05it/s]




100%|██████████| 112/112 [00:04<00:00, 23.79it/s]




100%|██████████| 28/28 [00:00<00:00, 28.27it/s]


END OF 10 EPOCH
| Time taken:  20.240 |
| Train Loss:   0.819 | Train acc:  0.73261 | Train f1:  0.50370 |
| Test Loss:   0.837  | Test acc:  0.71470  | Test f1:  0.48995  |


100%|██████████| 112/112 [00:08<00:00, 13.72it/s]




100%|██████████| 112/112 [00:04<00:00, 23.84it/s]




100%|██████████| 28/28 [00:00<00:00, 29.76it/s]


END OF 11 EPOCH
| Time taken:  20.077 |
| Train Loss:   0.822 | Train acc:  0.72921 | Train f1:  0.49954 |
| Test Loss:   0.838  | Test acc:  0.71350  | Test f1:  0.48705  |


100%|██████████| 112/112 [00:07<00:00, 14.14it/s]




100%|██████████| 112/112 [00:04<00:00, 22.90it/s]




100%|██████████| 28/28 [00:01<00:00, 24.85it/s]


END OF 12 EPOCH
| Time taken:  20.472 |
| Train Loss:   0.820 | Train acc:  0.73094 | Train f1:  0.50106 |
| Test Loss:   0.838  | Test acc:  0.71329  | Test f1:  0.48688  |


100%|██████████| 112/112 [00:08<00:00, 13.78it/s]




100%|██████████| 112/112 [00:04<00:00, 24.02it/s]




100%|██████████| 28/28 [00:00<00:00, 28.95it/s]


END OF 13 EPOCH
| Time taken:  20.328 |
| Train Loss:   0.817 | Train acc:  0.73361 | Train f1:  0.50402 |
| Test Loss:   0.838  | Test acc:  0.71336  | Test f1:  0.48824  |


100%|██████████| 112/112 [00:08<00:00, 13.42it/s]




100%|██████████| 112/112 [00:04<00:00, 23.81it/s]




100%|██████████| 28/28 [00:00<00:00, 31.78it/s]


END OF 14 EPOCH
| Time taken:  20.510 |
| Train Loss:   0.823 | Train acc:  0.72749 | Train f1:  0.49738 |
| Test Loss:   0.841  | Test acc:  0.70971  | Test f1:  0.48301  |


100%|██████████| 112/112 [00:07<00:00, 14.14it/s]




100%|██████████| 112/112 [00:05<00:00, 20.24it/s]




100%|██████████| 28/28 [00:01<00:00, 25.10it/s]


END OF 15 EPOCH
| Time taken:  21.366 |
| Train Loss:   0.822 | Train acc:  0.72887 | Train f1:  0.49885 |
| Test Loss:   0.840  | Test acc:  0.71140  | Test f1:  0.48491  |


In [56]:
test_df_encoded.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,5,5,17,12,25,3,29,0,0,4,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,2,3,7,3,59,0,24,0,1,0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,3,1,1,8,47,0,23,0,0,0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,1,3,7,10,46,6,38,0,2,7,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1,6,7,5,64,1,16,0,0,1,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [57]:
test_df_encoded = torch.tensor(test_df_encoded.values.astype(np.int64), dtype=torch.float32)
print(test_df_encoded)
# Assuming model is your PyTorch model
predicted_labels = torch.nn.Softmax(dim=-1)(model(test_df_encoded))
predicted_labels = torch.max(predicted_labels.data, 1)

tensor([[ 5.,  5., 17.,  ...,  1.,  0.,  1.],
        [ 2.,  3.,  7.,  ...,  0.,  0.,  1.],
        [ 3.,  1.,  1.,  ...,  0.,  0.,  1.],
        ...,
        [ 1.,  3.,  7.,  ...,  0.,  0.,  1.],
        [ 1.,  3.,  7.,  ...,  0.,  0.,  1.],
        [ 1.,  1.,  7.,  ...,  1.,  0.,  1.]])


In [58]:
predicted_labels = predicted_labels.indices.numpy()

In [59]:
subm = pd.read_csv("./sample_submission.csv")
subm["readmission_id"] = predicted_labels
subm.to_csv('./NNsubmission.csv', index=False)