## Neural Networks - PyTorch

- Vidhish Trivedi (IMT2021055)
- Barath S Narayan (IMT2021524)
- Vikas Kalyanapuram (IMT2021040)

## Importing Libraries

In [None]:
import numpy as np
#import optuna
import pandas as pd
import matplotlib.pyplot as plt
import urllib
import sklearn
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
# from imblearn.over_sampling import RandomOverSampler, SMOTE
# from imblearn.under_sampling import RandomUnderSampler
from keras.utils import to_categorical

## Loading the data and Feature Engineering

In [None]:
# Read the labelled training set and the unlabelled test set from the working directory
df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

In [None]:
# Count how often each patient_id occurs in the train and test sets separately
train_frequency = df['patient_id'].value_counts().to_dict()
test_frequency = test_df['patient_id'].value_counts().to_dict()

# Sum the two counts for every patient_id seen in either set
combined_frequency = {}
for pid in set(train_frequency).union(test_frequency):
    combined_frequency[pid] = train_frequency.get(pid, 0) + test_frequency.get(pid, 0)

# Attach the combined count to both DataFrames as the 'frequency_pid' feature
for frame in (df, test_df):
    frame['frequency_pid'] = frame['patient_id'].map(combined_frequency)


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

## Preprocessing

In [None]:
# Target column, and the feature frame with identifier/target columns removed
Y = df['readmission_id']
X = df.drop(columns=['patient_id', 'enc_id', 'readmission_id'])
test_df = test_df.drop(columns=['enc_id', 'patient_id'])

# Partition the feature columns by dtype so each group can be preprocessed separately
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Impute missing values for numerical columns (fit on train, reuse on test)
numerical_imputer = SimpleImputer(strategy='most_frequent')
X[numerical_columns] = numerical_imputer.fit_transform(X[numerical_columns])
test_df[numerical_columns] = numerical_imputer.transform(test_df[numerical_columns])

# Impute missing values for categorical columns (fit on train, reuse on test)
categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_columns] = categorical_imputer.fit_transform(X[categorical_columns])
test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

# One-hot encode categorical columns. scikit-learn 1.2 renamed `sparse` to
# `sparse_output` and 1.4 removed the old name, so try the new keyword first
# and fall back to the old one on older releases.
try:
    categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_encoded = categorical_encoder.fit_transform(X[categorical_columns])
test_data_processed = categorical_encoder.transform(test_df[categorical_columns])

# Combine numerical and encoded categorical features. The encoded arrays are
# given the index of the frame they came from so the column-wise concat aligns
# row-for-row even when that index is not a default 0..n-1 range.
encoded_feature_names = categorical_encoder.get_feature_names_out(categorical_columns)
X_final = pd.concat(
    [
        X[numerical_columns],
        pd.DataFrame(X_encoded, columns=encoded_feature_names, index=X.index),
    ],
    axis=1,
)
test_df_encoded = pd.concat(
    [
        test_df[numerical_columns],
        pd.DataFrame(test_data_processed, columns=encoded_feature_names, index=test_df.index),
    ],
    axis=1,
)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from pprint import pprint

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import optuna

import gc
import time

# Device the model is moved to when it is created; fixed to the CPU here.
device = torch.device('cpu')
# Alternative that would pick a GPU when one is available:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Neural Network Class Definition

In [None]:
class ANN(nn.Module):
    """Feed-forward classifier with four hidden stages and a linear output head.

    Every hidden stage is ``Linear -> ReLU -> BatchNorm1d -> Dropout``; the
    output head is a plain ``Linear`` layer producing one score per class.
    """

    def __init__(
        self,
        in_dim: int,
        hidden_dim_1: int,
        hidden_dim_2: int,
        hidden_dim_3: int,
        hidden_dim_4: int,
        n_classes:int = 3,
        dropout: float = 0.3
    ):
        """
            Args:
                in_dim (int): number of input features
                hidden_dim_1..hidden_dim_4 (int): widths of the four hidden stages
                n_classes (int): number of output classes
                dropout (float): dropout probability used in every hidden stage
        """
        super().__init__()

        def hidden_stage(n_in: int, n_out: int) -> nn.Sequential:
            # One hidden stage: affine map, non-linearity, normalisation, dropout
            return nn.Sequential(
                nn.Linear(in_features=n_in, out_features=n_out),
                nn.ReLU(),
                nn.BatchNorm1d(n_out),
                nn.Dropout(dropout),
            )

        # Stages are created in order so parameter initialisation order is fixed
        self.layer1 = hidden_stage(in_dim, hidden_dim_1)
        self.layer2 = hidden_stage(hidden_dim_1, hidden_dim_2)
        self.layer3 = hidden_stage(hidden_dim_2, hidden_dim_3)
        self.layer4 = hidden_stage(hidden_dim_3, hidden_dim_4)
        self.output_layer = nn.Linear(in_features=hidden_dim_4, out_features=n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
            Args:
                x (torch.Tensor): (batch_size, in_dim) the input

            Output:
                (torch.Tensor): (batch_size, n_classes) the output
        """
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        return self.output_layer(x)

## Dataset Class Definition

In [None]:
class Data(Dataset):
    """Dataset over a DataFrame whose last column is the label.

    All columns except the last are exposed as a float32 feature tensor; the
    last column is exposed as an int64 label tensor.
    """

    def __init__(
        self,
        data
    ):
        """
            Args:
                data: pandas DataFrame with the feature columns first and the
                    integer class label in the last column
        """
        # Convert features directly to float32. Going through int64 first
        # would silently truncate any fractional feature values.
        feature_values = data.iloc[:, :-1].values.astype(np.float32)
        label_values = data.iloc[:, -1].values.astype(np.int64)

        self.features = torch.tensor(feature_values, dtype=torch.float32)
        self.labels = torch.tensor(label_values, dtype=torch.int64)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

## Generating Train-Test Split

In [None]:
# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X_final,
    Y,
    stratify=Y,
    test_size=0.2,
    random_state=0,
)

# Labels become single-column frames so they can be appended to the features
Y_train, Y_test = pd.DataFrame(Y_train), pd.DataFrame(Y_test)

# Each split keeps its label as the LAST column (the layout the Data class reads)
X_train = pd.concat((pd.DataFrame(X_train), Y_train), axis=1)
X_test = pd.concat((pd.DataFrame(X_test), Y_test), axis=1)

In [None]:
# Show the dtype of every column of the training frame (features plus the appended label)
print(X_train.dtypes)

## Preparing the Data for the NN

In [None]:
# Wrap the combined feature+label frames of both splits in Dataset objects
train_dataset, test_dataset = Data(X_train), Data(X_test)

In [None]:
# Mini-batch size used for the training loader and for the validation loader
train_batchsize = val_batchsize = 512

In [None]:
# Creating DataLoaders of the train and test data using batches.
# Only the training loader is shuffled: evaluation metrics do not depend on
# sample order, so shuffling the held-out split adds nothing and makes the
# evaluation order non-deterministic.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=train_batchsize, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=val_batchsize, shuffle=False)

In [None]:
# Number of epochs for training; read as a global by train_model
n_epochs = 15

In [None]:
# The network's input width is the number of feature columns after encoding
in_dim = len(X_final.columns)
in_dim

## Declaring the Model instance and associated optimisers and loss function

In [None]:
# Creating the Model instance.
# Hidden widths shrink with the input width; each is clamped to at least 1 so
# a small feature count cannot produce a zero-width (degenerate) layer.
model = ANN(
    in_dim=in_dim,
    hidden_dim_1=max(1, in_dim//3),
    hidden_dim_2=max(1, in_dim//9),
    hidden_dim_3=max(1, in_dim//81),
    hidden_dim_4=3
).to(device)

In [None]:
# Adam optimiser over all model parameters with a learning rate of 0.001
lr = 1e-3
optimiser = torch.optim.Adam(params=model.parameters(), lr=lr)

# Cross-entropy loss for the multi-class objective
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Smoke-test switch: when True, the training and validation loops stop after
# one batch and train_model stops after one epoch.
sanity_check=False

## Functions for training and validating the model

In [None]:
# Training the model within an epoch
def train_epoch(
    model,
    dataloader,
    optimiser
):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: the network to train; it is switched to training mode
        dataloader: iterable yielding ``(features, labels)`` batches
        optimiser: optimiser that updates ``model``'s parameters

    Uses the module-level ``loss_fn`` as the loss and stops after the first
    batch when the module-level ``sanity_check`` flag is set.
    """
    model.train()
    # Batches are moved to wherever the model's parameters live
    model_device = next(model.parameters()).device

    for batch in tqdm(dataloader):
        x, y = batch[0].to(model_device), batch[1].to(model_device)

        # The loss receives the raw logits: CrossEntropyLoss applies
        # log-softmax itself, so an explicit Softmax beforehand would
        # normalise twice and flatten the gradients.
        logits = model(x)
        loss = loss_fn(logits, y)

        optimiser.zero_grad()
        loss.backward() # Back propagation
        optimiser.step()

        if sanity_check:
            break

# Validating the model results after training
def validate(
    model,
    dataloader
):
    """Evaluate ``model`` on ``dataloader`` without updating it.

    Args:
        model: the network to evaluate; it is switched to evaluation mode
        dataloader: iterable yielding ``(features, labels)`` batches

    Returns:
        tuple ``(loss, accuracy, macro_f1)`` where ``loss`` is the sum of
        per-batch losses divided by ``len(dataloader)``.

    Uses the module-level ``loss_fn`` as the loss and stops after the first
    batch when the module-level ``sanity_check`` flag is set.
    """
    model.eval()
    # Batches are moved to wherever the model's parameters live
    model_device = next(model.parameters()).device
    n_batches = len(dataloader)

    total_loss = 0
    predictions = []
    truths = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            x, y = batch[0].to(model_device), batch[1].to(model_device)

            # CrossEntropyLoss expects raw logits (it applies log-softmax
            # internally), so no explicit Softmax is applied before the loss.
            logits = model(x)
            loss = loss_fn(logits, y)
            total_loss += loss.item()/n_batches

            # Softmax is monotonic, so the argmax of the logits is the predicted class
            preds = torch.argmax(logits, dim=-1)
            # Store plain Python ints rather than 0-d tensors for the sklearn metrics
            predictions.extend(preds.cpu().tolist())
            truths.extend(y.cpu().tolist())

            if sanity_check:
                break

    acc = accuracy_score(y_true=truths, y_pred=predictions)
    f1 = f1_score(y_true=truths, y_pred=predictions, average='macro')

    return total_loss, acc, f1

## Training the model

In [None]:
def train_model(
    model,
    train_dataloader,
    test_dataloader,
    optimiser
):
    """Train ``model`` for ``n_epochs`` epochs, reporting metrics after each.

    After every training pass the model is evaluated on both loaders and the
    elapsed time, loss, accuracy and macro F1 are printed. ``n_epochs`` and
    ``sanity_check`` are read from module scope; with ``sanity_check`` set the
    loop stops after the first epoch.
    """
    last_epoch = n_epochs + 1

    for epoch in range(1, last_epoch):
        start_time = time.time()

        print(f"========= EPOCH {epoch} STARTED =========")
        train_epoch(model=model, dataloader=train_dataloader, optimiser=optimiser)

        # Evaluate on the training data first, then on the held-out data
        print("========= TRAIN EVALUATION STARTED =========")
        train_loss, train_acc, train_f1 = validate(model=model, dataloader=train_dataloader)

        print("========= TEST EVALUATION STARTED =========")
        test_loss, test_acc, test_f1 = validate(model=model, dataloader=test_dataloader)

        # Per-epoch summary
        print(f"END OF {epoch} EPOCH")
        print(f"| Time taken: {time.time() - start_time: 7.3f} |")
        print(f"| Train Loss: {train_loss: 7.3f} | Train acc: {train_acc: 1.5f} | Train f1: {train_f1: 1.5f} |")
        print(f"| Test Loss: {test_loss: 7.3f}  | Test acc: {test_acc: 1.5f}  | Test f1: {test_f1: 1.5f}  |")

        if sanity_check:
            break

In [None]:
# Run the full training loop on the prepared loaders
train_model(model, train_dataloader, test_dataloader, optimiser)

In [None]:
# Preview the first rows of the encoded test features
test_df_encoded.head()

## Making predictions on test data

In [None]:
# Convert the encoded test features to a float32 tensor. The conversion is
# skipped when the name already holds a tensor so the cell can be re-run, and
# it goes straight to float32 so fractional feature values are not truncated.
if not torch.is_tensor(test_df_encoded):
    test_df_encoded = torch.tensor(test_df_encoded.values.astype(np.float32), dtype=torch.float32)
print(test_df_encoded)

# Inference: evaluation mode makes dropout/batch-norm deterministic, and
# no_grad avoids building an autograd graph over the whole test set.
model.eval()
with torch.no_grad():
    test_logits = model(test_df_encoded.to(next(model.parameters()).device))
    predicted_labels = torch.nn.Softmax(dim=-1)(test_logits).cpu()
predicted_labels = torch.max(predicted_labels, 1) # Picking class labels with highest probability

In [None]:
# Keep only the winning class indices as a NumPy array; .cpu() is a no-op for
# CPU tensors and makes the conversion work if the tensor lives on another device.
predicted_labels = predicted_labels.indices.cpu().numpy()

In [None]:
# Fill the sample submission's label column with the predictions and save it
subm = pd.read_csv("./sample_submission.csv").assign(readmission_id=predicted_labels)
subm.to_csv('./NNsubmission.csv', index=False)