<h1> Deep Learning and Neural Networks </h1>

In [412]:
import torch
import torch.nn as nn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from pathlib import Path

# Plot styling shared by every figure in the notebook.
sns.set_theme(style="dark")
sns.set_context("notebook", font_scale=1.5)
plt.rcParams["figure.figsize"] = (16, 8)

# Seed torch's global RNG so the random data split and the weight
# initialisation are identical on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED);

<h2> Data Analysis </h2>

In [413]:
# The CSV is resolved relative to the current working directory:
# two levels up, then data/diabetes_india/diabetes.csv.
project_root = Path.cwd().parent.parent
data_path = project_root.joinpath("data", "diabetes_india", "diabetes.csv")
df = pd.read_csv(data_path)

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Per-column summary statistics of the loaded DataFrame.
df.describe()

In [None]:
# Histogram of the "Age" column, labelled so the figure stands on its own.
ax = df["Age"].hist()
ax.set(title="Age distribution", xlabel="Age", ylabel="Count");

In [None]:
# Histogram of the "Glucose" column, labelled so the figure stands on its own.
ax = df["Glucose"].hist()
ax.set(title="Glucose distribution", xlabel="Glucose", ylabel="Count");

In [None]:
# Histogram of the "BMI" column, labelled so the figure stands on its own.
ax = df["BMI"].hist()
ax.set(title="BMI distribution", xlabel="BMI", ylabel="Count");

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
heatmap_ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Number of rows for each distinct value of the "Outcome" column.
df["Outcome"].value_counts()

<h2> Linear vs non-linear compression  </h2>

In [None]:
# Convert the frame to float32 tensors: every column except the last becomes
# the feature matrix X, and the last column becomes the target vector y.
# NOTE(review): this relies on the label being the last column of the CSV - confirm.
table = df.to_numpy()
X = torch.tensor(table[:, :-1], dtype=torch.float32)
y = torch.tensor(table[:, -1], dtype=torch.float32)
N, d = X.shape
N, d

In [422]:
def train_test_val_split(X, y, generator=None):
    """Randomly split samples into 80% train, 10% validation and 10% test.

    Args:
        X: tensor whose first dimension indexes samples.
        y: tensor of targets indexed the same way as ``X``.
        generator: optional ``torch.Generator`` used to draw the permutation.
            Pass a seeded generator to make the split reproducible; with the
            default ``None`` torch's global RNG is used, as before.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    N = X.shape[0]
    idx = torch.randperm(N, generator=generator)

    # Cut points of the shuffled index: [0, 0.8N) train, [0.8N, 0.9N) val, rest test.
    train_end, val_end = int(0.8 * N), int(0.9 * N)
    train_idx = idx[:train_end]
    val_idx = idx[train_end:val_end]
    test_idx = idx[val_end:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    return X_train, y_train, X_val, y_val, X_test, y_test

In [423]:
# Draw one random train/validation/test partition of (X, y).
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val_split(X, y)

In [424]:
def pca(X: torch.Tensor, out_dim: int, fit_on: "torch.Tensor | None" = None) -> torch.Tensor:
    """Project ``X`` onto the leading principal components of standardized data.

    Args:
        X: ``(n_samples, n_features)`` tensor to project.
        out_dim: number of leading components to keep.
        fit_on: optional ``(m_samples, n_features)`` tensor from which the
            mean, standard deviation and components are estimated. When
            ``None`` (default) they are estimated from ``X`` itself. Passing
            the training set here projects other splits into the same basis.

    Returns:
        ``(n_samples, out_dim)`` tensor of component scores.
    """
    reference = X if fit_on is None else fit_on

    mean = reference.mean(dim=0)
    std = reference.std(dim=0)
    # A constant column has zero std (NaN for a single row); dividing by it
    # would poison the whole result, so such columns are only centred.
    std = torch.where(std > 0, std, torch.ones_like(std))

    Z = (reference - mean) / std
    sigma = Z.T @ Z

    # sigma is symmetric, so eigh applies: it returns real eigenpairs directly
    # instead of complex ones whose imaginary part has to be dropped.
    lam, V = torch.linalg.eigh(sigma)
    order = torch.argsort(lam, descending=True)
    components = V[:, order[:out_dim]]

    # Eigenvectors are only defined up to sign. Flip each one so that its
    # largest-magnitude entry is positive, which makes the output deterministic.
    pivot = components.abs().argmax(dim=0, keepdim=True)
    signs = torch.sign(components.gather(0, pivot))
    signs = torch.where(signs == 0, torch.ones_like(signs), signs)
    components = components * signs

    return ((X - mean) / std) @ components

In [425]:
# Reduce every split to its two leading principal components.
# NOTE(review): pca() is run separately on each split, so each split is
# standardized and projected with its own statistics and basis - confirm this
# is intended rather than reusing the training-set projection.
X_train_pca, X_val_pca, X_test_pca = (
    pca(split, 2) for split in (X_train, X_val, X_test)
)

In [426]:
def center_and_normalize(X):
    """Standardize each column of ``X`` to zero mean and unit standard deviation.

    Columns whose standard deviation is zero (constant) or undefined (a single
    row) are only centred, so the result never contains NaN/inf caused by a
    division by zero.
    """
    std = X.std(dim=0)
    std = torch.where(std > 0, std, torch.ones_like(std))
    return (X - X.mean(dim=0)) / std

def train_classifier(
        X_train,
        y_train,
        X_val,
        y_val,
        model,
        epochs=10000,
        lr=0.01,
        early_stop=10):
    """Train a single-logit binary classifier with full-batch SGD and early stopping.

    Args:
        X_train, y_train: training features and 0/1 float targets.
        X_val, y_val: validation features and 0/1 float targets.
        model: ``nn.Module`` that maps features to one logit per sample.
        epochs: maximum number of full-batch gradient steps.
        lr: SGD learning rate.
        early_stop: stop after this many consecutive epochs in which the
            validation loss did not reach a new minimum.

    Returns:
        ``(model, metrics, train_losses, val_losses)``. ``model`` is the input
        module with the weights of the best validation epoch loaded.
        ``metrics`` is a dict with accuracy / precision / recall / f1 on the
        validation set at that epoch, or ``None`` if no epoch was recorded
        (e.g. ``epochs=0`` or a NaN validation loss throughout).
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    # One logit per sample with 0/1 targets is a binary problem, so the
    # matching loss is sigmoid + binary cross-entropy. CrossEntropyLoss would
    # instead apply a softmax across the *samples* of the batch.
    criterion = nn.BCEWithLogitsLoss()
    train_losses, val_losses = [], []
    min_val_loss, early_stop_counter = float("inf"), 0
    best_state, best_val_logits, metrics = None, None, None

    # NOTE(review): each split is standardized with its own statistics, not
    # with the training statistics - kept as in the original; confirm intent.
    X_train = center_and_normalize(X_train)
    X_val = center_and_normalize(X_val)

    for _ in tqdm(range(epochs)):

        if early_stop_counter == early_stop:
            print("Early stopping")
            break

        optimizer.zero_grad()
        logits = model(X_train).squeeze(-1)
        loss = criterion(logits, y_train)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

        # validation
        with torch.no_grad():
            val_logits = model(X_val).squeeze(-1)
            val_loss = criterion(val_logits, y_val).item()
        val_losses.append(val_loss)

        if val_loss <= min_val_loss:
            min_val_loss = val_loss
            early_stop_counter = 0
            # Snapshot the weights: keeping a reference to `model` would just
            # alias the live module, which keeps training after this epoch.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            best_val_logits = val_logits
        else:
            early_stop_counter += 1

    if best_state is not None:
        model.load_state_dict(best_state)

        # Metrics are computed once, from the logits of the best epoch,
        # instead of on every improving epoch.
        preds = (torch.sigmoid(best_val_logits) > 0.5).float()
        precision, recall, f1 = precision_recall_fscore_support(
            y_val, preds, average="binary")[:3]
        metrics = {
            "accuracy": accuracy_score(y_val, preds),
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

    return model, metrics, train_losses, val_losses

In [427]:
class LogisticRegressor(nn.Module):
    """A single affine layer that maps ``in_dim`` features to one output value."""

    def __init__(self, in_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=in_dim, out_features=1)

    def forward(self, x):
        """Apply the affine layer to ``x``; no activation is applied here."""
        logits = self.linear(x)
        return logits
    

class MLP(nn.Module):
    """Feed-forward network: four 2-unit ReLU layers followed by a 1-unit linear output."""

    def __init__(self, in_dim):
        super().__init__()

        width = 2
        # Input layer first, then three identical width-to-width hidden layers.
        # Layers are created in forward order so weight initialisation draws
        # from the RNG in the same sequence as a literal listing would.
        layers = [nn.Linear(in_dim, width), nn.ReLU()]
        for _ in range(3):
            layers.extend([nn.Linear(width, width), nn.ReLU()])
        layers.append(nn.Linear(width, 1))

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its output."""
        out = self.mlp(x)
        return out

In [None]:
# Fit the linear baseline on the 2-D PCA features and show its validation metrics.
logistic = LogisticRegressor(in_dim=2)
logistic, metrics, train_losses, valid_losses = train_classifier(
    X_train=X_train_pca,
    y_train=y_train,
    X_val=X_val_pca,
    y_val=y_val,
    model=logistic,
)
metrics

In [None]:
# Fit the MLP on the same 2-D PCA features and show its validation metrics.
# This rebinds metrics / train_losses / valid_losses from the previous cell.
mlp = MLP(in_dim=2)
mlp, metrics, train_losses, valid_losses = train_classifier(
    X_train=X_train_pca,
    y_train=y_train,
    X_val=X_val_pca,
    y_val=y_val,
    model=mlp,
)
metrics

In [430]:
# Score the test split with both models. The MLP scores must come from `mlp`
# (they were previously computed with `logistic`, making both panels identical),
# and the raw logits are passed through a sigmoid so the *_probs variables
# really hold probabilities in [0, 1].
X_test_norm = center_and_normalize(X_test_pca)
with torch.no_grad():
    logistic_probs = torch.sigmoid(logistic(X_test_norm)).squeeze(-1)
    mlp_probs = torch.sigmoid(mlp(X_test_norm)).squeeze(-1)

In [None]:
# Side-by-side scatter of the test points in PCA space, coloured by each
# model's output and by the true label. `fig` is bound instead of `_` so
# IPython's last-output variable is not overwritten.
fig, ax = plt.subplots(ncols=3)

test_xy = X_test_pca.detach().cpu().numpy()
panels = [
    ("Logistic predictions", logistic_probs),
    ("Neural predictions", mlp_probs),
    ("True labels", y_test),
]

for axis, (title, colours) in zip(ax, panels):
    axis.scatter(
        test_xy[:, 0],
        test_xy[:, 1],
        c=colours.detach().cpu().numpy(),
        cmap='coolwarm')
    axis.set(title=title, xlabel="PC 1", ylabel="PC 2")
