Connected to itmo_dl_course (Python 3.12.8)

In [1]:
import os
import random

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_lightning.callbacks import (EarlyStopping, ModelCheckpoint,
                                         TQDMProgressBar)
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset
from torchmetrics import AUROC

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)


set_seed(42)

In [3]:
class SmokingDataset(Dataset):
    def __init__(self, features, targets=None):
        self.features = torch.FloatTensor(features)
        self.targets = (
            torch.FloatTensor(targets.values) if targets is not None else None
        )

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.targets is not None:
            return self.features[idx], self.targets[idx]
        return self.features[idx]

In [4]:
class SmokingPredictor(pl.LightningModule):
    def __init__(self, input_size):
        super().__init__()
        self.save_hyperparameters()

        self.layer1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.3)

        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.2)

        self.layer3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.dropout3 = nn.Dropout(0.1)

        self.output = nn.Linear(32, 1)

        self.train_auroc = AUROC(task="binary")
        self.val_auroc = AUROC(task="binary")
        self.test_auroc = AUROC(task="binary")

    def forward(self, x):
        x = self.dropout1(self.bn1(F.relu(self.layer1(x))))
        x = self.dropout2(self.bn2(F.relu(self.layer2(x))))
        x = self.dropout3(self.bn3(F.relu(self.layer3(x))))
        return torch.sigmoid(self.output(x))

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.binary_cross_entropy(y_hat, y.view(-1, 1))
        self.train_auroc(y_hat, y.int())
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_auroc", self.train_auroc, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.binary_cross_entropy(y_hat, y.view(-1, 1))
        self.val_auroc(y_hat, y.int())
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("val_auroc", self.val_auroc, on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        self.test_auroc(y_hat, y.int())
        self.log("test_auroc", self.test_auroc, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3, weight_decay=0.01)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="max", factor=0.5, patience=5, verbose=False
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_auroc",
                "frequency": 1,
            },
        }

In [5]:
def prepare_data(train_path, test_path=None):
    train_df = pd.read_csv(train_path)

    X = train_df.drop(["id", "smoking"], axis=1)
    y = train_df["smoking"]

    X = X.fillna(X.mean())

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    X_temp, X_holdout, y_temp, y_holdout = train_test_split(
        X_scaled, y, test_size=0.1, random_state=42, stratify=y
    )

    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp
    )

    train_dataset = SmokingDataset(X_train, y_train)
    val_dataset = SmokingDataset(X_val, y_val)
    holdout_dataset = SmokingDataset(X_holdout, y_holdout)

    if test_path:
        test_df = pd.read_csv(test_path)
        test_ids = test_df["id"]
        X_test = test_df.drop(["id"], axis=1)
        X_test = X_test.fillna(X_test.mean())
        X_test_scaled = scaler.transform(X_test)
        submission_dataset = SmokingDataset(X_test_scaled)
        return (
            train_dataset,
            val_dataset,
            holdout_dataset,
            submission_dataset,
            test_ids,
            X.shape[1],
        )

    return train_dataset, val_dataset, holdout_dataset, None, None, X.shape[1]

In [6]:
def train_model(train_dataset, val_dataset, input_size):
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)

    model = SmokingPredictor(input_size)

    early_stop_callback = EarlyStopping(
        monitor="val_auroc", mode="max", patience=10, verbose=False
    )

    checkpoint_callback = ModelCheckpoint(
        monitor="val_auroc",
        mode="max",
        filename="best-checkpoint",
        save_top_k=1,
        verbose=False,
    )

    logger = TensorBoardLogger(
        save_dir="lightning_logs", name="smoking_prediction", log_graph=True
    )

    trainer = pl.Trainer(
        max_epochs=100,
        callbacks=[
            early_stop_callback,
            checkpoint_callback,
            TQDMProgressBar(refresh_rate=1),
        ],
        logger=logger,
        accelerator="auto",
        devices=1,
        enable_progress_bar=True,
        enable_model_summary=False,
    )

    trainer.fit(model, train_loader, val_loader)
    best_model = SmokingPredictor.load_from_checkpoint(
        checkpoint_callback.best_model_path
    )

    return best_model, trainer, checkpoint_callback

In [7]:
def make_predictions(model, test_dataset, test_ids):
    test_loader = DataLoader(test_dataset, batch_size=32)

    predictions = []
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            pred = model(batch).cpu().numpy()
            predictions.extend(pred.flatten())

    submission = pd.DataFrame({"id": test_ids, "smoking": predictions})

    return submission

In [8]:
(
    train_dataset,
    val_dataset,
    holdout_dataset,
    submission_dataset,
    test_ids,
    input_size,
) = prepare_data("train.csv", "test.csv")

model, trainer, checkpoint_callback = train_model(
    train_dataset, val_dataset, input_size
)

holdout_loader = DataLoader(holdout_dataset, batch_size=32)
test_results = trainer.test(model, holdout_loader)
print(f"Test AUROC on holdout: {test_results[0]['test_auroc']:.4f}")

submission = make_predictions(model, submission_dataset, test_ids)
submission.to_csv("submission.csv", index=False)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/reveur/anaconda3/envs/itmo_dl_course/lib/python3.12/site-packages/pytorch_lightning/loggers/tensorboard.py:195: Could not log computational graph to TensorBoard: The `model.example_input_array` attribute is not set or `input_array` was not given.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/reveur/anaconda3/envs/itmo_dl_course/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/home/reveur/anaconda3/envs/itmo_dl_course/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/reveur/anaconda3/envs/itmo_dl_course/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       test_auroc           0.8812949657440186
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Test AUROC on holdout: 0.8813


In [None]:
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/ --host 0.0.0.0 --port 6006