In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy as np
import time
import lightning as L
from lightning.pytorch.callbacks import EarlyStopping



In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda


In [3]:
torch.manual_seed(42)

<torch._C.Generator at 0x25091a71e10>

In [4]:
train_df = pd.read_csv("D:\\Work\wsl\\ML-DS\\fashion-mnist\\fashion-mnist_train.csv")#.head(5000)
test_df = pd.read_csv("D:\\Work\wsl\\ML-DS\\fashion-mnist\\fashion-mnist_test.csv")#.head(1000)

In [5]:
# Converting to Numpy
X_train = train_df.iloc[:,1:].values
y_train = train_df.iloc[:,0].values

X_test = test_df.iloc[:,1:].values
y_test = test_df.iloc[:,0].values

In [6]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform on training set
X_test_scaled = scaler.transform(X_test) 

In [7]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        super().__init__()
        self.features = torch.tensor(features, dtype=torch.float32)#.to(device=device)
        self.labels = torch.tensor(labels, dtype=torch.long)#.to(device=device)
    def __len__(self):
        return len(self.features)
    def __getitem__(self, index):
        return self.features[index], self.labels[index]

train_dataset = CustomDataset(X_train_scaled, y_train)
test_dataset = CustomDataset(X_test_scaled, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [8]:
num_cls = len(np.unique(y_train))

In [9]:
class MyANNLightning(L.LightningModule):
    def __init__(self, num_features, num_cls, learning_rate=0.001, weight_decay=1e-4):
        super().__init__()
        self.save_hyperparameters()
        
        # Your exact same model architecture
        self.model = nn.Sequential(
            nn.Linear(num_features, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128), 
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),   
            nn.Dropout(0.3),
            nn.Linear(64, num_cls)
        )
        
        self.criterion = nn.CrossEntropyLoss()
        
    def forward(self, x):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        batch_features, batch_labels = batch
        outputs = self(batch_features)
        loss = self.criterion(outputs, batch_labels)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        batch_features, batch_labels = batch
        outputs = self(batch_features)
        loss = self.criterion(outputs, batch_labels)
        self.log('val_loss', loss, prog_bar=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.AdamW(
            self.parameters(), 
            lr=self.hparams.learning_rate, 
            weight_decay=self.hparams.weight_decay
        )
        return optimizer
    
    def on_fit_start(self):
        """Apply weight initialization when training starts"""
        def init_weights(m):
            if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
        
        self.apply(init_weights)
        print(f"Model initialized with {self.hparams.num_features} features")


# Validation of Training Steps in PyTorch Lightning

In `MyANNLightning` class, do **not** need to call  
```python
optimizer.zero_grad()
loss.backward()
optimizer.step()
```
manually—Lightning handles these under the hood. Here is how Lightning executes each training batch:

1. **`training_step`**  
   - You compute and return the loss.  
   - Lightning automatically:
     - Calls `optimizer.zero_grad()` before we `training_step`.
     - Uses the returned loss to perform `loss.backward()`.
     - Calls `optimizer.step()` after backpropagation.

2. **`configure_optimizers`**  
   - Lightning fetches the optimizer we return here and wires it into its internal loop.  
   - If we need learning‐rate schedulers, we can also return them in a tuple or list, and Lightning will step them appropriately.

3. **Gradient Accumulation, Mixed Precision, Multi-GPU, etc.**  
   - All boilerplate for zeroing gradients, backward passes, gradient clipping, optimizer steps, and moving between optimizers is handled automatically by Lightning’s Trainer.

4. **Validation Loop**  
   - Your `validation_step` computes and logs validation loss.  
   - Lightning accumulates these losses across batches, but never calls backward on them (no gradient steps during validation).

## Putting It All Together

When  run:
```python
trainer = pl.Trainer(...)
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
```
Lightning executes internally:

- Loop over epochs:
  - Loop over training batches:
    1. `optimizer.zero_grad()`
    2. Forward pass → your `training_step` → loss returned
    3. `loss.backward()`
    4. `optimizer.step()`
    5. Logging as you’ve set (`self.log("train_loss", ...)`)
  - Loop over validation batches:
    1. Forward pass → your `validation_step` → validation losses logged
    2. No gradient operations

we do not need to—and should not—duplicate these steps manually. Lightning’s whole purpose is to remove this boilerplate so you can focus on model logic, not optimizer bookkeeping.

In [10]:
# Initialize model
model = MyANNLightning(
    num_features=X_train_scaled.shape[1], 
    num_cls=num_cls
) 

In [11]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0.0001,
    mode='min',
    verbose=True
)

In [12]:
trainer = L.Trainer(
    max_epochs=1000,                   # Maximum number of epochs to train for
    callbacks=[early_stopping],        # Use early stopping callback to stop training when validation loss stops improving
    accelerator='auto',                # Automatically select 'gpu', 'cpu', or other hardware accelerators if available
    devices='auto',                    # Automatically use all available devices (e.g., all GPUs if available)
    enable_checkpointing=False,        # Disable saving model checkpoints to disk (no files will be written)
    deterministic=True,               # Ensure deterministic training for reproducibility when **True** (may reduce performance)
    log_every_n_steps=1                # Log training metrics every step (very frequent logging)
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
d:\Work\miniconda_env\hf-pytorch\lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [13]:
start_time = time.time()
trainer.fit(model, train_loader, test_loader)
end_time = time.time()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | Sequential       | 243 K  | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
243 K     Trainable params
0         Non-trainable params
243 K     Total params
0.975     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode


Model initialized with 784 features


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\Work\miniconda_env\hf-pytorch\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
d:\Work\miniconda_env\hf-pytorch\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.403


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.033 >= min_delta = 0.0001. New best score: 0.370


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.030 >= min_delta = 0.0001. New best score: 0.339


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.013 >= min_delta = 0.0001. New best score: 0.326


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.011 >= min_delta = 0.0001. New best score: 0.316


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.008 >= min_delta = 0.0001. New best score: 0.308


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.016 >= min_delta = 0.0001. New best score: 0.292


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0001. New best score: 0.287


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.003 >= min_delta = 0.0001. New best score: 0.284


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0001. New best score: 0.282


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0001. New best score: 0.278


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0001. New best score: 0.274


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0001. New best score: 0.270


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 3 records. Best score: 0.270. Signaling Trainer to stop.


In [14]:

elapsed_seconds = end_time - start_time
hours = int(elapsed_seconds // 3600)
minutes = int((elapsed_seconds % 3600) // 60)
seconds = int(elapsed_seconds % 60)
print(f"Elapsed time: {hours} hours, {minutes} minutes, {seconds} seconds")

Elapsed time: 0 hours, 9 minutes, 39 seconds


In [15]:
# trained_state_dict = model.state_dict()
# def get_trained_model():
#     """
#     Returns a fresh LightningModule loaded with trained weights,
#     moved to eval() mode and CPU (or GPU if you .to(device) after).
#     """
#     infer_model = MyANNLightning(
#         num_features=model.hparams.num_features,
#         num_cls=model.hparams.num_cls,
#         lr=model.hparams.lr,
#         weight_decay=model.hparams.weight_decay
#     )
#     infer_model.load_state_dict(trained_state_dict)
#     infer_model.eval()
#     return infer_model
# infer_model = get_trained_model()

In [16]:
model = model.to(device)
model.eval()
total = 0
correct = 0

for batch_features, batch_labels in test_loader:

    batch_features, batch_labels = batch_features.to(device, non_blocking=True), batch_labels.to(device, non_blocking=True)

    outputs = model(batch_features)
    _, pred = torch.max(outputs, 1)
    total += batch_labels.shape[0]
    correct += (pred == batch_labels).sum().item()

print("Total:", total)
print("Corrected:", correct)
print("Accuracy:", (correct/total) * 100)

Total: 10000
Corrected: 8985
Accuracy: 89.85
