# PyTorch Lightning Training Flow
- Create our own sample data
- Construct a multilayer neural network classifier
- Set up the PyTorch Lightning training flow
- Analyze training runs with TensorBoard

In [None]:
# Install the packages this notebook uses by shelling out to pip.
# Each command's result is assigned to `_` so it is captured in a throwaway
# variable rather than kept under a meaningful name.
_ = !pip install torch
_ = !pip install matplotlib
_ = !pip install scikit-learn
_ = !pip install pytorch-lightning
_ = !pip install tensorboard

In [None]:
from typing import Tuple
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer

### Mock data generation
- X: 2D Cartesian data
    - x, y coordinates, each drawn uniformly from [-0.5, 0.5)
- Labels:
    - "1" if the point lies inside a circle of radius 0.5 centered at the origin
    - "0" if the point lies outside the circle
 
The data should be stratified for best training results, meaning there should be an approximately equal number of data points with label "0" and with label "1".

In [None]:
def create_mock_data(n_records: int, n_features: int) -> Tuple[np.ndarray, np.ndarray]:
    """Generate random points and label them by whether they lie inside a circle.

    Every coordinate is drawn uniformly from ``[-0.5, 0.5)``. A point is
    labelled ``1`` when its Euclidean distance from the origin is strictly
    less than ``0.5`` and ``0`` otherwise.

    Args:
        n_records: Number of points (rows) to generate. May be ``0``.
        n_features: Number of coordinates (columns) per point.

    Returns:
        A tuple ``(x, labels)`` where ``x`` has shape
        ``(n_records, n_features)`` and ``labels`` is an integer array of
        shape ``(n_records,)`` holding ``0`` or ``1`` per row.
    """
    _x = np.random.random(size=(n_records, n_features)) - 0.5

    # Compute all row norms in one vectorized pass instead of calling a Python
    # function per row; this is faster and also works for zero rows.
    _distances = np.sqrt(np.sum(np.square(_x), axis=1))
    _labels = (_distances < 0.5).astype(int)

    return _x, _labels

In [None]:
# Draw 10,000 unstratified 2-feature points to inspect the raw label distribution.
x, labels = create_mock_data(n_records=10000, n_features=2)

In [None]:
# Scatter the first feature against the second, coloured by label.
plt.scatter(x[:, 0], x[:, 1], c=labels)

In [None]:
def stratified_sample(_x: np.ndarray, _labels: np.ndarray, n_each: int) -> Tuple[np.ndarray, np.ndarray]:
    """Draw the same number of rows, without replacement, for every label.

    Args:
        _x: Feature rows to sample from.
        _labels: Label for each row of ``_x``.
        n_each: Number of rows to draw per distinct label.

    Returns:
        A tuple ``(sample_x, sample_labels)`` in which the rows are grouped by
        label, following the sorted order of the distinct labels.
    """
    distinct_labels = np.unique(_labels)
    all_rows = np.arange(_x.shape[0])

    # Pick row numbers per label first, then gather features and labels once.
    chosen_per_label = []
    for label in distinct_labels:
        candidates = all_rows[np.nonzero(_labels == label)]
        chosen_per_label.append(np.random.choice(candidates, size=n_each, replace=False))

    chosen = np.concatenate(chosen_per_label)
    return _x[chosen], _labels[chosen]

In [None]:
def create_stratified_mock_data(
    n_records: int, n_features: int, pool_factor: int = 10
) -> Tuple[np.ndarray, np.ndarray]:
    """Generate mock data with the same number of rows for every label.

    A pool of ``pool_factor * n_records`` points is generated with
    ``create_mock_data`` and ``n_records // 2`` rows per label are then drawn
    from it with ``stratified_sample``.

    Args:
        n_records: Target number of rows. Because ``n_records // 2`` rows are
            drawn per label, an odd value yields ``n_records - 1`` rows when
            both labels are present.
        n_features: Number of coordinates per point.
        pool_factor: Multiplier applied to ``n_records`` to size the pool that
            is sampled from. Increase it when one label is rare (for example
            with many features) so the pool holds enough rows of each label.

    Returns:
        A tuple ``(x, labels)`` as returned by ``stratified_sample``.

    Raises:
        ValueError: Propagated from the sampling step when the pool contains
            fewer than ``n_records // 2`` rows of some label.
    """
    pool_x, pool_labels = create_mock_data(n_records=pool_factor * n_records, n_features=n_features)
    return stratified_sample(pool_x, pool_labels, n_each=n_records // 2)

In [None]:
# Draw a small stratified sample (200 requested rows, 2 features) for plotting.
x, labels = create_stratified_mock_data(200, 2)

In [None]:
# Plot the stratified sample: first feature vs. second feature, coloured by label.
plt.scatter(x[:, 0], x[:, 1], c=labels)

### Create PyTorch datasets for training and validation

In [None]:
class MockDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Args:
        x: Feature array; stored as a ``float32`` tensor.
        labels: Class label for each row of ``x``; stored as an ``int64``
            tensor regardless of the integer width of the input array.
    """

    def __init__(self, x: np.ndarray, labels: np.ndarray):
        self.x = torch.tensor(x, dtype=torch.float32)
        # Pin the label dtype instead of inheriting it from NumPy: class-index
        # targets for cross-entropy must be int64, and NumPy integer arrays
        # are not guaranteed to be 64-bit (e.g. int32 input).
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, index) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.labels[index]

    def __len__(self) -> int:
        """Return the number of feature rows."""
        return self.x.shape[0]

In [None]:
# Number of coordinates per point, shared by both datasets.
n_features = 2

# Build the training set first, then the validation set, each from its own
# freshly generated stratified sample.
training_data_set = MockDataset(
    *create_stratified_mock_data(n_records=10000, n_features=n_features)
)
validation_data_set = MockDataset(
    *create_stratified_mock_data(n_records=500, n_features=n_features)
)

### Build the model with PyTorch Lightning

In [None]:
# Step size handed to the Adam optimizer.
LEARNING_RATE = 0.01
# Number of samples per batch in both data loaders.
BATCH_SIZE = 32

In [None]:
class MLP(pl.LightningModule):
    """Fully connected classifier that emits two logits per input row.

    The network consists of an input linear layer, ``n_hidden_layers`` square
    linear layers (each optionally followed by batch normalisation) and a
    two-unit output layer. ReLU is applied after every layer except the
    output layer.

    Args:
        n_features: Width of each input row.
        n_hidden_layers: Number of hidden linear layers.
        nodes_per_layer: Width of the input layer's output and of every
            hidden layer.
        include_batch_norm: Whether to apply ``BatchNorm1d`` after each hidden
            linear layer, before its ReLU.
    """

    def __init__(self, n_features: int, n_hidden_layers: int, nodes_per_layer: int, include_batch_norm: bool):
        super().__init__()
        self.n_hidden_layers = n_hidden_layers
        self.include_batch_norm = include_batch_norm

        self.input_layer = nn.Linear(n_features, nodes_per_layer)
        # Hidden layers are registered under indexed names so that forward()
        # can look them up again by the same names.
        for layer_idx in range(n_hidden_layers):
            self.add_module(f"hidden_layer_{layer_idx}", nn.Linear(nodes_per_layer, nodes_per_layer))
            if include_batch_norm:
                self.add_module(f"batch_norm_{layer_idx}", nn.BatchNorm1d(nodes_per_layer))
        self.output_layer = nn.Linear(nodes_per_layer, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the network and return the output layer's result."""
        hidden = torch.relu(self.input_layer(x))
        for layer_idx in range(self.n_hidden_layers):
            hidden = getattr(self, f"hidden_layer_{layer_idx}")(hidden)
            if self.include_batch_norm:
                hidden = getattr(self, f"batch_norm_{layer_idx}")(hidden)
            hidden = torch.relu(hidden)
        return self.output_layer(hidden)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``LEARNING_RATE``."""
        return optim.Adam(self.parameters(), lr=LEARNING_RATE)

    def _batch_loss(self, batch) -> torch.Tensor:
        """Return the cross-entropy loss of the model on one ``(x, y)`` batch."""
        features, targets = batch
        return nn.functional.cross_entropy(self(features), targets)

    def training_step(self, train_batch, batch_idx):
        """Compute the loss for a training batch and log it as ``train_loss``."""
        train_loss = self._batch_loss(train_batch)
        self.log("train_loss", train_loss, on_epoch=True)
        return {"loss": train_loss}

    def validation_step(self, validation_batch, batch_idx):
        """Compute the loss for a validation batch and log it as ``validation_loss``."""
        validation_loss = self._batch_loss(validation_batch)
        self.log("validation_loss", validation_loss)
        return {"loss": validation_loss}

### Run the PyTorch Lightning trainer

In [None]:
# Training batches are reshuffled; validation batches keep dataset order.
# NOTE(review): num_workers=7 is hard-coded — confirm it suits the host machine.
train_data_loader = DataLoader(
    dataset=training_data_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=7,
)
validation_data_loader = DataLoader(
    dataset=validation_data_set,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=7,
)

In [None]:
# Trainer capped at 15 epochs; every other setting is left at its default.
trainer = pl.Trainer(max_epochs=15)

In [None]:
# A small network: two hidden layers of four nodes each, with batch norm.
model = MLP(
    n_features=2,
    n_hidden_layers=2,
    nodes_per_layer=4,
    include_batch_norm=True,
)
# Fit on the training loader and evaluate on the validation loader.
trainer.fit(model, train_data_loader, validation_data_loader)

In [None]:
# Load (or reload) the TensorBoard notebook extension, then open TensorBoard
# pointed at the `lightning_logs/` directory.
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/