# Deep Learning With PyTorch
**Deep Learning** is a subset of *Machine Learning* where the fundamental structure is a network of inputs, ($\geq 1$) hidden layers, and outputs. The original intuition of deep learning was modelling how the human brain learns: through inter-connected neurons. One of the most popular deep learning frameworks is ***PyTorch*** (`torch`), which shares similarities with the `numpy` library but uses tensors, e.g. `torch.tensor([[1, 2, 3], [4, 5, 6]])`, instead of arrays/matrices.
- Element-wise operations ($X_{a\times b}\cdot Y_{a\times b}$):`x + y` (addition), `x - y` (subtraction), `x * y` (multiplication), `torch.div(x,y)` (division)
- Matrix operations ($X_{a\times b}\cdot Y_{b\times c}$): `x @ y` (multiplication), `torch.linalg.inv(x)` (matrix inverse)

## Neural Network (NN)
### OOP deep learning model

In [None]:
import torch.nn as nn
import torch.nn.init as init

# OOP model to replace:
# net = nn.Sequential(nn.Linear(9,16), 
#               nn.ReLU(), 
#               nn.Linear(16,8), 
#               nn.ReLU(), 
#               nn.Linear(8,1), 
#               nn.Sigmoid())

class Net(nn.Module):
    """Fully connected binary classifier with layer sizes 9 -> 16 -> 8 -> 1.

    Object-oriented equivalent of the ``nn.Sequential`` model sketched in the
    comment above: two ReLU-activated hidden layers followed by a single
    sigmoid-activated output unit.
    """

    def __init__(self):
        """Create the three linear layers in forward order."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=9, out_features=16)
        self.fc2 = nn.Linear(in_features=16, out_features=8)
        self.fc3 = nn.Linear(in_features=8, out_features=1)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Input tensor whose last dimension has 9 features.

        Returns:
            Tensor with one sigmoid output (a value in (0, 1)) per input row.
        """
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        logit = self.fc3(hidden)
        return nn.functional.sigmoid(logit)

# Instantiate the fully connected classifier; later cells train and evaluate this `net` object.
net = Net()


## solutions for unstable (vanishing/exploding) gradients:
class Net2(Net):
    """``Net`` variant that combines three remedies for unstable gradients.

    1. Batch normalization after each hidden linear layer: the layer's
       outputs are normalized, then scaled and shifted with learned
       parameters, so the model learns a suitable input distribution for the
       next layer. This tends to make the loss decrease faster and helps
       against vanishing/exploding gradients.
    2. Activation-aware weight initialization (Kaiming/He), intended to keep
       the variance of activations and of gradients roughly the same before
       and after each layer.
    3. ELU activations: the gradient is non-zero for negative inputs, which
       helps against dying neurons (units stuck at 0), and the negative
       outputs pull the mean activation toward zero, which helps against
       vanishing gradients.
    """

    def __init__(self):
        """Add the batch-norm layers to the inherited linear layers, then re-initialize weights."""
        super().__init__()
        self.bn1 = nn.BatchNorm1d(num_features=16)
        self.bn2 = nn.BatchNorm1d(num_features=8)
        self._weight_initialization()

    def _weight_initialization(self):
        """Re-initialize the linear layers' weights in place with Kaiming uniform init.

        The hidden layers use the initializer's default settings; the output
        layer passes ``nonlinearity="sigmoid"`` because it feeds a sigmoid.
        """
        for hidden_layer in (self.fc1, self.fc2):
            init.kaiming_uniform_(hidden_layer.weight)
        init.kaiming_uniform_(self.fc3.weight, nonlinearity="sigmoid")

    def forward(self, x):
        """Run linear -> batch norm -> ELU twice, then the sigmoid output layer.

        Args:
            x: Input tensor of shape (batch, 9).

        Returns:
            Tensor of shape (batch, 1) holding sigmoid outputs.
        """
        h = nn.functional.elu(self.bn1(self.fc1(x)))
        h = nn.functional.elu(self.bn2(self.fc2(h)))
        # Alternative activation with a non-zero slope for x < 0:
        #   nn.functional.leaky_relu(h, negative_slope=0.05)
        return nn.functional.sigmoid(self.fc3(h))

### DataLoader setup

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class CsvDataset(Dataset):
    """Map-style dataset backed by a CSV file of numeric columns.

    Each row is one sample: every column except the last is a feature and the
    last column is the label.
    """

    def __init__(self, csv_path):
        """Load the whole CSV into memory as a float32 array.

        Args:
            csv_path: Path (or file-like object) accepted by ``pd.read_csv``.

        Raises:
            ValueError: If a column cannot be converted to float32
                (e.g. it contains non-numeric text).
        """
        super().__init__()
        df = pd.read_csv(csv_path)
        # Convert once to float32: pandas would otherwise produce float64/int64
        # data, which DataLoader turns into Double/Long tensors that do not
        # match the float32 parameters of nn.Linear or the float targets
        # expected by nn.BCELoss.
        self.data = df.to_numpy(dtype="float32")

    def __len__(self):
        """Return the number of rows (samples)."""
        return self.data.shape[0]

    def __getitem__(self, indx):
        """Return ``(features, label)`` for the row(s) selected by ``indx``."""
        features = self.data[indx, :-1]
        label = self.data[indx, -1]
        return features, label
    
# Load the training CSV and serve it in shuffled mini-batches of two rows.
train_dataset = CsvDataset("train.csv")
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=2)

### Model Training

In [None]:
import torch.nn as nn
import torch.optim as optim

# Binary cross-entropy on probabilities; the network's final sigmoid provides them.
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.95) # stochastic gradient descent
# Alternative optimizers:
#         = optim.Adagrad(net.parameters(), lr=0.01) # adaptive gradient descent (different learning rate for each parameter)
#         = optim.RMSprop(net.parameters(), lr=0.01) # root mean square propagation (update for each parameter based on the size of previous gradient)
#         = optim.Adam(net.parameters(), lr=0.01) # adaptive moment estimation (RMSprop + gradient momentum)

# Make sure train-time behaviour is active, e.g. when this cell is re-run
# after the evaluation cell has switched the model to eval mode.
net.train()
for epoch in range(1000):
    for features, label in train_dataloader:
        optimizer.zero_grad()
        # Batches built from NumPy data may arrive as float64 (or integer)
        # tensors, while the model parameters are float32; cast so the dtypes
        # match instead of raising a dtype-mismatch error.
        outputs = net(features.float())
        # BCELoss needs a float target with the same (batch, 1) shape as the outputs.
        loss = criterion(outputs, label.float().view(-1, 1))
        loss.backward()
        optimizer.step()

### Evaluation Procedure

In [None]:
import torch
from torchmetrics import Accuracy

test_dataset = CsvDataset('test.csv')
# Accuracy does not depend on sample order, so the test set is not shuffled.
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

acc = Accuracy(task="binary")

net.eval()  # switch layers such as batch norm to inference behaviour
with torch.no_grad():  # no gradients are needed for evaluation
    for features, label in test_dataloader:
        # Batches built from NumPy data may arrive as float64 (or integer)
        # tensors, while the model parameters are float32; cast so the dtypes
        # match instead of raising a dtype-mismatch error.
        outputs = net(features.float())
        # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
        pred = (outputs >= 0.5).float()
        # Calling the metric accumulates this batch into its running state.
        acc(pred, label.view(-1,1))

# Accuracy over all batches seen above.
accuracy = acc.compute()
print(f"accuracy: {accuracy}")

## Image Convolutional NN

### Data Augmentation

In [None]:
from torchvision.datasets import ImageFolder
from torchvision import transforms

# Generating more data by applying random transformations to the original images
# - increase size and diversity of training set
# - improve model robustness
# - reduces overfitting
# Note that transformations need to make sense for the data and the task (e.g. if the model is designed to detect
# lemons vs. limes, the color must not be altered, as the label would no longer be accurate)
# Random augmentations are applied first; the image is then converted to a
# tensor and resized to the 64x64 input expected by the model.
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(45),
    transforms.RandomAutocontrast(),
    # ...
    transforms.ToTensor(),
    transforms.Resize((64, 64)),
])
train_dataset = ImageFolder("./train", transform=train_transforms)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
image, label = next(iter(train_dataloader))
# `image` is a batch of shape (batch, channels, height, width). With
# batch_size=2, squeeze() removes no dimension, so permute(1, 2, 0) on the
# 4-D batch would raise. Pick the first image of the batch instead and move
# the channel axis last, giving (height, width, channels) for display.
# `label` still holds the labels of the whole batch.
image = image[0].permute(1, 2, 0)

# Note that there should be no data augmentation for test data

### Convolutional Layers

In [None]:
## Normal Convolutional Layers
# Slide overlapping filter(s) of parameters over the input, performing convolution (dot-product) at each position resulting in a feature map (one 
# filter = one feature map); this preserves spatial patterns from the input and uses fewer parameters than linear layers. Activation functions are then 
# applied to each feature map (similar to normal NN), and all maps are combined to form an output.
# - a frame of 0s can be added to convolutional layer's input to ensure border pixels are treated equally to others.

## MaxPooling
# Slide non-overlapping window(s) over the input, and at each position retaining only the maximum value (used after convolutional layers to reduce 
# spatial dimensions).

class Net(nn.Module):
    """Small convolutional image classifier.

    Two (3x3 convolution -> ELU -> 2x2 max-pool) stages followed by a single
    linear layer that produces one raw score (logit) per class.

    NOTE: this class re-uses the name ``Net`` and therefore shadows the fully
    connected ``Net`` defined earlier in the notebook once this cell has run.
    """

    def __init__(self, num_classes, input_size=64): # network definition
        """Build the layers.

        Args:
            num_classes: Number of output classes.
            input_size: Height and width of the (square) 3-channel input
                images. Defaults to 64, matching the resize used in the
                augmentation pipeline.

        Raises:
            ValueError: If ``input_size`` is too small to survive the two
                2x2 pooling steps.
        """
        super().__init__()
        if input_size < 4:
            raise ValueError("input_size must be at least 4")

        self.cl1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.cl2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.cl3 = nn.MaxPool2d(kernel_size=2)
        self.cl4 = nn.Flatten()

        # padding=1 with a 3x3 kernel keeps height/width unchanged, so only
        # the two max-pool steps shrink the image: each one halves (and
        # floors) the spatial size.
        pooled_size = input_size // 2 // 2
        self.fc1 = nn.Linear(64 * pooled_size * pooled_size, num_classes)

    def forward(self, x): # forward pass
        """Compute class scores for a batch of images.

        Args:
            x: Tensor of shape (batch, 3, input_size, input_size).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized scores.
        """
        # Per-sample shapes below are (channels, height, width) for the default input_size=64.
        x = self.cl1(x)            # 3x64x64  -> 32x64x64
        x = nn.functional.elu(x)
        x = self.cl3(x)            # 32x64x64 -> 32x32x32
        x = self.cl2(x)            # 32x32x32 -> 64x32x32
        x = nn.functional.elu(x)
        x = self.cl3(x)            # 64x32x32 -> 64x16x16
        x = self.cl4(x)            # 64x16x16 -> 16384
        x = self.fc1(x)            # 16384    -> num_classes
        return x