# Part A: Trainer Class

In [24]:
import torch
import torch.nn as nn

class Trainer:
    def __init__(self, model, optimizer, criterion, device):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device

    def train_step(self, x, y):
        self.model.train()
        self.optimizer.zero_grad()
        y_pred = self.model(x)
        loss = self.criterion(y_pred, y)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def eval_step(self, x, y):
        self.model.eval()
        with torch.no_grad():
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)
        return loss.item()

# Part B: Gradiant Clipping

In [25]:
import pandas as pd

def get_grad_norm(parameters):
    total_norm = 0.0
    for p in parameters:
        if p.grad is not None:
            total_norm += p.grad.norm(2)
    
    total_norm = total_norm ** 0.5
    return total_norm.item()


data_df = pd.read_csv("../data/BTCUSDT.csv", sep='|', nrows=1000, header=None, usecols=[1, 2, 3, 4, 5],
                       names=['open', 'high', 'low', 'close', 'volume'])
data_tensor = torch.tensor(data_df.values, dtype=torch.float32).to('cuda')
target = torch.tensor(torch.ones((1000, 1)), dtype=torch.float32).to('cuda') 


model = nn.Linear(5, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.000001)
criterion = nn.MSELoss()


trainer = Trainer(model, optimizer, criterion, "cuda")

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_tensor, target)
    print(f"epoch: {epoch}, loss:{loss}, grad_norm: {get_grad_norm(model.parameters())}")

epoch: 0, loss:1279875.75, grad_norm: 4443.95849609375
epoch: 1, loss:29309939712.0, grad_norm: 54667.90625
epoch: 2, loss:671219703611392.0, grad_norm: 672504.0625
epoch: 3, loss:1.5371443036168913e+19, grad_norm: 8272892.0
epoch: 4, loss:3.5201763189153054e+23, grad_norm: 101770016.0
epoch: 5, loss:8.061471542676563e+27, grad_norm: 1251936512.0
epoch: 6, loss:1.8461382397331074e+32, grad_norm: inf
epoch: 7, loss:inf, grad_norm: inf
epoch: 8, loss:inf, grad_norm: inf
epoch: 9, loss:inf, grad_norm: inf
epoch: 10, loss:inf, grad_norm: inf
epoch: 11, loss:inf, grad_norm: inf
epoch: 12, loss:inf, grad_norm: inf
epoch: 13, loss:inf, grad_norm: inf
epoch: 14, loss:inf, grad_norm: inf
epoch: 15, loss:inf, grad_norm: inf
epoch: 16, loss:inf, grad_norm: nan
epoch: 17, loss:nan, grad_norm: nan
epoch: 18, loss:nan, grad_norm: nan
epoch: 19, loss:nan, grad_norm: nan


  target = torch.tensor(torch.ones((1000, 1)), dtype=torch.float32).to('cuda')


In [None]:
class Trainer:
    def __init__(self, model, optimizer, criterion, device, max_grad_norm=1.0):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.max_grad_norm = max_grad_norm

    def train_step(self, x, y):
        self.model.train()
        self.optimizer.zero_grad()
        y_pred = self.model(x)
        loss = self.criterion(y_pred, y)
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def eval_step(self, x, y):
        self.model.eval()
        with torch.no_grad():
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)
        return loss.item()

In [57]:
model.reset_parameters()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

trainer = Trainer(model, optimizer, criterion, "cuda", max_grad_norm=1)

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_tensor, target)
    print(f"epoch: {epoch},\t loss:{loss:.3f},\t grad_norm: {get_grad_norm(model.parameters()):.3f}")

epoch: 0,	 loss:183512.297,	 grad_norm: 1.000
epoch: 1,	 loss:116358.844,	 grad_norm: 1.000
epoch: 2,	 loss:64438.531,	 grad_norm: 1.000
epoch: 3,	 loss:27751.252,	 grad_norm: 1.000
epoch: 4,	 loss:6296.945,	 grad_norm: 1.000
epoch: 5,	 loss:75.647,	 grad_norm: 1.000
epoch: 6,	 loss:6296.944,	 grad_norm: 1.000
epoch: 7,	 loss:75.647,	 grad_norm: 1.000
epoch: 8,	 loss:6296.944,	 grad_norm: 1.000
epoch: 9,	 loss:75.648,	 grad_norm: 1.000
epoch: 10,	 loss:6296.943,	 grad_norm: 1.000
epoch: 11,	 loss:75.648,	 grad_norm: 1.000
epoch: 12,	 loss:6296.945,	 grad_norm: 1.000
epoch: 13,	 loss:75.647,	 grad_norm: 1.000
epoch: 14,	 loss:6296.945,	 grad_norm: 1.000
epoch: 15,	 loss:75.647,	 grad_norm: 1.000
epoch: 16,	 loss:6296.944,	 grad_norm: 1.000
epoch: 17,	 loss:75.647,	 grad_norm: 1.000
epoch: 18,	 loss:6296.940,	 grad_norm: 1.000
epoch: 19,	 loss:75.647,	 grad_norm: 1.000


# Part C: Mixed Precision

In [None]:
class Trainer:
    def __init__(self, model, optimizer, criterion, device, max_grad_norm=1.0):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.max_grad_norm = max_grad_norm

        self.scaler = torch.amp.GradScaler()

    def train_step(self, x, y):
        self.model.train()
        self.optimizer.zero_grad()

        with torch.autocast(device_type=self.device, dtype=torch.float16):
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)

        self.scaler.scale(loss).backward()
        self.scaler.unscale_(self.optimizer)
        nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.scaler.step(self.optimizer)
        self.scaler.update()
        return loss.item()

    def eval_step(self, x, y):
        self.model.eval()
        with torch.no_grad():
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)
        return loss.item()

In [97]:
model.reset_parameters()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

trainer = Trainer(model, optimizer, criterion, "cuda", max_grad_norm=1)

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_tensor, target)
    print(f"epoch: {epoch},\t loss:{loss:.3f},\t grad_norm: {get_grad_norm(model.parameters()):.3f}")

epoch: 0,	 loss:1166346.125,	 grad_norm: nan
epoch: 1,	 loss:1166346.125,	 grad_norm: nan
epoch: 2,	 loss:1166346.125,	 grad_norm: nan
epoch: 3,	 loss:1166346.125,	 grad_norm: nan
epoch: 4,	 loss:1166346.125,	 grad_norm: nan
epoch: 5,	 loss:1166346.125,	 grad_norm: nan
epoch: 6,	 loss:1166346.125,	 grad_norm: nan
epoch: 7,	 loss:1166346.125,	 grad_norm: nan
epoch: 8,	 loss:1166346.125,	 grad_norm: nan
epoch: 9,	 loss:1166346.125,	 grad_norm: nan
epoch: 10,	 loss:1166346.125,	 grad_norm: nan
epoch: 11,	 loss:1166346.125,	 grad_norm: nan
epoch: 12,	 loss:1166346.125,	 grad_norm: nan
epoch: 13,	 loss:1166346.125,	 grad_norm: nan
epoch: 14,	 loss:1166346.125,	 grad_norm: nan
epoch: 15,	 loss:1166346.125,	 grad_norm: nan
epoch: 16,	 loss:1166346.125,	 grad_norm: nan
epoch: 17,	 loss:1166346.125,	 grad_norm: nan
epoch: 18,	 loss:1166346.125,	 grad_norm: nan
epoch: 19,	 loss:1166346.125,	 grad_norm: nan


# Part D: AMP Integration with Normalized Data

In [205]:
data_norm = ((data_tensor - data_tensor.mean(dim=0)) / data_tensor.std(dim=0)).to("cuda")

model.reset_parameters()
optimizer = torch.optim.SGD(model.parameters(), lr=0.2)

trainer = Trainer(model, optimizer, criterion, "cuda", max_grad_norm=1)

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_norm, target)
    print(f"epoch: {epoch},\t loss:{loss:.5f},\t grad_norm: {get_grad_norm(model.parameters()):.4f},\t scale: {trainer.scaler.get_scale()}")

epoch: 0,	 loss:2.09069,	 grad_norm: nan,	 scale: 32768.0
epoch: 1,	 loss:2.09069,	 grad_norm: nan,	 scale: 16384.0
epoch: 2,	 loss:2.09069,	 grad_norm: 1.1570,	 scale: 16384.0
epoch: 3,	 loss:1.53029,	 grad_norm: 1.1148,	 scale: 16384.0
epoch: 4,	 loss:1.07081,	 grad_norm: 1.0763,	 scale: 16384.0
epoch: 5,	 loss:0.69637,	 grad_norm: 1.0612,	 scale: 16384.0
epoch: 6,	 loss:0.40266,	 grad_norm: 1.0597,	 scale: 16384.0
epoch: 7,	 loss:0.18890,	 grad_norm: 0.9879,	 scale: 16384.0
epoch: 8,	 loss:0.06804,	 grad_norm: 0.7653,	 scale: 16384.0
epoch: 9,	 loss:0.02467,	 grad_norm: 0.5935,	 scale: 16384.0
epoch: 10,	 loss:0.00888,	 grad_norm: 0.4589,	 scale: 16384.0
epoch: 11,	 loss:0.00328,	 grad_norm: 0.3557,	 scale: 16384.0
epoch: 12,	 loss:0.00128,	 grad_norm: 0.2768,	 scale: 16384.0
epoch: 13,	 loss:0.00053,	 grad_norm: 0.2138,	 scale: 16384.0
epoch: 14,	 loss:0.00027,	 grad_norm: 0.1676,	 scale: 16384.0
epoch: 15,	 loss:0.00017,	 grad_norm: 0.1308,	 scale: 16384.0
epoch: 16,	 loss:0.00014