# Part A: Trainer Class

In [220]:
import torch
import torch.nn as nn

class Trainer:
    def __init__(self, model, optimizer, criterion, device):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device

    def train_step(self, x, y):
        self.model.train()
        self.optimizer.zero_grad()
        y_pred = self.model(x)
        loss = self.criterion(y_pred, y)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def eval_step(self, x, y):
        self.model.eval()
        with torch.no_grad():
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)
        return loss.item()

# Part B: Gradiant Clipping

In [221]:
import pandas as pd

def get_grad_norm(parameters):
    total_norm = 0.0
    for p in parameters:
        if p.grad is not None:
            total_norm += p.grad.norm(2)
    
    total_norm = total_norm ** 0.5
    return total_norm.item()


data_df = pd.read_csv("../data/BTCUSDT.csv", sep='|', nrows=1000, header=None, usecols=[1, 2, 3, 4, 5],
                       names=['open', 'high', 'low', 'close', 'volume'])
data_tensor = torch.tensor(data_df.values, dtype=torch.float32).to('cuda')
target = torch.tensor(torch.ones((1000, 1)), dtype=torch.float32).to('cuda') 


model = nn.Linear(5, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.000001)
criterion = nn.MSELoss()


trainer = Trainer(model, optimizer, criterion, "cuda")

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_tensor, target)
    print(f"epoch: {epoch}, loss:{loss}, grad_norm: {get_grad_norm(model.parameters())}")

epoch: 0, loss:3517725.5, grad_norm: 5721.951171875
epoch: 1, loss:80558546944.0, grad_norm: 70389.296875
epoch: 2, loss:1844851662389248.0, grad_norm: 865902.6875
epoch: 3, loss:4.224850999692073e+19, grad_norm: 10652010.0
epoch: 4, loss:9.675229356395934e+23, grad_norm: 131037024.0
epoch: 5, loss:2.2157012971970574e+28, grad_norm: 1611968384.0
epoch: 6, loss:5.07412356661216e+32, grad_norm: inf
epoch: 7, loss:inf, grad_norm: inf
epoch: 8, loss:inf, grad_norm: inf
epoch: 9, loss:inf, grad_norm: inf
epoch: 10, loss:inf, grad_norm: inf
epoch: 11, loss:inf, grad_norm: inf
epoch: 12, loss:inf, grad_norm: inf
epoch: 13, loss:inf, grad_norm: inf
epoch: 14, loss:inf, grad_norm: inf
epoch: 15, loss:inf, grad_norm: inf
epoch: 16, loss:inf, grad_norm: nan
epoch: 17, loss:nan, grad_norm: nan
epoch: 18, loss:nan, grad_norm: nan
epoch: 19, loss:nan, grad_norm: nan


  target = torch.tensor(torch.ones((1000, 1)), dtype=torch.float32).to('cuda')


In [222]:
class Trainer:
    def __init__(self, model, optimizer, criterion, device, max_grad_norm=1.0):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.max_grad_norm = max_grad_norm

    def train_step(self, x, y):
        self.model.train()
        self.optimizer.zero_grad()
        y_pred = self.model(x)
        loss = self.criterion(y_pred, y)
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def eval_step(self, x, y):
        self.model.eval()
        with torch.no_grad():
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)
        return loss.item()

In [223]:
model.reset_parameters()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

trainer = Trainer(model, optimizer, criterion, "cuda", max_grad_norm=1)

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_tensor, target)
    print(f"epoch: {epoch},\t loss:{loss:.3f},\t grad_norm: {get_grad_norm(model.parameters()):.3f}")

epoch: 0,	 loss:894990.375,	 grad_norm: 1.000
epoch: 1,	 loss:737480.500,	 grad_norm: 1.000
epoch: 2,	 loss:595203.625,	 grad_norm: 1.000
epoch: 3,	 loss:468159.656,	 grad_norm: 1.000
epoch: 4,	 loss:356348.719,	 grad_norm: 1.000
epoch: 5,	 loss:259770.812,	 grad_norm: 1.000
epoch: 6,	 loss:178425.859,	 grad_norm: 1.000
epoch: 7,	 loss:112313.922,	 grad_norm: 1.000
epoch: 8,	 loss:61435.051,	 grad_norm: 1.000
epoch: 9,	 loss:25789.127,	 grad_norm: 1.000
epoch: 10,	 loss:5376.165,	 grad_norm: 1.000
epoch: 11,	 loss:196.212,	 grad_norm: 1.000
epoch: 12,	 loss:5376.155,	 grad_norm: 1.000
epoch: 13,	 loss:196.214,	 grad_norm: 1.000
epoch: 14,	 loss:5376.146,	 grad_norm: 1.000
epoch: 15,	 loss:196.216,	 grad_norm: 1.000
epoch: 16,	 loss:5376.135,	 grad_norm: 1.000
epoch: 17,	 loss:196.218,	 grad_norm: 1.000
epoch: 18,	 loss:5376.126,	 grad_norm: 1.000
epoch: 19,	 loss:196.219,	 grad_norm: 1.000


# Part C: Mixed Precision

In [224]:
class Trainer:
    def __init__(self, model, optimizer, criterion, device, max_grad_norm=1.0):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.max_grad_norm = max_grad_norm

        self.scaler = torch.amp.GradScaler()

    def train_step(self, x, y):
        self.model.train()
        self.optimizer.zero_grad()

        with torch.autocast(device_type=self.device, dtype=torch.float16):
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)

        self.scaler.scale(loss).backward()
        self.scaler.unscale_(self.optimizer)
        nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.scaler.step(self.optimizer)
        self.scaler.update()
        return loss.item()

    def eval_step(self, x, y):
        self.model.eval()
        with torch.no_grad():
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)
        return loss.item()

In [225]:
model.reset_parameters()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

trainer = Trainer(model, optimizer, criterion, "cuda", max_grad_norm=1)

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_tensor, target)
    print(f"epoch: {epoch},\t loss:{loss:.3f},\t grad_norm: {get_grad_norm(model.parameters()):.3f}")

epoch: 0,	 loss:14492333.000,	 grad_norm: nan
epoch: 1,	 loss:14492333.000,	 grad_norm: nan
epoch: 2,	 loss:14492333.000,	 grad_norm: nan
epoch: 3,	 loss:14492333.000,	 grad_norm: nan
epoch: 4,	 loss:14492333.000,	 grad_norm: nan
epoch: 5,	 loss:14492333.000,	 grad_norm: nan
epoch: 6,	 loss:14492333.000,	 grad_norm: nan
epoch: 7,	 loss:14492333.000,	 grad_norm: nan
epoch: 8,	 loss:14492333.000,	 grad_norm: nan
epoch: 9,	 loss:14492333.000,	 grad_norm: nan
epoch: 10,	 loss:14492333.000,	 grad_norm: nan
epoch: 11,	 loss:14492333.000,	 grad_norm: nan
epoch: 12,	 loss:14492333.000,	 grad_norm: nan
epoch: 13,	 loss:14492333.000,	 grad_norm: nan
epoch: 14,	 loss:14492333.000,	 grad_norm: nan
epoch: 15,	 loss:14492333.000,	 grad_norm: nan
epoch: 16,	 loss:14492333.000,	 grad_norm: nan
epoch: 17,	 loss:14492333.000,	 grad_norm: nan
epoch: 18,	 loss:14492333.000,	 grad_norm: nan
epoch: 19,	 loss:14492333.000,	 grad_norm: nan


# Part D: AMP Integration with Normalized Data

In [226]:
data_norm = ((data_tensor - data_tensor.mean(dim=0)) / data_tensor.std(dim=0)).to("cuda")

model.reset_parameters()
optimizer = torch.optim.SGD(model.parameters(), lr=0.2)

trainer = Trainer(model, optimizer, criterion, "cuda", max_grad_norm=1)

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_norm, target)
    print(f"epoch: {epoch},\t loss:{loss:.5f},\t grad_norm: {get_grad_norm(model.parameters()):.4f},\t scale: {trainer.scaler.get_scale()}")

epoch: 0,	 loss:3.21218,	 grad_norm: nan,	 scale: 32768.0
epoch: 1,	 loss:3.21218,	 grad_norm: nan,	 scale: 16384.0
epoch: 2,	 loss:3.21218,	 grad_norm: 1.1789,	 scale: 16384.0
epoch: 3,	 loss:2.31021,	 grad_norm: 1.1886,	 scale: 16384.0
epoch: 4,	 loss:1.62655,	 grad_norm: 1.1818,	 scale: 16384.0
epoch: 5,	 loss:1.11609,	 grad_norm: 1.1386,	 scale: 16384.0
epoch: 6,	 loss:0.72702,	 grad_norm: 1.0684,	 scale: 16384.0
epoch: 7,	 loss:0.42569,	 grad_norm: 1.0387,	 scale: 16384.0
epoch: 8,	 loss:0.20506,	 grad_norm: 0.9881,	 scale: 16384.0
epoch: 9,	 loss:0.07380,	 grad_norm: 0.7652,	 scale: 16384.0
epoch: 10,	 loss:0.02659,	 grad_norm: 0.5925,	 scale: 16384.0
epoch: 11,	 loss:0.00973,	 grad_norm: 0.4599,	 scale: 16384.0
epoch: 12,	 loss:0.00359,	 grad_norm: 0.3565,	 scale: 16384.0
epoch: 13,	 loss:0.00138,	 grad_norm: 0.2764,	 scale: 16384.0
epoch: 14,	 loss:0.00058,	 grad_norm: 0.2142,	 scale: 16384.0
epoch: 15,	 loss:0.00028,	 grad_norm: 0.1645,	 scale: 16384.0
epoch: 16,	 loss:0.00019

# Part E: Checkpointing System

In [227]:
class Trainer:
    def __init__(self, model, optimizer, criterion, device, max_grad_norm=1.0):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.max_grad_norm = max_grad_norm

        self.scaler = torch.amp.GradScaler()

    def train_step(self, x, y):
        self.model.train()
        self.optimizer.zero_grad()

        with torch.autocast(device_type=self.device, dtype=torch.float16):
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)

        self.scaler.scale(loss).backward()
        self.scaler.unscale_(self.optimizer)
        nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.scaler.step(self.optimizer)
        self.scaler.update()
        return loss.item()

    def eval_step(self, x, y):
        self.model.eval()
        with torch.no_grad():
            y_pred = self.model(x)
            loss = self.criterion(y_pred, y)
        return loss.item()
    
    def save_checkpoint(self, filepath):
        state = dict()
        state['model_state'] = self.model.state_dict()
        state['optimizer_state'] = self.optimizer.state_dict()
        state['scaler_state'] = self.scaler.state_dict()
        torch.save(state, filepath)

    def load_checkpoint(self, filepath):
        state = torch.load(filepath, weights_only=True)
        self.model.load_state_dict(state['model_state'])
        self.optimizer.load_state_dict(state['optimizer_state'])
        self.scaler.load_state_dict(state['scaler_state'])

In [228]:
model.reset_parameters()
optimizer = torch.optim.SGD(model.parameters(), lr=0.2)

trainer = Trainer(model, optimizer, criterion, "cuda", max_grad_norm=1)

epochs = 20
for epoch in range(epochs):
    loss = trainer.train_step(data_norm, target)
    print(f"epoch: {epoch},\t loss:{loss:.5f},\t grad_norm: {get_grad_norm(model.parameters()):.4f},\t scale: {trainer.scaler.get_scale()}")

trainer.save_checkpoint('test_model.pth')

epoch: 0,	 loss:1.96366,	 grad_norm: nan,	 scale: 32768.0
epoch: 1,	 loss:1.96366,	 grad_norm: nan,	 scale: 16384.0
epoch: 2,	 loss:1.96366,	 grad_norm: 1.1882,	 scale: 16384.0
epoch: 3,	 loss:1.37341,	 grad_norm: 1.1647,	 scale: 16384.0
epoch: 4,	 loss:0.92736,	 grad_norm: 1.1062,	 scale: 16384.0
epoch: 5,	 loss:0.58079,	 grad_norm: 1.0607,	 scale: 16384.0
epoch: 6,	 loss:0.31622,	 grad_norm: 1.0559,	 scale: 16384.0
epoch: 7,	 loss:0.13184,	 grad_norm: 0.8981,	 scale: 16384.0
epoch: 8,	 loss:0.04815,	 grad_norm: 0.6959,	 scale: 16384.0
epoch: 9,	 loss:0.01793,	 grad_norm: 0.5388,	 scale: 16384.0
epoch: 10,	 loss:0.00717,	 grad_norm: 0.4185,	 scale: 16384.0
epoch: 11,	 loss:0.00324,	 grad_norm: 0.3245,	 scale: 16384.0
epoch: 12,	 loss:0.00183,	 grad_norm: 0.2530,	 scale: 16384.0
epoch: 13,	 loss:0.00132,	 grad_norm: 0.1985,	 scale: 16384.0
epoch: 14,	 loss:0.00114,	 grad_norm: 0.1600,	 scale: 16384.0
epoch: 15,	 loss:0.00107,	 grad_norm: 0.1282,	 scale: 16384.0
epoch: 16,	 loss:0.00105

In [229]:
trainer.load_checkpoint('test_model.pth')
trainer.eval_step(data_norm, target)

0.0010149055160582066