In [1]:
import math
import copy
import os
import time
import enum
import argparse
import polars as pl
# Visualization related imports
import matplotlib.pyplot as plt
import seaborn

# Deep learning related imports
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
from torch.hub import download_url_to_file

# Data manipulation related imports
# from torchtext.data import Dataset, BucketIterator, Field, Example
import spacy
import seaborn as sns

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Read the sampled parquet file into memory and display its (rows, columns) shape as a sanity check.
data1 = pl.read_parquet('data/sampling_data.parquet')
data1.shape

(1543873, 150)

In [4]:
from DTN_model import DTN_model
from utils import *

In [5]:
# Hyper-parameters handed to the DTN_model constructor in the next cell.
model_dimension, number_of_heads, number_of_layers = 256, 8, 6
number_of_var = 147
dropout_probability = 0.1
hidden_dimension_list = [512, 256, 64, 2]

In [6]:
# Instantiate the model from the hyper-parameters defined in the previous cell.
# Arguments are positional, so their order must match DTN_model's constructor.
MyModel = DTN_model(model_dimension, number_of_heads, number_of_layers, dropout_probability, number_of_var, hidden_dimension_list)

In [7]:
# Keep the instance under its own name: rebinding `processor` would replace the
# star-imported class with an instance, so re-running this cell would fail when it
# tries to call that instance.
data_processor = processor(['cycle','D24_int'], 500, 0)
battery_data = BatteryData('data/sampling_data.parquet', data_processor)
dataloader = DataLoader(dataset=battery_data,batch_size=8,shuffle=False,num_workers=2)

In [8]:
# Pull only the first batch for inspection. next() raises StopIteration on an empty
# loader instead of silently leaving X and y undefined for the cells that follow.
batch_idx, data_batch = next(enumerate(dataloader))
print(batch_idx)
X, y = data_batch

0


In [9]:
# Build the mask for the sample batch with the model's own helper.
# NOTE(review): presumably flags padded sequence positions — confirm in DTN_model.
src_mask = MyModel.get_key_padding_mask(X)

In [14]:
# Forward pass on the sample batch (inputs cast to float32), then show the output shape.
# NOTE(review): the mask is passed as-is here, whereas train_loop passes it through .float() — confirm which the model expects.
res = MyModel(X.float(), src_mask)
print(res.shape)

torch.Size([8, 500, 256])
torch.Size([8, 500, 2])


In [11]:
# Exponentiate the output vector at batch index 0, position 0.
res[0, 0, :].exp()

tensor([0.0018, 0.0020], grad_fn=<ExpBackward0>)

In [25]:
# All-ones tensor (default dtype) with the same shape as the labels.
tmp = torch.ones(y.size())

In [None]:
# Build the two-channel target [y, 1 - y] along a new dim 2 without rebinding `y`.
# Overwriting `y` with its unsqueezed form made this cell non-idempotent: on a second
# run `tmp - y` would broadcast against the extra axis and produce the wrong shape.
y2 = (tmp - y).unsqueeze(dim=2)
new_y = torch.cat([y.unsqueeze(dim=2), y2], dim=2)

In [19]:
# Scratch check of Tensor.expand: size-1 dims are broadcast to the requested sizes.
x = torch.randint(low=0, high=2, size=(1, 3, 1))
print(x.size())
x.expand(2, 3, 4).size()

torch.Size([1, 3, 1])


torch.Size([2, 3, 4])

In [None]:
class CustomLRAdamOptimizer:
    """
        Thin wrapper that overwrites the wrapped optimizer's learning rate on every step.

        Linear ramp learning rate for the warm-up number of steps and then start decaying
        according to the inverse square root law of the current training step number.

        Check out playground.py for visualization of the learning rate (visualize_custom_lr_adam).

        Args:
            optimizer: object exposing `param_groups` (dicts with an 'lr' entry),
                `step()` and `zero_grad()`.
            model_dimension: positive model width; the rate scales with model_dimension ** -0.5.
            num_of_warmup_steps: positive number of steps in the linear ramp.

        Raises:
            ValueError: if model_dimension or num_of_warmup_steps is not positive.
    """

    def __init__(self, optimizer, model_dimension, num_of_warmup_steps):
        # Fail fast: a zero value would otherwise surface later as a ZeroDivisionError
        # inside the schedule formula, and a negative one as a complex learning rate.
        if model_dimension <= 0:
            raise ValueError(f"model_dimension must be positive, got {model_dimension}")
        if num_of_warmup_steps <= 0:
            raise ValueError(f"num_of_warmup_steps must be positive, got {num_of_warmup_steps}")

        self.optimizer = optimizer
        self.model_size = model_dimension
        self.num_of_warmup_steps = num_of_warmup_steps

        self.current_step_number = 0

    def step(self):
        """Advance the step counter, push the new rate into every param group, then step."""
        self.current_step_number += 1
        current_learning_rate = self.get_current_learning_rate()

        for p in self.optimizer.param_groups:
            p['lr'] = current_learning_rate

        self.optimizer.step()  # apply gradients

    # Check out the formula at Page 7, Chapter 5.3 "Optimizer" and playground.py for visualization
    def get_current_learning_rate(self):
        """Return the scheduled learning rate for the current step (0.0 before the first step)."""
        # For readability purpose
        step = self.current_step_number
        warmup = self.num_of_warmup_steps

        # At step 0 the linear-ramp term is 0, so the minimum is 0; returning it directly
        # avoids evaluating 0 ** -0.5, which raises ZeroDivisionError.
        if step <= 0:
            return 0.0

        return self.model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))

    def zero_grad(self):
        """Delegate gradient reset to the wrapped optimizer."""
        self.optimizer.zero_grad()

In [14]:
# KL-divergence criterion; with reduction='batchmean' the summed loss is divided by the batch size.
kl_div_loss = nn.KLDivLoss(reduction='batchmean')

In [31]:
def train_loop(dataloader, model, loss_fn, optimizer, grad_store=None):
    """Run one pass over `dataloader`, updating `model` after every batch.

    Each label tensor `y` is expanded into a two-channel target `[y, 1 - y]`
    along a new dim 2 before being handed to `loss_fn`.

    Args:
        dataloader: iterable of `(X, y)` batches exposing `.dataset` (used for the
            progress message only).
        model: module called as `model(X.float(), mask)` and providing
            `get_key_padding_mask(X)`.
        loss_fn: callable `loss_fn(pred, target)` returning a scalar tensor.
        optimizer: object with `step()` and `zero_grad()`.
        grad_store: optional dict. When given, it receives a copy of every available
            parameter gradient from the most recent batch, keyed by parameter name.

    NOTE(review): `pred` is passed to `loss_fn` unchanged. Criteria such as
    nn.CrossEntropyLoss read dim 1 as the class axis — confirm this matches the
    model's output layout.
    """
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X.float(), model.get_key_padding_mask(X).float())
        # Match the prediction dtype so the criterion sees one floating type.
        labels = y.to(pred.dtype)
        target = torch.stack([labels, 1 - labels], dim=2)
        loss = loss_fn(pred, target)

        # Backpropagation
        loss.backward()
        if grad_store is not None:
            # Inspect the model that was passed in, not a notebook global; clone because
            # zero_grad() may reset or release the underlying gradient tensors.
            for name, param in model.named_parameters():
                if param.requires_grad and param.grad is not None:
                    grad_store[name] = param.grad.detach().clone()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [40]:
# Two candidate criteria; `loss` (cross-entropy) is the one handed to train_loop below.
# NOTE(review): nn.CrossEntropyLoss treats dim 1 as the class axis, while the model output
# printed earlier is (8, 500, 2) — confirm the class axis is where the criterion expects it.
kl_div_loss = nn.KLDivLoss(reduction='batchmean') 
loss = nn.CrossEntropyLoss()

In [38]:
# Plain Adam over all model parameters with a fixed learning rate.
optimizer = Adam(
    params=MyModel.parameters(),
    lr=1e-5,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [41]:
# Run one training pass over the loader using the cross-entropy criterion and the Adam optimizer defined above.
train_loop(dataloader,MyModel,loss,optimizer)

torch.Size([8, 500, 256])
loss: 1553.652048  [    8/ 4689]
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch.Size([8, 500, 256])
torch

KeyboardInterrupt: 

In [43]:
# Collect the gradient tensor of every trainable parameter that currently has one,
# keyed by parameter name, and echo each captured name.
grads = {}
for name, param in MyModel.named_parameters():
    if not param.requires_grad or param.grad is None:
        continue
    grads[name] = param.grad
    print(name)

In [49]:
# named_parameters() returns a generator of (name, parameter) pairs: iterate it, do not index it.
MyModel.named_parameters()

TypeError: 'generator' object is not subscriptable

In [47]:
# Print the name of every parameter that is flagged as trainable.
for name, param in MyModel.named_parameters():
    if not param.requires_grad:
        continue
    print(name)

encoder.layers.0.self_attn.in_proj_weight
Parameter containing:
tensor([[-0.0613, -0.0499,  0.0660,  ..., -0.0167,  0.0196,  0.0649],
        [ 0.0720, -0.0265, -0.0006,  ...,  0.0625,  0.0189, -0.0606],
        [-0.0776,  0.0046, -0.0683,  ..., -0.0514, -0.0284, -0.0500],
        ...,
        [ 0.0119, -0.0774, -0.0541,  ...,  0.0177, -0.0468, -0.0389],
        [-0.0117,  0.0650,  0.0629,  ...,  0.0553, -0.0410, -0.0227],
        [-0.0662, -0.0364, -0.0158,  ..., -0.0436,  0.0734,  0.0549]],
       requires_grad=True)
encoder.layers.0.self_attn.in_proj_bias
Parameter containing:
tensor([ 6.2636e-03,  6.2664e-03,  6.4567e-03,  6.2602e-03,  6.2358e-03,
        -6.2619e-03, -6.2814e-03, -6.2728e-03,  6.2460e-03,  6.2604e-03,
         6.6006e-03, -6.2635e-03, -6.2393e-03, -6.2552e-03, -6.2866e-03,
         6.2459e-03, -6.2666e-03, -6.2800e-03, -6.2712e-03, -6.2581e-03,
        -6.3332e-03,  6.2618e-03, -6.2602e-03,  6.2747e-03, -6.2568e-03,
        -6.2596e-03, -6.1983e-03,  6.2835e-03, -