In [1]:
# Settings for autoreloading

%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random
import torch

def set_random_seed(random_seed: int = 42) -> None:
    """Set the random seed for reproducibility. The seed is set for the random library, the numpy library and the pytorch 
    library. Moreover the environment variable `TF_DETERMINISTIC_OPS` is set as "1".

    Parameters
    ----------
    random_seed : int, optional
        The random seed to use for reproducibility (default 42).
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed()

  from .autonotebook import tqdm as notebook_tqdm


https://github.com/LMissher/STGNN/blob/main/model.py

In [3]:
import torch

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
print(DEVICE)

cuda


In [5]:
import os
from torch_geometric_temporal.dataset.pems_bay import PemsBayDatasetLoader
ds = PemsBayDatasetLoader(os.path.join('..', 'data'))

In [6]:
import numpy as np
node_values = np.load('../data/pems_node_values.npy')

In [7]:
print(node_values.shape)

(52105, 325, 2)


In [8]:
import numpy as np
adj = np.load('../data/pems_adj_mat.npy')

In [9]:
print(adj.shape)

(325, 325)


In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(node_values, test_size=.2, shuffle=False)

In [11]:
print(x_train.shape, x_test.shape)

(41684, 325, 2) (10421, 325, 2)


In [12]:
T = 12

In [13]:
def window_stack(a, stepsize=T):
    
    n_instances, n_nodes, n_features = a.shape
    resulting_batches = n_instances // stepsize
    return np.stack(
        [a[i * stepsize : i * stepsize + stepsize].reshape(n_nodes, n_features, stepsize)
         for i in range(resulting_batches)] )
#slided_x_train = sliding_window_view(x_train, 12, axis=0)
#x_train, y_train = np.array([x for x in x_train[::T]]), np.array([x for x in x_train[T::T]])
x_train = window_stack(x_train)
x_test = window_stack(x_test)

In [14]:
print(x_train.shape)

(3473, 325, 2, 12)


In [15]:
x_train, y_train = np.array([x for x in x_train[::2]]), np.array([x for x in x_train[1::2]])
x_test, y_test = np.array([x for x in x_test[::2]]), np.array([x for x in x_test[1::2]])

In [16]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
x_train = x_train[:-1]
print(x_train.shape, y_train.shape)

(1737, 325, 2, 12) (1736, 325, 2, 12)
(434, 325, 2, 12) (434, 325, 2, 12)
(1736, 325, 2, 12) (1736, 325, 2, 12)


In [17]:

import math
import torch
import torch.nn as nn
from typing import Tuple

class S_GNN(nn.Module):
    def __init__(self, input_dim: int, A_hat: torch.Tensor, hidden_dim: int = 64) -> None:
        super().__init__()
        # Module to obtain the latent representation of the input.
        # TODO: check if the latent representation is ok
        self.latent_encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            #nn.Linear(hidden_dim, input_dim)
        )
        # Linear layer to model the spatial feature extraction.
        self.linear = nn.Linear(input_dim, input_dim)
        self.A_hat = A_hat
    
    def forward(self, x: torch.Tensor):
        # Get the latent representation of the input.
        p = self.latent_encoder(x)

        # Apply score function.
        score = p @ p.transpose(-1, -2)
        # Pair-wise relation between any road node. Note, for stability of exp see https://effectivemachinelearning.com/PyTorch/7._Numerical_stability_in_PyTorch
        # TODO: Are the phi in the
        #print('exp', torch.relu(score).exp())
        score = torch.sigmoid(score)
        exp = torch.exp(score - torch.max(score))
        R = exp / exp.sum()
        
        #R = torch.relu(score).exp() / torch.relu(score).exp().sum()
        # TODO: A_hat should probably be provided by the model since it is just a refined adjacency matrix, unless we pass the adjacency matrix as an input
        #A_hat = p + torch.eye(p.shape[0], p.shape[1], device=x.device)
        # Get refined adjacency matrix: A_hat = A + I
        #A_hat = torch.rand(x.shape[0], x.shape[0], device=DEVICE) #R # TODO: CHANGE
        # Get the sparsified relation matrix
        A_hat = self.A_hat.expand_as(R)
        #print(A_hat.shape)
        #print(R.shape)
        R_hat = R * (A_hat > 0).float() 
        R_hat += torch.eye(R_hat.shape[-2], R_hat.shape[-1], device=x.device)
        #print('R_hat', R_hat.shape)
        # R_hat = torch.mul((self.A_hat > 0).float(), R) + torch.eye(R.shape[0], R.shape[1], device=x.device) #torch.eye(p.shape[0], p.shape[1], device=x.device)
        # Get refined degree matrix for R_hat
        D_hat = (R_hat.sum(-1) ** -.5)
        #D_hat[torch.isinf(D_hat)] = 0.
        D_hat = torch.diag_embed(D_hat)
        #print('D-hat', D_hat)

        # TODO: handle infinities and nones
        A = D_hat @ R_hat @ D_hat

        out = torch.relu(self.linear(A @ x))
        #print(A, x)
        return out

'''class GRU(nn.Module):
    def __init__(self, n_input_features: int, n_hidden_state_features: int) -> None:
        super().__init__()
        # Update gate layers.
        self.z_x_linear = nn.Linear(n_input_features, n_hidden_state_features)
        self.z_h_linear = nn.Linear(n_hidden_state_features, n_hidden_state_features)
        # Reset gate layers.
        self.r_x_linear = nn.Linear(n_input_features, n_hidden_state_features)
        self.r_h_linear = nn.Linear(n_hidden_state_features, n_hidden_state_features)
        # State gate layers.
        self.h_x_linear = nn.Linear(n_input_features, n_hidden_state_features)
        self.h_h_linear = nn.Linear(n_hidden_state_features, n_hidden_state_features)

    def forward(self, x: torch.Tensor, h: torch.Tensor):
        # Update Gate.
        z_x = self.z_x_linear(x)
        z_h = self.z_h_linear(h)
        z_t = torch.sigmoid(z_x + z_h)
        
        # Reset Gate.
        r_x = self.r_x_linear(x)
        r_h = self.r_h_linear(h)
        r_t = torch.sigmoid(r_x + r_h)
        
        # State gate.
        h_x = self.h_x_linear(x)
        h_h = self.h_h_linear(h)
        h_t = torch.tanh(h_x + r_t * h_h)
        
        # Get GRU output.
        out = (1 - z_t) * h_t + z_t * h
        return out'''
        
class GRU(nn.Module):
    def __init__(self, n_input_features: int, n_hidden_state_features: int) -> None:
        super().__init__()
        # Update gate layers.
        self.z_x_linear = nn.Linear(n_input_features, n_hidden_state_features)
        self.z_h_linear = nn.Linear(n_hidden_state_features, n_hidden_state_features)
        # Reset gate layers.
        self.r_x_linear = nn.Linear(n_input_features, n_hidden_state_features)
        self.r_h_linear = nn.Linear(n_hidden_state_features, n_hidden_state_features)
        # State gate layers.
        self.h_x_linear = nn.Linear(n_input_features, n_hidden_state_features)
        self.h_h_linear = nn.Linear(n_hidden_state_features, n_hidden_state_features)
        
        #self.out = nn.Linear(n_hidden_state_features, n_input_features)

    def forward(self, x: torch.Tensor, h: torch.Tensor):
        # Update Gate.
        z_x = self.z_x_linear(x)
        z_h = self.z_h_linear(h)
        z_t = torch.sigmoid(z_x + z_h)
        
        # Reset Gate.
        r_x = self.r_x_linear(x)
        r_h = self.r_h_linear(h)
        r_t = torch.sigmoid(r_x + r_h)
        
        # State gate.
        h_x = self.h_x_linear(x)
        h_h = self.h_h_linear(h)
        h_t = torch.tanh(h_x + r_t * h_h)
        
        # Get GRU output.
        out = (1 - z_t) * h_t + z_t * h
        return out

class Transformer(nn.Module):
    def __init__(self, n_nodes: int, len_timeseries: int, hidden_dimension: int, n_heads: int = 4) -> None:
        super().__init__()
        #_, n_features = input_size
        self.queries_linear = nn.Linear(len_timeseries, len_timeseries)
        self.keys_linear = nn.Linear(len_timeseries, len_timeseries)
        self.values_linear = nn.Linear(len_timeseries, len_timeseries)
        #self.multi_head_attention = nn.MultiheadAttention(n_features, n_heads)
        self.multi_head_attention_list = nn.ModuleList([nn.MultiheadAttention(len_timeseries, n_heads) for _ in range(n_nodes)])
        self.normalization = nn.LayerNorm(len_timeseries)
        self.normalization_out = nn.LayerNorm(len_timeseries)
        self.feed_forward = nn.Sequential(
            nn.Linear(len_timeseries, hidden_dimension),
            nn.ReLU(),
            nn.Linear(hidden_dimension, len_timeseries)
        )
        
    #def _attention(Q, K, V):
    #    d = K.shape[-1]
    #    return torch.softmax(((Q @ K.transpose(1, 0)) / d ** .5) @ V)
        
    def forward(self, x: torch.Tensor):
        # Get queries, keys and values.
        #Q = x.clone() #self.queries_linear(x)
        #K = x.clone() #self.keys_linear(x)
        #V = x.clone() #self.values_linear(x)
        Q = self.queries_linear(x)
        K = self.keys_linear(x)
        V = self.values_linear(x)
        
        # Multi head attention mechanism.
        out = []
        for i in range(x.shape[1]):
            out_, _ = self.multi_head_attention_list[i](Q[:, i], K[:, i], V[:, i])
            out.append(out_)
        out = torch.stack(out, 1)
        
        # Apply residual connection and batch normalization.
        out += x
        norm = self.normalization(out)
        
        # Apply feed forward module.
        out = self.feed_forward(norm)
        
        # Apply residual connection and batch normalization.
        out += norm
        return self.normalization_out(out)
    
class PositionalEncoding(nn.Module):
    def __init__(self, n_features: int, len_timeseries: int) -> None:
        super().__init__()
        pe = torch.zeros(n_features, len_timeseries)
        
        position = torch.arange(len_timeseries)#.unsqueeze(1)
        div_term = torch.exp(torch.arange(n_features) * (-math.log(10000.0) / n_features)).unsqueeze(1)
        pe[:, 0::2] = torch.sin(position * div_term)[:, 0::2]
        pe[:, 1::2] = torch.cos(position * div_term)[:, 1::2]
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + torch.autograd.Variable(self.pe, requires_grad=False)

In [18]:
class SpatioTemporalGNN(nn.Module):
    def __init__(self, n_nodes, n_features, len_timeseries, adj: np.ndarray, device: str, hidden_dimension = 64):
        super().__init__()
        adj = torch.tensor(adj, dtype=torch.float32, requires_grad=False, device=device)
        A_hat = adj + torch.eye(adj.shape[0], adj.shape[1], device=device)
        # self.start_emb = nn.Linear(infea, outfea)
        # self.end_emb = nn.Linear(outfea, infea)
        self.s_gnns = nn.ModuleList([S_GNN(n_features, A_hat) for _ in range(len_timeseries)])
        self.hidden_s_gnns = nn.ModuleList([S_GNN(hidden_dimension, A_hat) for _ in range(len_timeseries -1)])
        self.grus = nn.ModuleList([GRU(n_features, hidden_dimension) for _ in range(len_timeseries)])
        self.decoders = nn.ModuleList([nn.Linear(hidden_dimension, n_features) for _ in range(len_timeseries)])
        self.positional_encoding = PositionalEncoding(n_features, len_timeseries)
        #self.transformers = nn.ModuleList([Transformer(n_nodes, hidden_dimension, hidden_dimension=hidden_dimension) for _ in range(len_timeseries)])
        self.transformer = Transformer(n_nodes, len_timeseries, hidden_dimension=hidden_dimension)
        
        self.prediction_layer = nn.Sequential(
            nn.Linear(len_timeseries, len_timeseries),
            nn.ReLU(),
            nn.Linear(len_timeseries, len_timeseries)
            #nn.Upsample(size=(1, output_graphs, n_nodes, n_features))
        )
        self.n_nodes = n_nodes
        self.n_features = n_features
        self.len_timeseries = len_timeseries
        self.hidden_dimension = hidden_dimension
        self.to(device)

    def forward(self, x):
        # x = x.unsqueeze(-1)
        # x = self.start_emb(x)
        sgnn_outs = []
        for i in range(self.len_timeseries):
            timestamp = x[:, :, :, i]
            #print(timestamp.shape)
            x_ = self.s_gnns[i](timestamp)
            if i > 0:
                hidden_state = sgnn_outs[i-1] 
            else:
                batch_size, n_nodes, n_features, _ = x.shape
                hidden_state = torch.zeros(
                    (batch_size, n_nodes, self.hidden_dimension),
                    device=DEVICE)
            x_ = self.grus[i](x_, hidden_state)
            sgnn_outs.append(x_)
            if i < self.len_timeseries:
                self.hidden_s_gnns[i-1](x_)
        
        #TODO: see how to handle this 
        sgnn_outs = [self.decoders[i](x_) for i, x_ in enumerate(sgnn_outs)]
                
        #x = self.positional_encoding(x)
        
        # TODO: stack row-wise and pass to the transformer
        #print(sgnn_outs[0].shape)
        # Stack the GRU outputs row-wise for each node
        out = torch.stack(sgnn_outs, -1)
        #print(out.shape)
        out = self.positional_encoding(out)
        # TODO: The weights of the transformer seem to be shared, pass just subsets of nodes to a single layer.
        #transformer_outs = []
        #for i in range(self.len_timeseries):
        #    transformer_outs.append(self.transformers[i](out[:, i]))
        #out = torch.stack(transformer_outs, 1)
        out = self.transformer(out)
        #print(out.shape)
        
        out = self.prediction_layer(out)
        #print('in shape', x.shape)
        #print('out shape', out.shape)
        #print(out.shape)
        #print(out.squeeze(-1).shape)
        #print(out.shape)
        return out

In [19]:
#print(x_train.shape)

In [20]:
model = SpatioTemporalGNN(325, 2, T, adj, DEVICE) #.to(DEVICE) #STGNN(2, 4*16, 1, 16).to(device) #SpatioTemporalGNN(325, 2, T, adj, DEVICE)#.to(DEVICE)

In [21]:
model

SpatioTemporalGNN(
  (s_gnns): ModuleList(
    (0): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=2, out_features=64, bias=True)
      )
      (linear): Linear(in_features=2, out_features=2, bias=True)
    )
    (1): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=2, out_features=64, bias=True)
      )
      (linear): Linear(in_features=2, out_features=2, bias=True)
    )
    (2): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=2, out_features=64, bias=True)
      )
      (linear): Linear(in_features=2, out_features=2, bias=True)
    )
    (3): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=2, out_features=64, bias=True)
      )
      (linear): Linear(in_features=2, out_features=2, bias=True)
    )
    (4): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=2, out_features=64, bias=True)
      )
      (linear): Linear(in_features=2, out_features=2, bias

In [22]:
from torch.utils.data.dataloader import DataLoader, Dataset, T_co

class TimeSeriesDataset(Dataset):
    def __init__(self, x: np.ndarray, y: np.ndarray) -> None:
        self.x = x
        self.y = y
        self.len = x.shape[0]
    def __getitem__(self, index) -> T_co:
        return self.x[index], self.y[index]
    def __len__(self) -> int:
        return self.len

train_set = TimeSeriesDataset(x_train, y_train)
#test_set = TimeSeriesDataset(x_test, y_test)

train_dataloader = DataLoader(train_set, batch_size=64, shuffle=True)

In [23]:
#model(next(iter(train_dataloader))[0].float().to(DEVICE))

In [24]:
from torchviz import make_dot

def visualize_network(model: SpatioTemporalGNN, dataloader: DataLoader, device: str):
    torch.cuda.empty_cache()
    x, y = next(iter(dataloader))
    x = x.type(torch.float32).to(device=device)
    #y = y.type(torch.float32).to(device=device)
    y_hat = model(x)
    make_dot(y_hat.mean(), params=dict(model.named_modules())).render('Spatial Temporal GNN', format='png')
    
    torch.cuda.empty_cache()
    #plt.show()

In [25]:
#model.named_modules()

In [26]:
torch.cuda.empty_cache()
#visualize_network(model, train_dataloader, DEVICE)

In [27]:
from time import time
from typing import Callable

# create a nn class (just-for-fun choice :-) 
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self,yhat,y):
        r = torch.sqrt(self.mse(yhat,y))
        #print(r)
        return(r)

#criterion = RMSELoss()
# loss = criterion(yhat,y)

def loss(y_pred, y_true):
    # TODO: transform that in Mean absolute error per timeseries, dim=...
    l = torch.linalg.norm(y_pred - y_true, ord=1, dim=-2)
    return torch.mean(l)

def train(train_dataloader: DataLoader, model: SpatioTemporalGNN, 
          optimizer: torch.optim.Optimizer, lr_scheduler, loss_function: Callable, device: str, epochs: int = 10) -> None:
          #steps_validate: int = 100, checkpoint: Optional[Checkpoint] = None, early_stopping: Optional[EarlyStopping] = None, 
          #reload_best_weights: bool = True) -> None:
    train_loss_history = []
    val_loss_history = []
    val_f1_macro_history = []
    
    rmse_loss = RMSELoss()

    # Total steps to perform
    # tot_steps = len(train_dataloader) * epochs
    # Number of step already done
    n_steps = 0
    
    model.train()

    # Iterate across the epochs
    for epoch in range(epochs):
        # Set up display element
        #disp = display('', display_id=True)

        # Remove unused tensors from gpu memory
        torch.cuda.empty_cache()

        # Initialize running losses
        running_loss = 0.0
        running_rmse = 0.0
        
        optimizer.zero_grad()

        start_time = time()

        # Number of batches for the current update step
        # batch_steps = 0

        for batch_idx, data in enumerate(train_dataloader, 0):
            # Increment the number of batch steps
            batch_steps = batch_idx + 1

            # Get the data
            x, y = data
            x = x.type(torch.float32).to(device=device)
            y = y.type(torch.float32).to(device=device)

            # Compute output
            output = model(x)
            
            # Loss
            loss = loss_function(output, y)
            rmse = rmse_loss(output, y)
            running_loss += loss.item()
            running_rmse += rmse.item()

            optimizer.zero_grad()
            #nb_tr_steps += 1
            loss.backward()
            
            # When using GPU
            optimizer.step()

            # Evaluate on validation set
            '''if batch_idx % steps_validate == steps_validate - 1:
                model.eval()
                torch.cuda.empty_cache()

                # Compute both the token importances validation loss and the answer generation validation loss
                val_loss, val_f1_macro = _loss_validate(model, val_dataloader, loss_function, device)
                
                # Update validation loss history
                val_loss_history.append([n_steps, val_loss.item()])
                val_f1_macro_history.append([n_steps, val_f1_macro])

                torch.cuda.empty_cache()
                
                if checkpoint is not None:
                    checkpoint.save_best(val_f1_macro, train_loss_history=train_loss_history,
                                         val_loss_history=val_loss_history, val_f1_macro_history=val_f1_macro_history)
                if early_stopping is not None:
                    early_stopping.update(val_f1_macro)
                    if early_stopping.is_stop_condition_met():
                        print('Early stopping')
                        return train_loss_history, val_loss_history, val_f1_macro_history

                model.train()'''


            # Update training history and print           
            train_loss_history.append(loss.detach().cpu())
            
            epoch_time = time() - start_time
            batch_time = epoch_time / batch_steps
            
            # TODO: function to print batch string
            print(
                f'epoch: {epoch + 1}/{epochs}',
                f'[{batch_steps}/{len(train_dataloader)}],',
                f'{epoch_time:.0f}s {batch_time * 1e3:.0f}ms/step,',
                #f'lr base: {optimizer.param_groups[0]["lr"]:.3g} lr head: {optimizer.param_groups[1]["lr"]:.3g}, ' +
                f'loss: {running_loss / batch_steps:.3g},',
                f'RMSE: {running_rmse / batch_steps:.3g},',
                f'lr: {optimizer.param_groups[0]["lr"]:.3g} weight decay: {optimizer.param_groups[0]["weight_decay"]}',
                '               ' if batch_steps < len(train_dataloader) else '',
                end='\r' if batch_steps < len(train_dataloader) else None,
                )

            n_steps += 1

        model.eval()
        torch.cuda.empty_cache()
        # Compute both the token importances validation loss and the answer generation validation loss
        #val_loss, val_f1_macro = _loss_validate(model, val_dataloader, loss_function, device, print_result=False)
        # Update validation loss history
        #val_loss_history.append([n_steps, val_loss.item()])
        #val_f1_macro_history.append([n_steps, val_f1_macro])

        torch.cuda.empty_cache()

        print('-----------------------------------------------------------------------------------------------')
        print(
            f'epoch: {epoch + 1}/{epochs},',
            f'{epoch_time:.0f}s,',
            f'train loss: {running_loss / batch_steps:.3g},',
            f'train RMSE: {running_rmse / batch_steps:.3g},',
            f'lr: {optimizer.param_groups[0]["lr"]:.3g} weight decay: {optimizer.param_groups[0]["weight_decay"]}'
            #f'loss: {running_loss / batch_steps:.3g} val loss:, {val_loss.mean():.3g}, ' + 
            #f'val f1 macro: {val_f1_macro * 100:.3g} %'
            )
        print('===============================================================================================')
        
        '''if checkpoint is not None:
            checkpoint.save_best(val_f1_macro, train_loss_history=train_loss_history, val_loss_history=val_loss_history,
                            val_f1_macro_history=val_f1_macro_history)

        if early_stopping is not None:
            early_stopping.update(val_f1_macro)
            if early_stopping.is_stop_condition_met():
                print('Early stopping')
                return train_loss_history, val_loss_history, val_f1_macro_history''';
        model.train()
        lr_scheduler.step(running_rmse / batch_steps)

    '''if checkpoint is not None and reload_best_weights:
        _ = checkpoint.load_best()''';

    model.eval()
    return np.array(train_loss_history), np.array(val_loss_history), np.array(val_f1_macro_history)

    

In [28]:
print(1e-3)

0.001


In [29]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=False, threshold=0.001, threshold_mode='rel', cooldown=0, min_lr=2e-6, eps=1e-08)

In [30]:
train(train_dataloader=train_dataloader, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, loss_function=loss, device=DEVICE, epochs=1_000)

epoch: 1/1000 [28/28], 109s 3897ms/step, loss: 60, RMSE: 42.8, lr: 0.01 weight decay: 0.01 1                
-----------------------------------------------------------------------------------------------
epoch: 1/1000, 109s, train loss: 60, train RMSE: 42.8, lr: 0.01 weight decay: 0.01
epoch: 2/1000 [28/28], 106s 3789ms/step, loss: 31.9, RMSE: 23.3, lr: 0.01 weight decay: 0.01                
-----------------------------------------------------------------------------------------------
epoch: 2/1000, 106s, train loss: 31.9, train RMSE: 23.3, lr: 0.01 weight decay: 0.01
epoch: 3/1000 [28/28], 106s 3790ms/step, loss: 6.02, RMSE: 7.29, lr: 0.01 weight decay: 0.01                
-----------------------------------------------------------------------------------------------
epoch: 3/1000, 106s, train loss: 6.02, train RMSE: 7.29, lr: 0.01 weight decay: 0.01
epoch: 4/1000 [28/28], 106s 3802ms/step, loss: 5.24, RMSE: 6.94, lr: 0.01 weight decay: 0.01               
------------------------

KeyboardInterrupt: 