In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import os
from math import *

# Make the project root the working directory so that the project-relative
# imports and paths used by the following cells resolve.
project_dir_name = "Abaqus-Hardening-Seq-2-Seq-Project"

print("The current directory is: ")
print(os.getcwd())
start_dir = os.getcwd()
if not start_dir.endswith(project_dir_name):
    # Walk up the directory tree until an ancestor ending in the project
    # folder name is found, so the notebook works from any depth below the root.
    path_parent = None
    candidate = start_dir
    while True:
        parent_dir = os.path.dirname(candidate)
        if parent_dir == candidate:
            # Reached the filesystem root without finding the project folder
            break
        candidate = parent_dir
        if candidate.endswith(project_dir_name):
            path_parent = candidate
            break
    if path_parent is None:
        # Fallback: keep the previous behaviour of moving up two directories
        # (e.g. when the repository was cloned under a different folder name).
        path_parent = os.path.dirname(os.path.dirname(start_dir))
    os.chdir(path_parent)
print("The current directory is: ")
print(os.getcwd())

The current directory is: 
/scratch/project_2008630/Abaqus-Hardening-Seq-2-Seq-Project/notebooks/CP1000_RD_20C
The current directory is: 
/scratch/project_2008630/Abaqus-Hardening-Seq-2-Seq-Project


In [3]:
from configs.chosen_project import *
from src.stage1_global_configs import *

# JSON file holding the global configuration of the chosen project
chosen_project_path = "configs/global_config_CP1000_RD_20C.json"

# Build the global configuration dictionary from that file
global_configs = main_global_configs(chosen_project_path)

# Pull out the two entries used by the rest of the notebook
all_paths, objectives = (global_configs[key] for key in ('all_paths', 'objectives'))


= Stage 1: Loading configs and all paths =

Welcome to Abaqus Seq2Seq flow curve calibration project

The configurations you have chosen: 

+--------------------------+-------------------------------------------------------------+
|      Global Configs      |                         User choice                         |
+--------------------------+-------------------------------------------------------------+
|         PROJECT          |                        CP1000_RD_20C                        |
|        OBJECTIVES        |      CHD2, CHD4, NDBR2p5, NDBR6, NDBR15, NDBR40, SH115      |
|       PROJECT_PATH       | /scratch/project_2008630/Abaqus-Hardening-Seq-2-Seq-Project |
|    TRAINING_DATA_PATH    |                 training_data/CP1000_RD_20C                 |
|         LOG_PATH         |                    log/CP1000_RD_20C.txt                    |
|       MODELS_PATH        |                     models/CP1000_RD_20C                    |
|  RESULTS_INIT_DATA_PATH  |            

# Loading the training data

In [4]:
import torch

training_data_path = all_paths['training_data_path']
models_path = all_paths['models_path']

# Directory holding the initial train/test tensors.
# NOTE(review): this is hard-coded rather than derived from training_data_path;
# confirm that this is intended.
initial_data_dir = "training_data/CP1000_RD_20C_divided_index_0"

# Load every tensor and convert it to float32 in one step
initial_train_source_sequence = torch.load(f"{initial_data_dir}/initial_train_source_original_all.pt").float()
initial_train_target_sequence_first = torch.load(f"{initial_data_dir}/initial_train_target_original_first.pt").float()
initial_test_source_sequence = torch.load(f"{initial_data_dir}/initial_test_source_original_all.pt").float()
initial_test_target_sequence_first = torch.load(f"{initial_data_dir}/initial_test_target_original_first.pt").float()

print(f"Shape of the training source sequence: {initial_train_source_sequence.shape}")
print(f"Shape of the training target sequence: {initial_train_target_sequence_first.shape}")
print(f"Shape of the testing source sequence: {initial_test_source_sequence.shape}")
print(f"Shape of the testing target sequence: {initial_test_target_sequence_first.shape}")

# Check if any of them has NaN or infinite values.
# torch.isnan / torch.isinf work on the tensors directly, so the checks do not
# rely on NumPy implicitly converting torch tensors.
sequences_to_check = {
    "train_source_sequence": initial_train_source_sequence,
    "train_target_sequence": initial_train_target_sequence_first,
    "test_source_sequence": initial_test_source_sequence,
    "test_target_sequence": initial_test_target_sequence_first,
}
for check_label, check_fn in (("NaN", torch.isnan), ("infinite", torch.isinf)):
    for sequence_name, sequence in sequences_to_check.items():
        print(f"Number of {check_label} values in {sequence_name}: {check_fn(sequence).sum()}")

# Ensure that all target_sequence are positive
print(f"Number of negative values in train_target_sequence: {(initial_train_target_sequence_first < 0).sum()}")
print(f"Number of negative values in test_target_sequence: {(initial_test_target_sequence_first < 0).sum()}")

Shape of the training source sequence: torch.Size([192, 100, 7])
Shape of the training target sequence: torch.Size([192, 1, 1])
Shape of the testing source sequence: torch.Size([64, 100, 7])
Shape of the testing target sequence: torch.Size([64, 1, 1])
Number of NaN values in train_source_sequence: 0
Number of NaN values in train_target_sequence: 0
Number of NaN values in test_source_sequence: 0
Number of NaN values in test_target_sequence: 0
Number of infinite values in train_source_sequence: 0
Number of infinite values in train_target_sequence: 0
Number of infinite values in test_source_sequence: 0
Number of infinite values in test_target_sequence: 0
Number of negative values in train_target_sequence: 0
Number of negative values in test_target_sequence: 0


In [5]:
# Sanity check on the source scale: show the feature vector of the first
# time step of the first training sample
print(initial_train_source_sequence[0, 0])

tensor([11.5942,  9.5506,  9.6752,  9.4244,  9.1070,  8.7356,  1.3993])


In [6]:
# Sanity check on the target scale: show the first target entry of the
# first training sample
print(initial_train_target_sequence_first[0, 0])

tensor([807.0139])


In [7]:
# Define the device on which the model is trained

training_env = "CSC" # Choose between "local" and "CSC"
if training_env == "local":
    # Local runs stay on the CPU; a torch.device is used so that `device`
    # has the same type in both branches.
    device = torch.device('cpu')
else:
    # Any other environment uses the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


Note: unlike the training of the bidirectional LSTM, the RMSE loss used for the Transformer does not need to be weighted.

# Start training the model

In [12]:
from optimization.transformer_helper import *
from optimization.transformer import *

def train_transformer(previous_model, current_model, previous_best_test_loss, num_epochs,
                       dropout, learning_rate, weight_decay):
    """Train, or continue training, the Transformer encoder on the initial data set.

    The function reads these notebook-level names: the tensors
    ``initial_train_source_sequence``, ``initial_train_target_sequence_first``,
    ``initial_test_source_sequence`` and ``initial_test_target_sequence_first``,
    plus ``models_path`` and ``device``.

    Args:
        previous_model: Tag of an earlier run whose ``best_model_<tag>.pth``
            weights are loaded before training, or ``None`` to start from
            freshly initialised weights.
        current_model: Tag used in the names of all files written by this run.
        previous_best_test_loss: Test loss that has to be beaten before a new
            best checkpoint is written when resuming. Ignored when
            ``previous_model`` is ``None``; ``None`` means "no baseline" and is
            treated as infinity.
        num_epochs: Number of passes over the training set.
        dropout: Dropout value forwarded to ``TransformerEncoder``.
        learning_rate: Learning rate of the AdamW optimizer.
        weight_decay: Weight decay of the AdamW optimizer.

    Returns:
        None. Files are written under ``{models_path}/transformer/initial/``:
        ``best_model_<current_model>.pth`` (only when the test loss improves
        on the baseline), ``last_model_<current_model>.pth`` and the per-epoch
        ``train_losses_<current_model>.npy`` / ``test_losses_<current_model>.npy``.

    Notes:
        A new optimizer is created on every call, so only the model weights
        (not the optimizer state) carry over when resuming. The reported
        losses are the mean of the per-batch criterion values.
    """

    train_dataset = TensorDataset(initial_train_source_sequence, initial_train_target_sequence_first)
    test_dataset = TensorDataset(initial_test_source_sequence, initial_test_target_sequence_first)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Parameters
    _, source_len, feature_size = initial_train_source_sequence.shape
    _, label_size, _ = initial_train_target_sequence_first.shape

    d_model = 256
    n_heads = 16 # rule of thumb: d_model/n_heads = 16 or 32
    num_layers = 4
    dim_feedforward = 1024
    # a larger feedforward dimension (often 2-4 times d_model) helps the model to process
    # and transform the information more effectively

    activation_name = "relu" # "relu" or "gelu"
    pos_enc_type="fixed" # "fixed" or "learnable"
    encoder_layer_type="LayerNorm" # "LayerNorm" or "BatchNorm"

    # Initialize model, loss function, and optimizer
    # (TransformerEncoder and RMSELoss are not defined in this notebook;
    # presumably they come from the optimization.* star imports)

    model = TransformerEncoder(feature_size, label_size, source_len,
                     d_model, n_heads, num_layers, dim_feedforward,
                     activation_name, pos_enc_type, encoder_layer_type,
                     dropout=dropout).to(device)

    criterion = RMSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)  # Adding L2 regularization

    # Count the number of parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f'The model has {total_params} parameters.')

    # All artefacts of this run live in one directory; create it up front so
    # that the first checkpoint save cannot fail on a missing folder.
    output_dir = f"{models_path}/transformer/initial"
    os.makedirs(output_dir, exist_ok=True)

    if previous_model is not None:
        # Loading the best model from the previous training.
        # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
        model.load_state_dict(torch.load(f"{output_dir}/best_model_{previous_model}.pth",
                                         map_location=device))
        # Without a baseline the first evaluated epoch becomes the best one
        best_test_loss = float('inf') if previous_best_test_loss is None else previous_best_test_loss
    else:
        best_test_loss = float('inf')

    # Track the best model
    best_model_path = f"{output_dir}/best_model_{current_model}.pth"

    # Lists to track train and test losses
    train_losses = []
    test_losses = []

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for source_batch, target_batch in train_loader:
            source_batch, target_batch = source_batch.to(device), target_batch.to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(source_batch)
            loss = criterion(outputs, target_batch)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        # Evaluate on test set
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for source_batch, target_batch in test_loader:
                source_batch, target_batch = source_batch.to(device), target_batch.to(device)

                # Forward pass
                outputs = model(source_batch)
                loss = criterion(outputs, target_batch)

                test_loss += loss.item()

        test_loss /= len(test_loader)
        test_losses.append(test_loss)

        # Save the best model (the weights on disk are the record of the best epoch)
        if test_loss < best_test_loss:
            print(f"New best test loss found: {test_loss}")
            best_test_loss = test_loss
            torch.save(model.state_dict(), best_model_path)

        # Print progress
        if (epoch+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.8f}, Test Loss: {test_loss:.8f}')

    # Save the train and test loss lists as .npy files
    np.save(f'{output_dir}/train_losses_{current_model}.npy', np.array(train_losses))
    np.save(f'{output_dir}/test_losses_{current_model}.npy', np.array(test_losses))

    # Weights after the final epoch, regardless of whether they are the best ones
    torch.save(model.state_dict(), f"{output_dir}/last_model_{current_model}.pth")

    print('Training complete')
    print(f'Best model saved with test loss: {best_test_loss:.4f}')

In [14]:
# Run 1: train from freshly initialised weights (no previous checkpoint)
train_transformer(
    previous_model=None,
    current_model="1_train_original",
    previous_best_test_loss=None,
    num_epochs=1000,
    dropout=0.01,
    learning_rate=1e-4,
    weight_decay=1e-3,
)



The model has 3186689 parameters.
New best test loss found: 869.542236328125
New best test loss found: 862.4310913085938
New best test loss found: 855.6124267578125
New best test loss found: 848.6881713867188
New best test loss found: 841.6080322265625
New best test loss found: 834.3837280273438
New best test loss found: 827.0298461914062
New best test loss found: 819.55859375
New best test loss found: 811.9977416992188
New best test loss found: 804.3422241210938
New best test loss found: 796.5868530273438
New best test loss found: 788.7526245117188
New best test loss found: 780.8709106445312
New best test loss found: 772.9435424804688
New best test loss found: 764.95751953125
New best test loss found: 756.939453125
New best test loss found: 748.8925170898438
New best test loss found: 740.811279296875
New best test loss found: 732.7067260742188
New best test loss found: 724.58154296875
New best test loss found: 716.4357299804688
New best test loss found: 708.272705078125
New best test 

In [17]:
# Run 2: continue from the best checkpoint of run "1_train_original" with a
# ten times smaller learning rate.
# NOTE(review): the baseline loss is a hand-copied number, presumably the best
# test loss printed by run 1 - confirm before re-running.
train_transformer(
    previous_model="1_train_original",
    current_model="2_train_original",
    previous_best_test_loss=6.7293171882629395,
    num_epochs=1000,
    dropout=0.01,
    learning_rate=1e-5,
    weight_decay=1e-3,
)

The model has 3186689 parameters.
New best test loss found: 6.645035743713379
New best test loss found: 6.617631435394287
New best test loss found: 6.571237564086914
New best test loss found: 6.53059196472168
Epoch [100/1000], Train Loss: 6.04089912, Test Loss: 6.88534689
New best test loss found: 6.502954483032227
New best test loss found: 6.458877086639404
New best test loss found: 6.442684650421143
New best test loss found: 6.428459644317627
New best test loss found: 6.391922473907471
Epoch [200/1000], Train Loss: 5.67611535, Test Loss: 6.39192247
New best test loss found: 6.382574081420898
New best test loss found: 6.374117374420166
New best test loss found: 6.349586009979248
New best test loss found: 6.334413051605225
New best test loss found: 6.279034614562988
Epoch [300/1000], Train Loss: 6.37706168, Test Loss: 6.40292931
New best test loss found: 6.228068828582764
Epoch [400/1000], Train Loss: 6.95079374, Test Loss: 7.30788994
New best test loss found: 6.136126518249512
New bes

In [18]:
# Run 3: continue from the best checkpoint of run "2_train_original" with the
# learning rate halved again.
# NOTE(review): the baseline loss is a hand-copied number, presumably the best
# test loss printed by run 2 - confirm before re-running.
train_transformer(
    previous_model="2_train_original",
    current_model="3_train_original",
    previous_best_test_loss=5.9606032371521,
    num_epochs=1000,
    dropout=0.01,
    learning_rate=5e-6,
    weight_decay=1e-3,
)

The model has 3186689 parameters.
New best test loss found: 5.9406280517578125
Epoch [100/1000], Train Loss: 5.62165165, Test Loss: 7.15306950
Epoch [200/1000], Train Loss: 5.20612129, Test Loss: 6.56539202
New best test loss found: 5.932235240936279
Epoch [300/1000], Train Loss: 5.47065004, Test Loss: 6.19735193
Epoch [400/1000], Train Loss: 5.38592275, Test Loss: 5.97682095
Epoch [500/1000], Train Loss: 5.10699113, Test Loss: 6.13023663
New best test loss found: 5.930030345916748
New best test loss found: 5.874922752380371
New best test loss found: 5.863852500915527
Epoch [600/1000], Train Loss: 5.16608810, Test Loss: 6.53319931
New best test loss found: 5.848926067352295
Epoch [700/1000], Train Loss: 5.52058411, Test Loss: 6.24155807
Epoch [800/1000], Train Loss: 5.56987302, Test Loss: 6.05510569
Epoch [900/1000], Train Loss: 4.81442587, Test Loss: 6.46617746
Epoch [1000/1000], Train Loss: 4.77395010, Test Loss: 6.60230255
Training complete
Best model saved with test loss: 5.8489
