In [1]:
import os, os.path
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
from models import Transformer
import dataloader

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import  DataLoader, random_split
import numpy as np
import matplotlib
matplotlib.use('AGG')
import matplotlib.pyplot as plt

In [2]:
window_size = 10
batch_size= 1024
learning_rate= 0.0001
num_epochs= 300
input_size= 1

path_train = 'dataset/structured_logs/train_data'
vocab2idx, log_dict = dataloader.to_idx(path_train)

vocab_sz = len(vocab2idx)

data = dataloader.generate_data_for_training(vocab2idx, log_dict, window_size)

train_size = int(len(data) * 0.8)
validate_size = len(data) - train_size
print("Length of dataset: {}".format(len(data)))
print("Length of training dataset: {}".format(train_size))
print("Length of validation dataset: {}".format(validate_size))

train_dataset, val_dataset = random_split(data, [train_size, validate_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size= batch_size, shuffle= False)

Length of dataset: 42323
Length of training dataset: 33858
Length of validation dataset: 8465


In [3]:
model = Transformer(
        in_dim= input_size,
        embed_dim= 64, 
        out_dim= vocab_sz,
        window_size= window_size,
        depth= 6,
        heads= 8,
        dim_head= 64,
        dim_ratio= 2,
        dropout= 0.1
    )

model = nn.DataParallel(model) # multi-GPU

if torch.cuda.is_available():
    model.cuda()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

# Train the model
loss_min = 99999
model_name = 'best_model.pth'
model_path = "saved_models"

save_path = os.path.join(model_path,model_name)
best_model = model
train_loss_list = []
val_loss_list = []

In [8]:
print("Begin training ......")
for epoch in range(1, num_epochs+1):  # Loop over the dataset multiple times
    train_loss = 0
    val_loss = 0

    # Training
    for step, (seq, label) in enumerate(train_loader):
        seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
        output = model(seq)
        loss = criterion(output, label.to(device))
        optimizer.zero_grad()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    
    ave_trainloss = train_loss / len(train_loader)
    train_loss_list.append(ave_trainloss)

    # Vaildating
    with torch.no_grad():    
        for step, (seq, label) in enumerate(val_loader):
            seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
            output = model(seq)
            loss = criterion(output, label.to(device))
            val_loss += loss.item()
    
    ave_valoss = val_loss / len(val_loader)
    val_loss_list.append(ave_valoss)

    if ave_valoss < loss_min:
        loss_min = ave_valoss
        torch.save(model.state_dict(), save_path)
        best_model = model
        print("Model saved")

    print('Epoch [{}/{}], train_loss: {:.14f} val loss: {:.14f}'.format(epoch + 1, num_epochs, ave_trainloss, ave_valoss))

print(f"Finished training, model saved in: {save_path} ")


Begin training ......
Epoch [2/300], train_loss: 0.00956063642961 val loss: 0.00901817116605
Model saved
Epoch [3/300], train_loss: 0.00965904713923 val loss: 0.00783411410197
Model saved
Epoch [4/300], train_loss: 0.00695005051025 val loss: 0.00637991370079
Epoch [5/300], train_loss: 0.00672087201549 val loss: 0.00791823140266
Epoch [6/300], train_loss: 0.00602535025717 val loss: 0.00644872131266
Model saved
Epoch [7/300], train_loss: 0.00559460896843 val loss: 0.00619082721864
Epoch [8/300], train_loss: 0.00553573681014 val loss: 0.00781002995144
Epoch [9/300], train_loss: 0.00581605177799 val loss: 0.01069821439483
Model saved
Epoch [10/300], train_loss: 0.00589074199439 val loss: 0.00605714093803
Epoch [11/300], train_loss: 0.00511163687347 val loss: 0.00697922125700
Epoch [12/300], train_loss: 0.00548978195301 val loss: 0.00640616353970
Epoch [13/300], train_loss: 0.00497604420011 val loss: 0.00861442180920
Epoch [14/300], train_loss: 0.00521388845792 val loss: 0.00700374285872
Ep

In [12]:
xx = range(num_epochs)
plt.plot(xx, train_loss_list[3:], label = "Train")
plt.plot(xx, val_loss_list[3:], label = "Val")
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.savefig("loss.png")