In [1]:
import json
from dataset import InterviewDataset,HierarchicalInterviewDataset
from torch.utils.data import Dataset, DataLoader
import yaml
from transformers import AlbertTokenizer,Trainer,TrainingArguments
import torch

In [15]:
# Load the YAML config, pick a compute device, read the three JSON data splits
# and wrap them in datasets / data loaders.
config_path = "config.yaml"

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

with open(config_path, "r") as file:
    config = yaml.safe_load(file)


def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path, "r") as handle:
        return json.load(handle)


# Split paths live under the "train" section of the config.
train_cfg = config["train"]
train_data, val_data, test_data = (
    _read_json(train_cfg[key])
    for key in ("train_data_path", "val_data_path", "test_data_path")
)

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

train_dataset, val_dataset, test_dataset = (
    InterviewDataset(split, tokenizer)
    for split in (train_data, val_data, test_data)
)

# Train/val loaders share the configured batch size; only the training loader shuffles.
# The test loader yields one sample at a time.
batch_size = train_cfg["batch_size"]
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [3]:
import time
import torch
from transformers import Trainer, TrainingArguments
from model import HongzhenAlbertForRegression

# Build the ALBERT-based regressor with three outputs, move it to the selected
# device, and end the cell with the model so its module tree is displayed.
model = HongzhenAlbertForRegression("albert-base-v2", num_outputs=3)
model = model.to(device)
model

HongzhenAlbertForRegression(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [4]:
# Baseline check: average per-batch loss on the validation loader before any
# fine-tuning, printing each batch's predictions along the way.
model.eval()

total_loss = 0.0
num_batches = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # The model's result is unpacked as (loss, predictions). The predictions
        # get a real name because `_` is IPython's "last output" variable and
        # should not be overwritten in a notebook.
        loss, preds = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        print(preds)

        total_loss += loss.item()
        num_batches += 1

# Fail with a clear message instead of a bare ZeroDivisionError on an empty loader.
if num_batches == 0:
    raise ValueError("val_loader produced no batches; cannot compute an average loss")

# Mean of the per-batch losses (each batch counts equally, whatever its size).
avg_loss = total_loss / num_batches
print(f"Validation Loss Before Training: {avg_loss}")

tensor([[ 0.0967,  0.4846,  0.3923],
        [-0.2326,  0.2748,  0.4676],
        [ 0.0290,  0.4234,  0.3318],
        [ 0.0973,  0.3607,  0.4090],
        [-0.0449,  0.3300,  0.3601],
        [-0.0444,  0.3254,  0.4808],
        [ 0.0092,  0.2525,  0.4913],
        [-0.0368,  0.3562,  0.4880]], device='mps:0')
tensor([[ 0.1112,  0.3248,  0.4090],
        [-0.0814,  0.3070,  0.3374],
        [-0.0473,  0.2497,  0.3739],
        [-0.0700,  0.3076,  0.4103],
        [ 0.0026,  0.3451,  0.2923],
        [-0.0264,  0.3542,  0.4904],
        [ 0.0422,  0.4384,  0.5120],
        [-0.1210,  0.2908,  0.5706]], device='mps:0')
tensor([[-0.0347,  0.4285,  0.5229],
        [-0.0660,  0.3340,  0.4349],
        [ 0.0794,  0.3655,  0.3892],
        [-0.0702,  0.4349,  0.5191],
        [-0.0394,  0.3432,  0.5054],
        [ 0.0115,  0.4470,  0.3762],
        [ 0.0476,  0.3815,  0.3595],
        [ 0.0728,  0.4012,  0.3219]], device='mps:0')
tensor([[-0.0794,  0.3223,  0.3005],
        [ 0.0306,  0.301

In [5]:
# Fine-tune the model with the Hugging Face Trainer and report wall-clock training time.
model.train()

training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings; the epoch count comes from the YAML config.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=config["train"]["max_epochs"],
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch; log every 10 steps.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Time only the training call itself; saving the model is excluded.
start_time = time.time()
trainer.train()
end_time = time.time()
training_time = end_time - start_time

trainer.save_model("./saved_model")

print(f"Training Time: {training_time:.2f} seconds")

# Inference benchmark: run the same validation batch through the model
# repeatedly and report average latency per batch and samples per second.

# Make sure train-only behaviour (e.g. dropout) is disabled while timing inference.
model.eval()

test_batch = next(iter(torch.utils.data.DataLoader(val_dataset, batch_size=4)))

input_ids = test_batch["input_ids"].to(device)
attention_mask = test_batch["attention_mask"].to(device)
labels = test_batch["labels"].to(device)


def _wait_for_device():
    """Block until queued MPS work has finished; does nothing on other devices."""
    if device.type == "mps":
        torch.mps.synchronize()


num_inference_steps = 100

with torch.no_grad():
    # Drain any pending device work so it is not billed to the benchmark.
    _wait_for_device()
    # perf_counter is monotonic and intended for measuring elapsed time.
    start_inference = time.perf_counter()

    for _step in range(num_inference_steps):
        # Labels are passed as well, so the measured time includes the loss computation.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    # Wait for the last forward pass to actually complete before stopping the clock.
    _wait_for_device()
    end_inference = time.perf_counter()

total_inference_time = end_inference - start_inference
latency = total_inference_time / num_inference_steps
print(f"Average Latency: {latency:.4f} seconds per batch")

# len(input_ids) is the number of samples in the benchmark batch.
throughput = (len(input_ids) * num_inference_steps) / total_inference_time
print(f"Throughput: {throughput:.2f} samples per second")




Epoch,Training Loss,Validation Loss
1,6.4923,1.576648
2,0.5946,0.565109
3,0.6096,0.974034
4,0.4555,0.530158
5,0.7079,0.553858
6,0.5271,0.602852
7,0.3275,0.615821
8,0.4431,0.645244
9,0.3327,0.675914
10,0.1796,0.551686


Training Time: 639.98 seconds
Average Latency: 0.2546 seconds per batch
Throughput: 15.71 samples per second


In [6]:
# Re-evaluate on the validation loader after fine-tuning: print each batch's
# predictions and report the average per-batch loss.
model.eval()

total_loss = 0.0
num_batches = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # The model's result is unpacked as (loss, predictions). The predictions
        # get a real name because `_` is IPython's "last output" variable and
        # should not be overwritten in a notebook.
        loss, preds = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        print(preds)

        total_loss += loss.item()
        num_batches += 1

# Fail with a clear message instead of a bare ZeroDivisionError on an empty loader.
if num_batches == 0:
    raise ValueError("val_loader produced no batches; cannot compute an average loss")

# Mean of the per-batch losses (each batch counts equally, whatever its size).
avg_loss = total_loss / num_batches
# This cell runs after training, so the label says "After" (it previously said "Before").
print(f"Validation Loss After Training: {avg_loss}")

tensor([[5.4447, 5.4398, 5.1086],
        [5.4919, 5.2047, 5.1524],
        [4.9649, 4.6426, 4.5099],
        [5.1574, 5.1464, 4.5311],
        [4.7565, 4.5725, 4.3863],
        [4.7968, 4.4531, 4.2102],
        [5.0020, 4.8677, 4.4535],
        [5.6236, 5.5630, 4.9796]], device='mps:0')
tensor([[5.2971, 5.3734, 5.0644],
        [5.2689, 5.0842, 5.1370],
        [4.8759, 4.7342, 4.4613],
        [4.7673, 4.4050, 4.5188],
        [5.1244, 5.0401, 4.7546],
        [5.0865, 4.8014, 4.6703],
        [5.1822, 5.1073, 4.6776],
        [5.1031, 4.7859, 4.7913]], device='mps:0')
tensor([[4.7609, 4.5866, 4.7031],
        [4.9065, 4.6431, 4.5570],
        [5.5202, 5.2732, 5.3021],
        [5.0789, 4.8670, 4.8670],
        [5.2089, 5.0239, 4.6399],
        [5.0796, 4.9605, 4.6894],
        [5.4122, 5.2158, 5.1943],
        [5.1661, 5.0434, 4.6983]], device='mps:0')
tensor([[4.6604, 4.5488, 4.1597],
        [5.0230, 4.8539, 4.8330],
        [4.7692, 4.4952, 3.9408],
        [4.5431, 4.2835, 4.4594

In [19]:
# Evaluate on the held-out test loader: print each batch's loss and predictions,
# then report the average per-batch loss.
model.eval()

total_loss = 0.0
num_batches = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # The model's result is unpacked as (loss, predictions). The predictions
        # get a real name because `_` is IPython's "last output" variable and
        # should not be overwritten in a notebook.
        loss, preds = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Convert the loss to a Python float once and reuse it for printing and summing.
        loss_value = loss.item()
        print(loss_value, preds)

        total_loss += loss_value
        num_batches += 1

# Fail with a clear message instead of a bare ZeroDivisionError on an empty loader.
if num_batches == 0:
    raise ValueError("test_loader produced no batches; cannot compute an average loss")

# Mean of the per-batch losses (each batch counts equally, whatever its size).
avg_loss = total_loss / num_batches
# This loop reads test_loader after training; the label previously said
# "Validation Loss Before Training".
print(f"Test Loss After Training: {avg_loss}")

2.021505832672119 tensor([[5.4674, 5.3285, 5.6962]], device='mps:0')
0.590577244758606 tensor([[5.3224, 5.1571, 5.6184]], device='mps:0')
0.02622946910560131 tensor([[4.8893, 4.6117, 4.9413]], device='mps:0')
Validation Loss Before Training: 0.8794375155121088
