In [1]:
import json
import random
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
import time

# Set device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Install Dependencies
# Run in a Jupyter cell if not already installed:
!pip install transformers datasets evaluate scikit-learn torch numpy

Using device: mps

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
with open('converted_dataset_clean.json', 'r') as file:
    data = json.load(file)
print(f"Total entries parsed: {len(data)}")

# Separate generation (CV section -> question) and classification (question-answer -> label) data
generation_data = [entry for entry in data if "output" in entry and "label" not in entry]
classification_data = [entry for entry in data if "label" in entry]

# Add technical examples to classification data
new_examples = [
    {"input": "Question: What was a key collaboration challenge you faced, and how did you address it?\nAnswer: I used Docker to streamline team workflows.", "label": "correct"},
    {"input": "Question: How do you integrate backend APIs with front-end?\nAnswer: I used Spring Boot with React for integration.", "label": "correct"},
    {"input": "Question: How did you improve debugging efficiency?\nAnswer: I used automated testing with Selenium.", "label": "correct"},
    {"input": "Question: What was a key collaboration challenge you faced, and how did you address it?\nAnswer: I ignored team issues.", "label": "incorrect"},
    {"input": "Question: How do you integrate backend APIs with front-end?\nAnswer: I avoided API integration.", "label": "incorrect"},
    {"input": "Question: How did you improve debugging efficiency?\nAnswer: I didn’t use debugging tools.", "label": "incorrect"},
    {"input": "Question: How did you optimize database performance at your job?\nAnswer: I optimized PostgreSQL queries, reducing response times by 15%.", "label": "correct"},
    {"input": "Question: How did you optimize database performance at your job?\nAnswer: I ignored database optimization.", "label": "incorrect"},
    {"input": "Question: What tools did you use for version control?\nAnswer: I implemented Git to manage code versions effectively.", "label": "correct"},
    {"input": "Question: What tools did you use for version control?\nAnswer: I used manual file tracking instead of version control.", "label": "incorrect"},
    {"input": "Question: How did you improve application scalability?\nAnswer: I deployed applications using AWS Elastic Beanstalk.", "label": "correct"},
    {"input": "Question: How did you improve application scalability?\nAnswer: I avoided scalability enhancements.", "label": "incorrect"},
    {"input": "Question: How did you manage financial reporting challenges?\nAnswer: I automated financial reports using Excel macros.", "label": "correct"},
    {"input": "Question: How did you manage financial reporting challenges?\nAnswer: I relied on manual reporting without automation.", "label": "incorrect"},
    {"input": "Question: What strategies did you use to reduce budget variances?\nAnswer: I implemented variance analysis with SAP software.", "label": "correct"},
    {"input": "Question: What strategies did you use to reduce budget variances?\nAnswer: I ignored budget variance issues.", "label": "incorrect"},
    {"input": "Question: How did you ensure specimen integrity during collection?\nAnswer: I followed OSHA guidelines for aseptic techniques.", "label": "correct"},
    {"input": "Question: How did you ensure specimen integrity during collection?\nAnswer: I skipped proper collection protocols.", "label": "incorrect"},
    {"input": "Question: What methods did you use to train new phlebotomists?\nAnswer: I conducted hands-on training with HIPAA compliance.", "label": "correct"},
    {"input": "Question: What methods did you use to train new phlebotomists?\nAnswer: I provided no formal training.", "label": "incorrect"},
    {"input": "Question: How did you enhance user interface performance?\nAnswer: I optimized CSS and JavaScript for faster rendering.", "label": "correct"},
    {"input": "Question: How did you enhance user interface performance?\nAnswer: I used unoptimized code without improvements.", "label": "incorrect"},
    {"input": "Question: What approach did you take for cloud migration?\nAnswer: I used Azure to migrate workloads with zero downtime.", "label": "correct"},
    {"input": "Question: What approach did you take for cloud migration?\nAnswer: I avoided cloud migration entirely.", "label": "incorrect"}
]
classification_data.extend(new_examples)

# Save updated dataset
with open('converted_dataset_clean_updated.json', 'w') as file:
    json.dump(classification_data + generation_data, file, indent=2)

Total entries parsed: 2541


In [23]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

generation_inputs = ["generate question: " + entry["input"] for entry in generation_data]
generation_targets = [entry["output"] for entry in generation_data]
generation_dataset = Dataset.from_dict({"input_text": generation_inputs, "target_text": generation_targets})

tokenizer_gen = T5Tokenizer.from_pretrained('t5-small')
def tokenize_generation(examples):
    model_inputs = tokenizer_gen(examples["input_text"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer_gen(examples["target_text"], max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs
tokenized_generation = generation_dataset.map(tokenize_generation, batched=True)

train_size = int(0.8 * len(tokenized_generation))
val_size = len(tokenized_generation) - train_size
tokenized_train_gen = tokenized_generation.shuffle(seed=42).select(range(train_size))
tokenized_val_gen = tokenized_generation.shuffle(seed=42).select(range(train_size, train_size + val_size))

model_gen = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
training_args_gen = TrainingArguments(
    output_dir='./results_gen',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_gen',
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    dataloader_pin_memory=False
)
trainer_gen = Trainer(
    model=model_gen,
    args=training_args_gen,
    train_dataset=tokenized_train_gen,
    eval_dataset=tokenized_val_gen
)
trainer_gen.train()
model_gen.save_pretrained('./question_generation_model')
tokenizer_gen.save_pretrained('./question_generation_model')

Using device: mps


loading file spiece.model from cache at /Users/dasunsathsara/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/dasunsathsara/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/tokenizer_config.json
loading file tokenizer.json from cache at /Users/dasunsathsara/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/tokenizer.json
loading file chat_template.jinja from cache at None
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/860 [00:00<?, ? examples/s]

loading configuration file config.json from cache at /Users/dasunsathsara/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_

Step,Training Loss


KeyboardInterrupt: 

In [4]:
import os
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"


In [12]:
!pip install sentence-transformers

Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/6f/ff/178f08ea5ebc1f9193d9de7f601efe78c01748347875c8438f66f5cecc19/sentence_transformers-5.0.0-py3-none-any.whl.metadata
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-5.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
!pip install transformers datasets torch nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import nltk
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util
import math

nltk.download('punkt')

# Device setup
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load and deduplicate dataset
with open('enhanced_dataset_clean.json', 'r') as file:
    data = json.load(file)

seen = set()
unique_data = []
for entry in data:
    key = (entry.get("input", ""), entry.get("output", ""))
    if key not in seen and "output" in entry and "label" not in entry and entry["output"].strip():
        seen.add(key)
        unique_data.append(entry)

print(f"Total unique entries: {len(unique_data)}")

# Enhanced prompt format
def create_prompt(entry):
    input_text = entry["input"].lower()
    if "education" in input_text:
        difficulty = "easy"
        q_type = "education"
    elif any(skill in input_text for skill in ["skills", "python", "docker", "performance tuning"]):
        difficulty = "medium"
        q_type = "skills"
    else:
        difficulty = "hard"
        q_type = "project"
    return f"generate question | difficulty={difficulty} | type={q_type} | context={entry['input']} | example=What improved X at Y?"

generation_inputs = [create_prompt(entry) for entry in unique_data]
generation_targets = [entry["output"] for entry in unique_data]
generation_dataset = Dataset.from_dict({"input_text": generation_inputs, "target_text": generation_targets})

# Tokenization
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

def tokenize(example):
    inputs = tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    targets = tokenizer(
        example["target_text"],
        padding="max_length",
        truncation=True,
        max_length=40  # Increased for more context
    )
    inputs["labels"] = [
        (-100 if token == tokenizer.pad_token_id else token) for token in targets["input_ids"]
    ]
    return inputs

tokenized_dataset = generation_dataset.map(tokenize, batched=True, remove_columns=["input_text", "target_text"])

# Train/val split
train_size = int(0.8 * len(tokenized_dataset))
val_size = len(tokenized_dataset) - train_size
train_dataset = tokenized_dataset.shuffle(seed=42).select(range(train_size))
val_dataset = tokenized_dataset.shuffle(seed=42).select(range(train_size, train_size + val_size))

# Model with dropout
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
model.config.dropout = 0.1  # Add regularization

training_args = TrainingArguments(
    output_dir="./results_gen",
    num_train_epochs=20,  # Increased for deeper learning
    per_device_train_batch_size=4,  # Increased if hardware allows
    per_device_eval_batch_size=2,
    learning_rate=2e-5,  # Lowered for stability
    logging_dir="./logs_gen",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,  # Adjusted to be a multiple of eval_steps
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()
model.save_pretrained("./question_generation_model")
tokenizer.save_pretrained("./question_generation_model")

# Enhanced evaluation with post-processing
def evaluate_model(model, tokenizer, val_dataset, samples=200):
    model.eval()
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
    val_subset = val_dataset.shuffle(seed=42).select(range(min(samples, len(val_dataset))))

    total_bleu = 0
    total_sem = 0
    valid_samples = 0

    for sample in val_subset:
        input_ids = sample["input_ids"]
        input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
        target_text = tokenizer.decode(sample["labels"], skip_special_tokens=True).replace("-100", "").strip()
        target_starts_how = target_text.lower().startswith("how")

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=40, num_beams=6, temperature=0.7)  # Enhanced generation
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Post-process to match target structure
        if target_starts_how and not generated_text.lower().startswith("how"):
            generated_text = "How " + generated_text.split(" ", 1)[1] if " " in generated_text else "How " + generated_text

        reference = [target_text.split()]
        candidate = generated_text.split()
        bleu = sentence_bleu(reference, candidate) if candidate and reference[0] else 0

        emb1 = semantic_model.encode(target_text, convert_to_tensor=True)
        emb2 = semantic_model.encode(generated_text, convert_to_tensor=True)
        semantic = util.pytorch_cos_sim(emb1, emb2).item()

        if not math.isnan(bleu) and not math.isnan(semantic) and bleu > 0 and semantic > 0:
            total_bleu += bleu
            total_sem += semantic
            valid_samples += 1

            print(f"Input: {input_text}\nTarget: {target_text}\nGenerated: {generated_text}\nBLEU: {bleu:.4f}, Semantic: {semantic:.4f}\n")

    if valid_samples > 0:
        avg_bleu = total_bleu / valid_samples
        avg_sem = total_sem / valid_samples
        print(f"\nAvg BLEU: {avg_bleu:.4f}, Avg Semantic Similarity: {avg_sem:.4f}")
    else:
        print("No valid samples to evaluate.")
        avg_bleu, avg_sem = 0, 0

    return avg_bleu, avg_sem

# Run Evaluation
evaluate_model(model, tokenizer, val_dataset)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


Using device: mps
Total unique entries: 428


Map:   0%|          | 0/428 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,0.0244,0.035427
1000,0.0146,0.037838
1500,0.014,0.037892


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: AI Developer, SmartMind Pro, 2020-2022, Enhanced algorithm efficiency by 12%. | example=What improved X at Y?
Target: What enhanced efficiency at SmartMind Pro?
Generated: What enhanced algorithm efficiency at SmartMind Pro?
BLEU: 0.4889, Semantic: 0.9254



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mosaic Maker, TileArt Pro, 2020-2022, Increased sales by 18%. | example=What improved X at Y?
Target: What increased sales at TileArt Pro?
Generated: What increased sales at TileArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Fashion Designer, StyleTrendz, 2020-2022, Launched 5 collections. | example=What improved X at Y?
Target: How did you launch collections at StyleTrendz?
Generated: How did you launch collections at StyleTrendz?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Event Coordinator, CelebrateNow Inc., 2020-2022, Increased attendance by 30% with promotions. | example=What improved X at Y?
Target: What promotions increased attendance at CelebrateNow Inc.?
Generated: What promotions increased attendance at CelebrateNow Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Stonemason, RockSolid Pro, 2020-2022, Reduced time by 12%. | example=What improved X at Y?
Target: How did you reduce time at RockSolid Pro?
Generated: How did you reduce time at RockSolid Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Clinical Researcher, HealthQuest Pro, 2023-2025, Speeded up trials by 20%. | example=What improved X at Y?
Target: How did you speed up trials at HealthQuest Pro?
Generated: How speeded trials at HealthQuest Pro?
BLEU: 0.3259, Semantic: 0.9181



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Sales Associate, RetailPeak Pro, 2022-2024, Raised sales by 10% with promotions. | example=What improved X at Y?
Target: What promotions raised sales at RetailPeak Pro?
Generated: What promotions raised sales at RetailPeak Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Doll Maker, TinyArt Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at TinyArt Pro?
Generated: What grew sales at TinyArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Astronomer, StarGaze Pro, 2022-2024, Discovered 5 new objects. | example=What improved X at Y?
Target: What aided discoveries at StarGaze Pro?
Generated: How did you discover objects at StarGaze Pro?
BLEU: 0.0000, Semantic: 0.7271



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Wood Turner, SpinArt Pro, 2022-2024, Increased sales by 10%. | example=What improved X at Y?
Target: What increased sales at SpinArt Pro?
Generated: What increased sales at SpinArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Fabric Dyer, DyeArt Pro, 2022-2024, Boosted sales by 10%. | example=What improved X at Y?
Target: What boosted sales at DyeArt Pro?
Generated: What boosted sales at DyeArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Leather Carver, HideCraft Pro, 2022-2024, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at HideCraft Pro?
Generated: What grew sales at HideCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Tattoo Artist, InkMaster Pro, 2020-2022, Grew clients by 20%. | example=What improved X at Y?
Target: What grew clients at InkMaster Pro?
Generated: What grew clients at InkMaster Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glass Carver, ClearCut Pro, 2022-2024, Increased sales by 12%. | example=What improved X at Y?
Target: What increased sales at ClearCut Pro?
Generated: What increased sales at ClearCut Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Customer Support Specialist, HelpVista Inc., 2021-2023, Improved resolution rate to 90% within 24 hours. | example=What improved X at Y?
Target: What improved the resolution rate at HelpVista Inc.?
Generated: What improved resolution rate at HelpVista Inc.?
BLEU: 0.6130, Semantic: 0.9918



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Animator, MotionMagic Inc., 2019-2021, Created 20 animations. | example=What improved X at Y?
Target: How did you create animations at MotionMagic Inc.?
Generated: How did you create animations at MotionMagic Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Toy Designer, PlayTime Pro, 2020-2022, Increased sales by 20%. | example=What improved X at Y?
Target: What increased sales at PlayTime Pro?
Generated: What increased sales at PlayTime Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Customer Support Lead, HelpCore Pro, 2020-2022, Reduced wait times by 18%. | example=What improved X at Y?
Target: What reduced wait times at HelpCore Pro?
Generated: How did you reduce wait times at HelpCore Pro?
BLEU: 0.4463, Semantic: 0.9458



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Customer Service Manager, HelpLink Pro, 2020-2022, Raised satisfaction scores by 15%. | example=What improved X at Y?
Target: How did you raise satisfaction at HelpLink Pro?
Generated: How did you raise satisfaction scores at HelpLink Pro?
BLEU: 0.5969, Semantic: 0.9275



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Historian, TimeTrace Pro, 2022-2024, Enhanced attendance by 25%. | example=What improved X at Y?
Target: What enhanced attendance at TimeTrace Pro?
Generated: What enhanced attendance at TimeTrace Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mosaic Maker, TileArt Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at TileArt Pro?
Generated: What grew sales at TileArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Potter, ClayArt Pro, 2022-2024, Grew sales by 20%. | example=What improved X at Y?
Target: What grew sales at ClayArt Pro?
Generated: What grew sales at ClayArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Event Planner, CelebratePro, 2021-2023, Increased attendance by 40%. | example=What improved X at Y?
Target: What increased attendance at CelebratePro?
Generated: What increased attendance at CelebratePro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Animator, FrameMaster Pro, 2022-2024, Increased wins by 20%. | example=What improved X at Y?
Target: What increased wins at FrameMaster Pro?
Generated: What increased wins at FrameMaster Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Electrical Engineer, PowerGrid Solutions, 2021-2023, Improved grid stability by 12%. | example=What improved X at Y?
Target: What improved grid stability at PowerGrid Solutions?
Generated: How did you improve stability at PowerGrid Solutions?
BLEU: 0.3457, Semantic: 0.9133



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Acoustics Engineer, SoundWave Pro, 2022-2024, Reduced noise by 10%. | example=What improved X at Y?
Target: What reduced noise at SoundWave Pro?
Generated: How did you reduce noise at SoundWave Pro?
BLEU: 0.3457, Semantic: 0.9287



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glass Painter, ColorCraft Pro, 2022-2024, Increased sales by 12%. | example=What improved X at Y?
Target: What increased sales at ColorCraft Pro?
Generated: What increased sales at ColorCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Speech Therapist, VoiceCare Pro, 2023-2025, Improved clarity by 12%. | example=What improved X at Y?
Target: How did you improve clarity at VoiceCare Pro?
Generated: How improved clarity at VoiceCare Pro?
BLEU: 0.3850, Semantic: 0.9553



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Embedded Developer, CircuitEdge Tech, 2021-2023, Reduced power usage by 15% with firmware. | example=What improved X at Y?
Target: What firmware reduced power usage at CircuitEdge Tech?
Generated: How did firmware reduce power usage at CircuitEdge Tech?
BLEU: 0.4671, Semantic: 0.9464



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Puppet Maker, ToyCraft Pro, 2021-2023, Increased production by 15%. | example=What improved X at Y?
Target: What increased production at ToyCraft Pro?
Generated: What increased production at ToyCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Landscape Architect, GreenScape Pro, 2021-2023, Reduced water use by 15%. | example=What improved X at Y?
Target: How did you reduce water use at GreenScape Pro?
Generated: How did you reduce water use at GreenScape Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Legal Advisor, LawFirm Pro, 2020-2022, Won 90% of cases. | example=What improved X at Y?
Target: What won cases at LawFirm Pro?
Generated: How did you win cases at LawFirm Pro?
BLEU: 0.3457, Semantic: 0.9070



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Doll Maker, TinyTreasures Pro, 2021-2023, Boosted sales by 18%. | example=What improved X at Y?
Target: What boosted sales at TinyTreasures Pro?
Generated: What boosted sales at TinyTreasures Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Software Architect, CodeCraft Pro, 2022-2024, Reduced technical debt by 20%. | example=What improved X at Y?
Target: What reduced technical debt at CodeCraft Pro?
Generated: How did you reduce technical debt at CodeCraft Pro?
BLEU: 0.4463, Semantic: 0.9572



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Basket Weaver, WeaveMaster Pro, 2020-2022, Increased sales by 15%. | example=What improved X at Y?
Target: What increased sales at WeaveMaster Pro?
Generated: What increased sales at WeaveMaster Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Safety Coordinator, SafeWork Pro, 2022-2024, Decreased incidents by 18% with training. | example=What improved X at Y?
Target: How did training decrease incidents at SafeWork Pro?
Generated: How did training reduce incidents at SafeWork Pro?
BLEU: 0.5000, Semantic: 0.9844



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Bead Weaver, GemBead Pro, 2020-2022, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at GemBead Pro?
Generated: What grew sales at GemBead Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mathematician, NumSolve Labs, 2019-2021, Solved 10 complex problems. | example=What improved X at Y?
Target: How did you solve problems at NumSolve Labs?
Generated: How did you solve complex problems at NumSolve Labs?
BLEU: 0.5969, Semantic: 0.9164



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Warehouse Supervisor, StockFlow Inc., 2019-2021, Increased efficiency by 30%. | example=What improved X at Y?
Target: What increased efficiency at StockFlow Inc.?
Generated: What increased efficiency at StockFlow Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Software Engineer, CodeForge Inc., 2022-2024, Reduced bug rates by 25% with code reviews. | example=What improved X at Y?
Target: What code reviews reduced bug rates at CodeForge Inc.?
Generated: How did code reviews reduce bug rates at CodeForge Inc.?
BLEU: 0.4518, Semantic: 0.9417



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glass Carver, ClearCut Pro, 2020-2022, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at ClearCut Pro?
Generated: What grew sales at ClearCut Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Rug Maker, WeftCraft Pro, 2021-2023, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at WeftCraft Pro?
Generated: What grew sales at WeftCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mobile Developer, AppRise Tech, 2020-2022, Increased app downloads by 20% with features. | example=What improved X at Y?
Target: What features increased downloads at AppRise Tech?
Generated: What features increased downloads at AppRise Tech?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Systems Analyst, TechFlow Innovations, 2023-2025, Enhanced system uptime to 99.5%. | example=What improved X at Y?
Target: How did you enhance uptime at TechFlow Innovations?
Generated: How enhanced system uptime at TechFlow Innovations?
BLEU: 0.3768, Semantic: 0.8480



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Operations Analyst, StreamlinePro Inc., 2022-2024, Reduced process delays by 20% with optimization. | example=What improved X at Y?
Target: How did optimization reduce delays at StreamlinePro Inc.?
Generated: How did optimization reduce delays at StreamlinePro Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Metal Engraver, SteelMark Pro, 2021-2023, Grew orders by 10%. | example=What improved X at Y?
Target: What grew orders at SteelMark Pro?
Generated: What grew orders at SteelMark Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Quilt Maker, PatchWork Pro, 2022-2024, Increased sales by 10%. | example=What improved X at Y?
Target: What increased sales at PatchWork Pro?
Generated: What increased sales at PatchWork Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Wildlife Ranger, NatureGuard Pro, 2023-2025, Reduced poaching by 12%. | example=What improved X at Y?
Target: How did you reduce poaching at NatureGuard Pro?
Generated: How did you reduce poaching at NatureGuard Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Lace Maker, ThreadLace Pro, 2023-2025, Increased sales by 18%. | example=What improved X at Y?
Target: What increased sales at ThreadLace Pro?
Generated: What increased sales at ThreadLace Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Compliance Manager, ReguSafe Inc., 2020-2022, Achieved 95% audit compliance with policies. | example=What improved X at Y?
Target: How did policies ensure compliance at ReguSafe Inc.?
Generated: How did you achieve compliance with policies at ReguSafe Inc.?
BLEU: 0.0000, Semantic: 0.9433



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Leatherworker, HideCraft Pro, 2022-2024, Increased sales by 18%. | example=What improved X at Y?
Target: What increased sales at HideCraft Pro?
Generated: What increased sales at HideCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mosaic Maker, TileArt Pro, 2022-2024, Boosted sales by 20%. | example=What improved X at Y?
Target: What boosted sales at TileArt Pro?
Generated: What boosted sales at TileArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Tapestry Weaver, ThreadTapestry Pro, 2021-2023, Boosted sales by 10%. | example=What improved X at Y?
Target: What boosted sales at ThreadTapestry Pro?
Generated: What boosted sales at ThreadTapestry Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Training Coordinator, SkillUp Corp., 2019-2021, Trained 200+ employees. | example=What improved X at Y?
Target: How did you train employees at SkillUp Corp.?
Generated: How did you train employees at SkillUp Corp.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Candle Designer, WaxDesign Pro, 2020-2022, Increased sales by 12%. | example=What improved X at Y?
Target: What increased sales at WaxDesign Pro?
Generated: What increased sales at WaxDesign Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Bead Weaver, GemBead Pro, 2023-2025, Boosted sales by 20%. | example=What improved X at Y?
Target: What boosted sales at GemBead Pro?
Generated: What boosted sales at GemBead Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Soap Maker, PureGlow Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at PureGlow Pro?
Generated: What grew sales at PureGlow Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Wood Sculptor, TimberSculpt Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at TimberSculpt Pro?
Generated: What grew sales at TimberSculpt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Puppeteer, PuppetPlay Pro, 2020-2022, Boosted attendance by 18%. | example=What improved X at Y?
Target: What boosted attendance at PuppetPlay Pro?
Generated: What boosted attendance at PuppetPlay Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Urban Planner, CityBuild Pro, 2020-2022, Improved traffic by 15%. | example=What improved X at Y?
Target: What improved traffic at CityBuild Pro?
Generated: How did you improve traffic at CityBuild Pro?
BLEU: 0.3457, Semantic: 0.9566



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Game Developer, PlayForge Studio, 2020-2022, Released 2 hit games. | example=What improved X at Y?
Target: How did you release games at PlayForge Studio?
Generated: How did you release hit games at PlayForge Studio?
BLEU: 0.5969, Semantic: 0.9478



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Quilt Maker, PatchWork Pro, 2023-2025, Boosted sales by 15%. | example=What improved X at Y?
Target: What boosted sales at PatchWork Pro?
Generated: What boosted sales at PatchWork Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Research Scientist, BioPeak Labs, 2021-2023, Accelerated experiments by 20% with tools. | example=What improved X at Y?
Target: What tools accelerated experiments at BioPeak Labs?
Generated: How did tools accelerate experiments at BioPeak Labs?
BLEU: 0.3656, Semantic: 0.9423



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Basket Weaver, WeaveArt Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at WeaveArt Pro?
Generated: What grew sales at WeaveArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Candle Maker, GlowLight Pro, 2020-2022, Grew sales by 20%. | example=What improved X at Y?
Target: What grew sales at GlowLight Pro?
Generated: What grew sales at GlowLight Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Paper Artist, FoldArt Pro, 2023-2025, Increased sales by 10%. | example=What improved X at Y?
Target: What increased sales at FoldArt Pro?
Generated: What increased sales at FoldArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Real Estate Agent, HomeQuest Pro, 2022-2024, Increased sales by 25%. | example=What improved X at Y?
Target: What increased sales at HomeQuest Pro?
Generated: What increased sales at HomeQuest Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Business Development Manager, GrowthPulse Pro, 2022-2024, Secured 10 new contracts. | example=What improved X at Y?
Target: What secured contracts at GrowthPulse Pro?
Generated: How did you secure contracts at GrowthPulse Pro?
BLEU: 0.3457, Semantic: 0.9421



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Toy Maker, PlayCraft Pro, 2020-2022, Increased sales by 18%. | example=What improved X at Y?
Target: What increased sales at PlayCraft Pro?
Generated: What increased sales at PlayCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Optometrist, ClearVision Center, 2020-2022, Reduced appointment delays by 15%. | example=What improved X at Y?
Target: What reduced appointment delays at ClearVision Center?
Generated: How did you reduce appointment delays at ClearVision Center?
BLEU: 0.4463, Semantic: 0.9595



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mobile App Developer, AppTrendz, 2020-2022, Grew user base by 20%. | example=What improved X at Y?
Target: What grew the user base at AppTrendz?
Generated: What grew user base at AppTrendz?
BLEU: 0.5115, Semantic: 0.9938



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glassblower, ClearCraft Pro, 2023-2025, Boosted sales by 18%. | example=What improved X at Y?
Target: What boosted sales at ClearCraft Pro?
Generated: What boosted sales at ClearCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Archivist, HistoryPreserve Pro, 2021-2023, Improved access by 20%. | example=What improved X at Y?
Target: What improved access at HistoryPreserve Pro?
Generated: How did you improve access at HistoryPreserve Pro?
BLEU: 0.3457, Semantic: 0.9459



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Metal Engraver, SteelArt Pro, 2020-2022, Boosted sales by 10%. | example=What improved X at Y?
Target: What boosted sales at SteelArt Pro?
Generated: What boosted sales at SteelArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Photographer, LensPro Pro, 2022-2024, Grew clients by 20%. | example=What improved X at Y?
Target: What grew clients at LensPro Pro?
Generated: What grew clients at LensPro Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Content Creator, MediaMix Pro, 2022-2024, Grew audience by 30% with videos. | example=What improved X at Y?
Target: What videos grew the audience at MediaMix Pro?
Generated: What videos grew audience at MediaMix Pro?
BLEU: 0.5154, Semantic: 0.9937



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Systems Architect, TechBuild Corp., 2020-2022, Enhanced system scalability by 15% with redesigns. | example=What improved X at Y?
Target: How did redesigns enhance scalability at TechBuild Corp.?
Generated: How redesigns enhanced system scalability at TechBuild Corp.?
BLEU: 0.3826, Semantic: 0.9075



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Event Photographer, SnapMoment Pro, 2022-2024, Increased bookings by 25%. | example=What improved X at Y?
Target: What increased bookings at SnapMoment Pro?
Generated: What increased bookings at SnapMoment Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Digital Strategist, MediaPulse Agency, 2023-2025, Boosted ad conversions by 30% with analytics. | example=What improved X at Y?
Target: What analytics boosted conversions at MediaPulse Agency?
Generated: What analytics boosted conversions at MediaPulse Agency?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Logistics Manager, FreightMaster Inc., 2021-2023, Reduced shipping costs by $300K annually. | example=What improved X at Y?
Target: How did you reduce shipping costs at FreightMaster Inc.?
Generated: How did you reduce shipping costs at FreightMaster Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Transportation Planner, MoveEasy Pro, 2021-2023, Reduced fuel costs by 12%. | example=What improved X at Y?
Target: How did you reduce fuel costs at MoveEasy Pro?
Generated: How did you reduce costs at MoveEasy Pro?
BLEU: 0.6102, Semantic: 0.7977



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Veterinary Assistant, PetCare Pro, 2021-2023, Improved recovery by 10%. | example=What improved X at Y?
Target: What improved recovery at PetCare Pro?
Generated: What improved recovery at PetCare Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glass Blower, ClearWind Pro, 2021-2023, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at ClearWind Pro?
Generated: What grew sales at ClearWind Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Network Administrator, ConnectPro Systems, 2021-2023, Reduced downtime by 18% with network upgrades. | example=What improved X at Y?
Target: How did upgrades reduce downtime at ConnectPro Systems?
Generated: How network upgrades reduced downtime at ConnectPro Systems?
BLEU: 0.3826, Semantic: 0.9328



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Marine Biologist, OceanWatch Pro, 2022-2024, Protected 10% more species. | example=What improved X at Y?
Target: How did you protect species at OceanWatch Pro?
Generated: How did you protect species at OceanWatch Pro?
BLEU: 1.0000, Semantic: 1.0000

Input: generate question | difficulty=hard | type=project | context=CV section: Childcare Worker, LittleSteps Pro, 2022-2024, Improved development by 12%. | example=What improved X at Y?
Target: How did you improve development at LittleSteps Pro?
Generated: How improved development at LittleSteps Pro?
BLEU: 0.3850, Semantic: 0.9547


Avg BLEU: 0.8019, Avg Semantic Similarity: 0.9762


(0.8019254150813642, 0.9761822445448055)

In [3]:
import json
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Set device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device} at 01:54 PM +0530 on Monday, July 28, 2025")

# Load the dataset
with open('converted_dataset_clean.json', 'r') as file:
    data = json.load(file)
print(f"Total entries parsed: {len(data)}")

classification_data = [entry for entry in data if "label" in entry]

# Augment classification data with balanced examples
new_examples = [
    {"input": "Question: What was a key collaboration challenge?\nAnswer: I used Docker.", "label": "correct"},
    {"input": "Question: How do you integrate APIs?\nAnswer: I used Spring Boot.", "label": "correct"},
    {"input": "Question: How did you debug?\nAnswer: I used Selenium.", "label": "correct"},
    {"input": "Question: What was a challenge?\nAnswer: I ignored it.", "label": "incorrect"},
    {"input": "Question: How do you optimize?\nAnswer: I avoided it.", "label": "incorrect"},
    {"input": "Question: What tools did you use?\nAnswer: I used Git.", "label": "correct"},
]
classification_data.extend(new_examples)
with open('converted_dataset_clean_updated.json', 'w') as file:
    json.dump(classification_data, file, indent=2)

# Prepare classification data
classification_inputs = [entry["input"] for entry in classification_data]
classification_labels = [1 if entry["label"] == "correct" else 0 for entry in classification_data]
classification_dataset = Dataset.from_dict({"text": classification_inputs, "label": classification_labels})

# Tokenize the dataset
tokenizer_cls = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_classification(examples):
    return tokenizer_cls(examples["text"], padding="max_length", truncation=True, max_length=512)
tokenized_classification = classification_dataset.map(tokenize_classification, batched=True)

# Convert to list for sklearn compatibility
tokenized_data_list = tokenized_classification.to_list()

# Split the dataset
train_val_data, test_data = train_test_split(tokenized_data_list, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

# Convert back to Dataset objects
tokenized_train_cls = Dataset.from_list(train_data)
tokenized_val_cls = Dataset.from_list(val_data)
tokenized_test_cls = Dataset.from_list(test_data)

# Initialize and configure the model
model_cls = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=0.2).to(device)
train_labels = [example["label"] for example in tokenized_train_cls]
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=train_labels)
print(f"Class weights: {class_weights}")  # Debug class weights
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Define custom trainer with weighted loss
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels").to(device)
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Training arguments
training_args_cls = TrainingArguments(
    output_dir='./results_cls',
    num_train_epochs=15,  # Increased from 10 for better learning
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,  # Adjusted for better convergence
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    dataloader_pin_memory=False
)

# Initialize trainer
trainer_cls = WeightedTrainer(
    model=model_cls,
    args=training_args_cls,
    train_dataset=tokenized_train_cls,
    eval_dataset=tokenized_val_cls,
    compute_metrics=lambda pred: {"accuracy": accuracy_score(pred.label_ids, np.argmax(pred.predictions, axis=1))}
)
trainer_cls.train()

# Save the model and tokenizer
model_cls.save_pretrained('./answer_evaluation_model')
tokenizer_cls.save_pretrained('./answer_evaluation_model')

# Evaluate BERT accuracy on test set
trainer_cls = WeightedTrainer(
    model=model_cls,
    args=training_args_cls,
    eval_dataset=tokenized_test_cls,
    compute_metrics=lambda pred: {
        "accuracy": accuracy_score(pred.label_ids, np.argmax(pred.predictions, axis=1)),
        "precision": precision_score(pred.label_ids, np.argmax(pred.predictions, axis=1)),
        "recall": recall_score(pred.label_ids, np.argmax(pred.predictions, axis=1)),
        "f1": f1_score(pred.label_ids, np.argmax(pred.predictions, axis=1))
    }
)
test_results = trainer_cls.evaluate()
print("BERT Test Set Results:", test_results)

predictions = trainer_cls.predict(tokenized_test_cls)
labels = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=1)
print(f"Accuracy: {accuracy_score(labels, preds):.4f}")
print(f"Precision: {precision_score(labels, preds):.4f}")
print(f"Recall: {recall_score(labels, preds):.4f}")
print(f"F1 Score: {f1_score(labels, preds):.4f}")

Using device: mps at 01:54 PM +0530 on Monday, July 28, 2025
Total entries parsed: 2541


Map:   0%|          | 0/1686 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: [0.99312377 1.00697211]


Step,Training Loss,Validation Loss,Accuracy
500,0.034,0.020113,0.997033
1000,0.0082,0.207527,0.970326
1500,0.0002,0.122168,0.985163


BERT Test Set Results: {'eval_loss': 0.18265974521636963, 'eval_model_preparation_time': 0.0015, 'eval_accuracy': 0.9704142011834319, 'eval_precision': 0.943502824858757, 'eval_recall': 1.0, 'eval_f1': 0.9709302325581395, 'eval_runtime': 35.5487, 'eval_samples_per_second': 9.508, 'eval_steps_per_second': 1.21}
Accuracy: 0.9704
Precision: 0.9435
Recall: 1.0000
F1 Score: 0.9709


In [7]:
import os
model_dir = './answer_evaluation_model'
print("Current working directory:", os.getcwd())
print("Contents of ./answer_evaluation_model:", os.listdir(model_dir))
print("Does pytorch_model.bin exist?", 'pytorch_model.bin' in os.listdir(model_dir))

Current working directory: /Users/dasunsathsara/VVH/FinalYearProject-SmartHire-Backend/app
Contents of ./answer_evaluation_model: ['model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'config.json', 'vocab.txt']
Does pytorch_model.bin exist? False


In [10]:
import json
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util
import nltk
import os
import math

nltk.download('punkt')

# Set device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
print(f"Current working directory: {os.getcwd()} at 09:20 AM +0530 on Sunday, August 10, 2025")

# Load and deduplicate dataset
try:
    with open('enhanced_dataset_clean.json', 'r') as file:
        data = json.load(file)
except FileNotFoundError:
    print("Error: enhanced_dataset_clean.json not found in the current directory.")
    exit(1)
print(f"Total entries parsed from enhanced_dataset_clean.json: {len(data)}")

seen = set()
unique_data = []
for entry in data:
    key = (entry.get("input", ""), entry.get("output", ""), entry.get("label", ""))
    if key not in seen and ("output" in entry or "label" in entry):
        seen.add(key)
        unique_data.append(entry)
print(f"Total unique entries in enhanced_dataset_clean.json: {len(unique_data)}")

# Load classification data from converted_dataset_clean_updated.json or converted_dataset_clean.json
classification_data = []
try:
    with open('converted_dataset_clean_updated.json', 'r') as file:
        classification_data = json.load(file)
except FileNotFoundError:
    try:
        with open('converted_dataset_clean.json', 'r') as file:
            classification_data = json.load(file)
    except FileNotFoundError:
        print("Error: Neither converted_dataset_clean_updated.json nor converted_dataset_clean.json found.")
print(f"Classification entries parsed: {len(classification_data)}")

# Filter classification data to ensure only entries with "label" are included
classification_data = [entry for entry in classification_data if "label" in entry]
print(f"Filtered classification entries: {len(classification_data)}")

# Prepare generation and classification data
generation_data = [entry for entry in unique_data if "output" in entry and "label" not in entry and entry["output"].strip()]
print(f"Generation entries: {len(generation_data)}")
print(f"Classification entries: {len(classification_data)}")
print("Sample generation entry:", generation_data[:2] if generation_data else "No generation entries")
print("Sample classification entry:", classification_data[:2] if classification_data else "No classification entries")
if classification_data:
    print(f"Class distribution: {{'correct': {sum(1 for e in classification_data if e['label'] == 'correct')}, 'incorrect': {sum(1 for e in classification_data if e['label'] == 'incorrect')}}}")

# Step 1: Evaluate T5 Model (Question Generation)
try:
    tokenizer_gen = T5Tokenizer.from_pretrained('./question_generation_model')
    model_gen = T5ForConditionalGeneration.from_pretrained('./question_generation_model').to(device)
except Exception as e:
    print(f"Error loading T5 model: {e}")
    model_gen = None

if model_gen and generation_data:
    # Enhanced prompt format
    def create_prompt(entry):
        input_text = entry["input"].lower()
        if "education" in input_text:
            difficulty = "easy"
            q_type = "education"
        elif any(skill in input_text for skill in ["skills", "python", "docker", "performance tuning"]):
            difficulty = "medium"
            q_type = "skills"
        else:
            difficulty = "hard"
            q_type = "project"
        return f"generate question | difficulty={difficulty} | type={q_type} | context={entry['input']} | example=What improved X at Y?"

    generation_inputs = [create_prompt(entry) for entry in generation_data]
    generation_targets = [entry["output"] for entry in generation_data]
    generation_dataset = Dataset.from_dict({"input_text": generation_inputs, "target_text": generation_targets})

    def tokenize_generation(examples):
        model_inputs = tokenizer_gen(examples["input_text"], max_length=512, truncation=True, padding="max_length")
        labels = tokenizer_gen(examples["target_text"], max_length=128, truncation=True, padding="max_length").input_ids
        model_inputs["labels"] = labels
        return model_inputs

    tokenized_generation = generation_dataset.map(tokenize_generation, batched=True)
    train_size = int(0.8 * len(tokenized_generation))
    val_size = len(tokenized_generation) - train_size
    tokenized_val_gen = tokenized_generation.shuffle(seed=42).select(range(train_size, train_size + val_size))

    def evaluate_t5_accuracy(model, tokenizer, val_dataset, num_samples=200):
        model.eval()
        semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
        total_bleu = 0
        total_sem = 0
        valid_samples = 0
        val_samples = val_dataset.shuffle(seed=42).select(range(min(num_samples, len(val_dataset))))

        for sample in val_samples:
            input_text = sample["input_text"]
            target_text = tokenizer.decode(sample["labels"], skip_special_tokens=True).strip()
            target_starts_how = target_text.lower().startswith("how")
            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
            with torch.no_grad():
                outputs = model.generate(**inputs, max_length=128, num_beams=6, temperature=0.7)
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

            # Post-process to match target structure
            if target_starts_how and not generated_text.lower().startswith("how"):
                generated_text = "How " + generated_text.split(" ", 1)[1] if " " in generated_text else "How " + generated_text

            reference = [target_text.split()]
            candidate = generated_text.split()
            bleu = sentence_bleu(reference, candidate) if candidate and reference[0] else 0

            emb1 = semantic_model.encode(target_text, convert_to_tensor=True)
            emb2 = semantic_model.encode(generated_text, convert_to_tensor=True)
            semantic = util.pytorch_cos_sim(emb1, emb2).item()

            if not math.isnan(bleu) and not math.isnan(semantic):
                total_bleu += bleu
                total_sem += semantic
                valid_samples += 1
                print(f"Input: {input_text}\nTarget: {target_text}\nGenerated: {generated_text}\nBLEU: {bleu:.4f}, Semantic: {semantic:.4f}\n")

        if valid_samples > 0:
            avg_bleu = total_bleu / valid_samples
            avg_sem = total_sem / valid_samples
            print(f"T5 Evaluation on {valid_samples} samples: Avg BLEU: {avg_bleu:.4f}, Avg Semantic Similarity: {avg_sem:.4f}")
        else:
            print("No valid samples to evaluate.")
            avg_bleu, avg_sem = 0, 0

        return avg_bleu, avg_sem

    t5_bleu, t5_sem = evaluate_t5_accuracy(model_gen, tokenizer_gen, tokenized_val_gen)
else:
    print("No generation data or T5 model available. Skipping T5 evaluation.")
    t5_bleu, t5_sem = 0, 0

# Step 2: Evaluate BERT Model (Answer Evaluation)
model_dir = os.path.abspath('./answer_evaluation_model')
print(f"Checking model directory: {model_dir}")
print(f"Directory contents: {os.listdir(model_dir)}")
if not os.path.exists(model_dir) or ('pytorch_model.bin' not in os.listdir(model_dir) and 'model.safetensors' not in os.listdir(model_dir)):
    print(f"Error: Model weights (pytorch_model.bin or model.safetensors) not found in {model_dir}. Please re-run the training script.")
else:
    try:
        tokenizer_cls = BertTokenizer.from_pretrained(model_dir)
        model_cls = BertForSequenceClassification.from_pretrained(model_dir, use_safetensors=True).to(device)
    except Exception as e:
        print(f"Error loading BERT model: {e}")
        model_cls = None
    else:
        if classification_data:
            classification_inputs = [entry["input"] for entry in classification_data]
            classification_labels = [1 if entry["label"] == "correct" else 0 for entry in classification_data]
            classification_dataset = Dataset.from_dict({"text": classification_inputs, "label": classification_labels})

            def tokenize_classification(examples):
                return tokenizer_cls(examples["text"], padding="max_length", truncation=True, max_length=512)

            tokenized_classification = classification_dataset.map(tokenize_classification, batched=True)
            tokenized_data_list = tokenized_classification.to_list()

            if tokenized_data_list:
                train_val_data, test_data = train_test_split(tokenized_data_list, test_size=0.2, random_state=42)
                tokenized_test_cls = Dataset.from_list(test_data)

                class WeightedTrainer(Trainer):
                    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
                        labels = inputs.get("labels").to(device)
                        outputs = model(**inputs)
                        logits = outputs.get("logits")
                        loss_fct = torch.nn.CrossEntropyLoss()
                        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
                        return (loss, outputs) if return_outputs else loss

                training_args_cls = TrainingArguments(
                    output_dir='./results_cls_eval',
                    per_device_eval_batch_size=8,
                    dataloader_pin_memory=False
                )

                trainer_cls = WeightedTrainer(
                    model=model_cls,
                    args=training_args_cls,
                    eval_dataset=tokenized_test_cls,
                    compute_metrics=lambda pred: {
                        "accuracy": accuracy_score(pred.label_ids, np.argmax(pred.predictions, axis=1)),
                        "precision": precision_score(pred.label_ids, np.argmax(pred.predictions, axis=1), zero_division=0),
                        "recall": recall_score(pred.label_ids, np.argmax(pred.predictions, axis=1), zero_division=0),
                        "f1": f1_score(pred.label_ids, np.argmax(pred.predictions, axis=1), zero_division=0)
                    }
                )

                # Evaluate BERT model
                test_results = trainer_cls.evaluate()
                print("BERT Test Set Results:", test_results)

                # Additional detailed metrics
                predictions = trainer_cls.predict(tokenized_test_cls)
                labels = predictions.label_ids
                preds = np.argmax(predictions.predictions, axis=1)
                print(f"Accuracy: {accuracy_score(labels, preds):.4f}")
                print(f"Precision: {precision_score(labels, preds, zero_division=0):.4f}")
                print(f"Recall: {recall_score(labels, preds, zero_division=0):.4f}")
                print(f"F1 Score: {f1_score(labels, preds, zero_division=0):.4f}")
            else:
                print("No tokenized classification data available. Skipping BERT evaluation.")
        else:
            print("No classification data available. Skipping BERT evaluation.")

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


Using device: mps
Current working directory: /Users/dasunsathsara/VVH/FinalYearProject-SmartHire-Backend/app at 09:20 AM +0530 on Sunday, August 10, 2025
Total entries parsed from enhanced_dataset_clean.json: 428
Total unique entries in enhanced_dataset_clean.json: 428
Classification entries parsed: 1686
Filtered classification entries: 1686
Generation entries: 428
Classification entries: 1686
Sample generation entry: [{'input': 'CV section: Marketing Analyst, TrendWave Corp., 2023-2025, Increased lead generation by 40% with targeted campaigns.', 'output': 'How did you increase lead generation at TrendWave Corp.?'}, {'input': 'CV section: Software Engineer, CodeForge Inc., 2022-2024, Reduced bug rates by 25% with code reviews.', 'output': 'What code reviews reduced bug rates at CodeForge Inc.?'}]
Sample classification entry: [{'input': 'Question: What educational qualifications do you have in accounting?\nAnswer: I have an Associate in Accounting from Northern Maine Community College w

Map:   0%|          | 0/428 [00:00<?, ? examples/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: AI Developer, SmartMind Pro, 2020-2022, Enhanced algorithm efficiency by 12%. | example=What improved X at Y?
Target: What enhanced efficiency at SmartMind Pro?
Generated: What enhanced algorithm efficiency at SmartMind Pro?
BLEU: 0.4889, Semantic: 0.9254



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mosaic Maker, TileArt Pro, 2020-2022, Increased sales by 18%. | example=What improved X at Y?
Target: What increased sales at TileArt Pro?
Generated: What increased sales at TileArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Fashion Designer, StyleTrendz, 2020-2022, Launched 5 collections. | example=What improved X at Y?
Target: How did you launch collections at StyleTrendz?
Generated: How did you launch collections at StyleTrendz?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Event Coordinator, CelebrateNow Inc., 2020-2022, Increased attendance by 30% with promotions. | example=What improved X at Y?
Target: What promotions increased attendance at CelebrateNow Inc.?
Generated: What promotions increased attendance at CelebrateNow Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Stonemason, RockSolid Pro, 2020-2022, Reduced time by 12%. | example=What improved X at Y?
Target: How did you reduce time at RockSolid Pro?
Generated: How did you reduce time at RockSolid Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Clinical Researcher, HealthQuest Pro, 2023-2025, Speeded up trials by 20%. | example=What improved X at Y?
Target: How did you speed up trials at HealthQuest Pro?
Generated: How speeded trials at HealthQuest Pro?
BLEU: 0.3259, Semantic: 0.9181



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Sales Associate, RetailPeak Pro, 2022-2024, Raised sales by 10% with promotions. | example=What improved X at Y?
Target: What promotions raised sales at RetailPeak Pro?
Generated: What promotions raised sales at RetailPeak Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Doll Maker, TinyArt Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at TinyArt Pro?
Generated: What grew sales at TinyArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Astronomer, StarGaze Pro, 2022-2024, Discovered 5 new objects. | example=What improved X at Y?
Target: What aided discoveries at StarGaze Pro?
Generated: How did you discover objects at StarGaze Pro?
BLEU: 0.0000, Semantic: 0.7271



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Wood Turner, SpinArt Pro, 2022-2024, Increased sales by 10%. | example=What improved X at Y?
Target: What increased sales at SpinArt Pro?
Generated: What increased sales at SpinArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Fabric Dyer, DyeArt Pro, 2022-2024, Boosted sales by 10%. | example=What improved X at Y?
Target: What boosted sales at DyeArt Pro?
Generated: What boosted sales at DyeArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Leather Carver, HideCraft Pro, 2022-2024, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at HideCraft Pro?
Generated: What grew sales at HideCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Tattoo Artist, InkMaster Pro, 2020-2022, Grew clients by 20%. | example=What improved X at Y?
Target: What grew clients at InkMaster Pro?
Generated: What grew clients at InkMaster Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glass Carver, ClearCut Pro, 2022-2024, Increased sales by 12%. | example=What improved X at Y?
Target: What increased sales at ClearCut Pro?
Generated: What increased sales at ClearCut Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Customer Support Specialist, HelpVista Inc., 2021-2023, Improved resolution rate to 90% within 24 hours. | example=What improved X at Y?
Target: What improved the resolution rate at HelpVista Inc.?
Generated: What improved resolution rate at HelpVista Inc.?
BLEU: 0.6130, Semantic: 0.9918



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Animator, MotionMagic Inc., 2019-2021, Created 20 animations. | example=What improved X at Y?
Target: How did you create animations at MotionMagic Inc.?
Generated: How did you create animations at MotionMagic Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Toy Designer, PlayTime Pro, 2020-2022, Increased sales by 20%. | example=What improved X at Y?
Target: What increased sales at PlayTime Pro?
Generated: What increased sales at PlayTime Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Customer Support Lead, HelpCore Pro, 2020-2022, Reduced wait times by 18%. | example=What improved X at Y?
Target: What reduced wait times at HelpCore Pro?
Generated: How did you reduce wait times at HelpCore Pro?
BLEU: 0.4463, Semantic: 0.9458



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Customer Service Manager, HelpLink Pro, 2020-2022, Raised satisfaction scores by 15%. | example=What improved X at Y?
Target: How did you raise satisfaction at HelpLink Pro?
Generated: How did you raise satisfaction scores at HelpLink Pro?
BLEU: 0.5969, Semantic: 0.9275



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Historian, TimeTrace Pro, 2022-2024, Enhanced attendance by 25%. | example=What improved X at Y?
Target: What enhanced attendance at TimeTrace Pro?
Generated: What enhanced attendance at TimeTrace Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mosaic Maker, TileArt Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at TileArt Pro?
Generated: What grew sales at TileArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Potter, ClayArt Pro, 2022-2024, Grew sales by 20%. | example=What improved X at Y?
Target: What grew sales at ClayArt Pro?
Generated: What grew sales at ClayArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Event Planner, CelebratePro, 2021-2023, Increased attendance by 40%. | example=What improved X at Y?
Target: What increased attendance at CelebratePro?
Generated: What increased attendance at CelebratePro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Animator, FrameMaster Pro, 2022-2024, Increased wins by 20%. | example=What improved X at Y?
Target: What increased wins at FrameMaster Pro?
Generated: What increased wins at FrameMaster Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Electrical Engineer, PowerGrid Solutions, 2021-2023, Improved grid stability by 12%. | example=What improved X at Y?
Target: What improved grid stability at PowerGrid Solutions?
Generated: How did you improve stability at PowerGrid Solutions?
BLEU: 0.3457, Semantic: 0.9133



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Acoustics Engineer, SoundWave Pro, 2022-2024, Reduced noise by 10%. | example=What improved X at Y?
Target: What reduced noise at SoundWave Pro?
Generated: How did you reduce noise at SoundWave Pro?
BLEU: 0.3457, Semantic: 0.9287



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glass Painter, ColorCraft Pro, 2022-2024, Increased sales by 12%. | example=What improved X at Y?
Target: What increased sales at ColorCraft Pro?
Generated: What increased sales at ColorCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Speech Therapist, VoiceCare Pro, 2023-2025, Improved clarity by 12%. | example=What improved X at Y?
Target: How did you improve clarity at VoiceCare Pro?
Generated: How improved clarity at VoiceCare Pro?
BLEU: 0.3850, Semantic: 0.9553



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Embedded Developer, CircuitEdge Tech, 2021-2023, Reduced power usage by 15% with firmware. | example=What improved X at Y?
Target: What firmware reduced power usage at CircuitEdge Tech?
Generated: How did firmware reduce power usage at CircuitEdge Tech?
BLEU: 0.4671, Semantic: 0.9464



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Puppet Maker, ToyCraft Pro, 2021-2023, Increased production by 15%. | example=What improved X at Y?
Target: What increased production at ToyCraft Pro?
Generated: What increased production at ToyCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Landscape Architect, GreenScape Pro, 2021-2023, Reduced water use by 15%. | example=What improved X at Y?
Target: How did you reduce water use at GreenScape Pro?
Generated: How did you reduce water use at GreenScape Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Legal Advisor, LawFirm Pro, 2020-2022, Won 90% of cases. | example=What improved X at Y?
Target: What won cases at LawFirm Pro?
Generated: How did you win cases at LawFirm Pro?
BLEU: 0.3457, Semantic: 0.9070



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Doll Maker, TinyTreasures Pro, 2021-2023, Boosted sales by 18%. | example=What improved X at Y?
Target: What boosted sales at TinyTreasures Pro?
Generated: What boosted sales at TinyTreasures Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Software Architect, CodeCraft Pro, 2022-2024, Reduced technical debt by 20%. | example=What improved X at Y?
Target: What reduced technical debt at CodeCraft Pro?
Generated: How did you reduce technical debt at CodeCraft Pro?
BLEU: 0.4463, Semantic: 0.9572



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Basket Weaver, WeaveMaster Pro, 2020-2022, Increased sales by 15%. | example=What improved X at Y?
Target: What increased sales at WeaveMaster Pro?
Generated: What increased sales at WeaveMaster Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Safety Coordinator, SafeWork Pro, 2022-2024, Decreased incidents by 18% with training. | example=What improved X at Y?
Target: How did training decrease incidents at SafeWork Pro?
Generated: How did training reduce incidents at SafeWork Pro?
BLEU: 0.5000, Semantic: 0.9844



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Bead Weaver, GemBead Pro, 2020-2022, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at GemBead Pro?
Generated: What grew sales at GemBead Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mathematician, NumSolve Labs, 2019-2021, Solved 10 complex problems. | example=What improved X at Y?
Target: How did you solve problems at NumSolve Labs?
Generated: How did you solve complex problems at NumSolve Labs?
BLEU: 0.5969, Semantic: 0.9164



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Warehouse Supervisor, StockFlow Inc., 2019-2021, Increased efficiency by 30%. | example=What improved X at Y?
Target: What increased efficiency at StockFlow Inc.?
Generated: What increased efficiency at StockFlow Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Software Engineer, CodeForge Inc., 2022-2024, Reduced bug rates by 25% with code reviews. | example=What improved X at Y?
Target: What code reviews reduced bug rates at CodeForge Inc.?
Generated: How did code reviews reduce bug rates at CodeForge Inc.?
BLEU: 0.4518, Semantic: 0.9417



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glass Carver, ClearCut Pro, 2020-2022, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at ClearCut Pro?
Generated: What grew sales at ClearCut Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Rug Maker, WeftCraft Pro, 2021-2023, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at WeftCraft Pro?
Generated: What grew sales at WeftCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mobile Developer, AppRise Tech, 2020-2022, Increased app downloads by 20% with features. | example=What improved X at Y?
Target: What features increased downloads at AppRise Tech?
Generated: What features increased downloads at AppRise Tech?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Systems Analyst, TechFlow Innovations, 2023-2025, Enhanced system uptime to 99.5%. | example=What improved X at Y?
Target: How did you enhance uptime at TechFlow Innovations?
Generated: How enhanced system uptime at TechFlow Innovations?
BLEU: 0.3768, Semantic: 0.8480



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Operations Analyst, StreamlinePro Inc., 2022-2024, Reduced process delays by 20% with optimization. | example=What improved X at Y?
Target: How did optimization reduce delays at StreamlinePro Inc.?
Generated: How did optimization reduce delays at StreamlinePro Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Metal Engraver, SteelMark Pro, 2021-2023, Grew orders by 10%. | example=What improved X at Y?
Target: What grew orders at SteelMark Pro?
Generated: What grew orders at SteelMark Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Quilt Maker, PatchWork Pro, 2022-2024, Increased sales by 10%. | example=What improved X at Y?
Target: What increased sales at PatchWork Pro?
Generated: What increased sales at PatchWork Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Wildlife Ranger, NatureGuard Pro, 2023-2025, Reduced poaching by 12%. | example=What improved X at Y?
Target: How did you reduce poaching at NatureGuard Pro?
Generated: How did you reduce poaching at NatureGuard Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Lace Maker, ThreadLace Pro, 2023-2025, Increased sales by 18%. | example=What improved X at Y?
Target: What increased sales at ThreadLace Pro?
Generated: What increased sales at ThreadLace Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Compliance Manager, ReguSafe Inc., 2020-2022, Achieved 95% audit compliance with policies. | example=What improved X at Y?
Target: How did policies ensure compliance at ReguSafe Inc.?
Generated: How did you achieve compliance with policies at ReguSafe Inc.?
BLEU: 0.0000, Semantic: 0.9433



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Leatherworker, HideCraft Pro, 2022-2024, Increased sales by 18%. | example=What improved X at Y?
Target: What increased sales at HideCraft Pro?
Generated: What increased sales at HideCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mosaic Maker, TileArt Pro, 2022-2024, Boosted sales by 20%. | example=What improved X at Y?
Target: What boosted sales at TileArt Pro?
Generated: What boosted sales at TileArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Tapestry Weaver, ThreadTapestry Pro, 2021-2023, Boosted sales by 10%. | example=What improved X at Y?
Target: What boosted sales at ThreadTapestry Pro?
Generated: What boosted sales at ThreadTapestry Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Training Coordinator, SkillUp Corp., 2019-2021, Trained 200+ employees. | example=What improved X at Y?
Target: How did you train employees at SkillUp Corp.?
Generated: How did you train employees at SkillUp Corp.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Candle Designer, WaxDesign Pro, 2020-2022, Increased sales by 12%. | example=What improved X at Y?
Target: What increased sales at WaxDesign Pro?
Generated: What increased sales at WaxDesign Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Bead Weaver, GemBead Pro, 2023-2025, Boosted sales by 20%. | example=What improved X at Y?
Target: What boosted sales at GemBead Pro?
Generated: What boosted sales at GemBead Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Soap Maker, PureGlow Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at PureGlow Pro?
Generated: What grew sales at PureGlow Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Wood Sculptor, TimberSculpt Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at TimberSculpt Pro?
Generated: What grew sales at TimberSculpt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Puppeteer, PuppetPlay Pro, 2020-2022, Boosted attendance by 18%. | example=What improved X at Y?
Target: What boosted attendance at PuppetPlay Pro?
Generated: What boosted attendance at PuppetPlay Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Urban Planner, CityBuild Pro, 2020-2022, Improved traffic by 15%. | example=What improved X at Y?
Target: What improved traffic at CityBuild Pro?
Generated: How did you improve traffic at CityBuild Pro?
BLEU: 0.3457, Semantic: 0.9566



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Game Developer, PlayForge Studio, 2020-2022, Released 2 hit games. | example=What improved X at Y?
Target: How did you release games at PlayForge Studio?
Generated: How did you release hit games at PlayForge Studio?
BLEU: 0.5969, Semantic: 0.9478



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Quilt Maker, PatchWork Pro, 2023-2025, Boosted sales by 15%. | example=What improved X at Y?
Target: What boosted sales at PatchWork Pro?
Generated: What boosted sales at PatchWork Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Research Scientist, BioPeak Labs, 2021-2023, Accelerated experiments by 20% with tools. | example=What improved X at Y?
Target: What tools accelerated experiments at BioPeak Labs?
Generated: How did tools accelerate experiments at BioPeak Labs?
BLEU: 0.3656, Semantic: 0.9423



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Basket Weaver, WeaveArt Pro, 2023-2025, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at WeaveArt Pro?
Generated: What grew sales at WeaveArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Candle Maker, GlowLight Pro, 2020-2022, Grew sales by 20%. | example=What improved X at Y?
Target: What grew sales at GlowLight Pro?
Generated: What grew sales at GlowLight Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Paper Artist, FoldArt Pro, 2023-2025, Increased sales by 10%. | example=What improved X at Y?
Target: What increased sales at FoldArt Pro?
Generated: What increased sales at FoldArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Real Estate Agent, HomeQuest Pro, 2022-2024, Increased sales by 25%. | example=What improved X at Y?
Target: What increased sales at HomeQuest Pro?
Generated: What increased sales at HomeQuest Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Business Development Manager, GrowthPulse Pro, 2022-2024, Secured 10 new contracts. | example=What improved X at Y?
Target: What secured contracts at GrowthPulse Pro?
Generated: How did you secure contracts at GrowthPulse Pro?
BLEU: 0.3457, Semantic: 0.9421



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Toy Maker, PlayCraft Pro, 2020-2022, Increased sales by 18%. | example=What improved X at Y?
Target: What increased sales at PlayCraft Pro?
Generated: What increased sales at PlayCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Optometrist, ClearVision Center, 2020-2022, Reduced appointment delays by 15%. | example=What improved X at Y?
Target: What reduced appointment delays at ClearVision Center?
Generated: How did you reduce appointment delays at ClearVision Center?
BLEU: 0.4463, Semantic: 0.9595



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Mobile App Developer, AppTrendz, 2020-2022, Grew user base by 20%. | example=What improved X at Y?
Target: What grew the user base at AppTrendz?
Generated: What grew user base at AppTrendz?
BLEU: 0.5115, Semantic: 0.9938



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glassblower, ClearCraft Pro, 2023-2025, Boosted sales by 18%. | example=What improved X at Y?
Target: What boosted sales at ClearCraft Pro?
Generated: What boosted sales at ClearCraft Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Archivist, HistoryPreserve Pro, 2021-2023, Improved access by 20%. | example=What improved X at Y?
Target: What improved access at HistoryPreserve Pro?
Generated: How did you improve access at HistoryPreserve Pro?
BLEU: 0.3457, Semantic: 0.9459



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Metal Engraver, SteelArt Pro, 2020-2022, Boosted sales by 10%. | example=What improved X at Y?
Target: What boosted sales at SteelArt Pro?
Generated: What boosted sales at SteelArt Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Photographer, LensPro Pro, 2022-2024, Grew clients by 20%. | example=What improved X at Y?
Target: What grew clients at LensPro Pro?
Generated: What grew clients at LensPro Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Content Creator, MediaMix Pro, 2022-2024, Grew audience by 30% with videos. | example=What improved X at Y?
Target: What videos grew the audience at MediaMix Pro?
Generated: What videos grew audience at MediaMix Pro?
BLEU: 0.5154, Semantic: 0.9937



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Systems Architect, TechBuild Corp., 2020-2022, Enhanced system scalability by 15% with redesigns. | example=What improved X at Y?
Target: How did redesigns enhance scalability at TechBuild Corp.?
Generated: How redesigns enhanced system scalability at TechBuild Corp.?
BLEU: 0.3826, Semantic: 0.9075



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Event Photographer, SnapMoment Pro, 2022-2024, Increased bookings by 25%. | example=What improved X at Y?
Target: What increased bookings at SnapMoment Pro?
Generated: What increased bookings at SnapMoment Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Digital Strategist, MediaPulse Agency, 2023-2025, Boosted ad conversions by 30% with analytics. | example=What improved X at Y?
Target: What analytics boosted conversions at MediaPulse Agency?
Generated: What analytics boosted conversions at MediaPulse Agency?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Logistics Manager, FreightMaster Inc., 2021-2023, Reduced shipping costs by $300K annually. | example=What improved X at Y?
Target: How did you reduce shipping costs at FreightMaster Inc.?
Generated: How did you reduce shipping costs at FreightMaster Inc.?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Transportation Planner, MoveEasy Pro, 2021-2023, Reduced fuel costs by 12%. | example=What improved X at Y?
Target: How did you reduce fuel costs at MoveEasy Pro?
Generated: How did you reduce costs at MoveEasy Pro?
BLEU: 0.6102, Semantic: 0.7977



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Veterinary Assistant, PetCare Pro, 2021-2023, Improved recovery by 10%. | example=What improved X at Y?
Target: What improved recovery at PetCare Pro?
Generated: What improved recovery at PetCare Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Glass Blower, ClearWind Pro, 2021-2023, Grew sales by 15%. | example=What improved X at Y?
Target: What grew sales at ClearWind Pro?
Generated: What grew sales at ClearWind Pro?
BLEU: 1.0000, Semantic: 1.0000



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Network Administrator, ConnectPro Systems, 2021-2023, Reduced downtime by 18% with network upgrades. | example=What improved X at Y?
Target: How did upgrades reduce downtime at ConnectPro Systems?
Generated: How network upgrades reduced downtime at ConnectPro Systems?
BLEU: 0.3826, Semantic: 0.9328



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input: generate question | difficulty=hard | type=project | context=CV section: Marine Biologist, OceanWatch Pro, 2022-2024, Protected 10% more species. | example=What improved X at Y?
Target: How did you protect species at OceanWatch Pro?
Generated: How did you protect species at OceanWatch Pro?
BLEU: 1.0000, Semantic: 1.0000

Input: generate question | difficulty=hard | type=project | context=CV section: Childcare Worker, LittleSteps Pro, 2022-2024, Improved development by 12%. | example=What improved X at Y?
Target: How did you improve development at LittleSteps Pro?
Generated: How improved development at LittleSteps Pro?
BLEU: 0.3850, Semantic: 0.9547

T5 Evaluation on 86 samples: Avg BLEU: 0.8019, Avg Semantic Similarity: 0.9762
Checking model directory: /Users/dasunsathsara/VVH/FinalYearProject-SmartHire-Backend/app/answer_evaluation_model
Directory contents: ['model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'config.json', 'vocab.txt']


Map:   0%|          | 0/1686 [00:00<?, ? examples/s]

BERT Test Set Results: {'eval_loss': 0.18401388823986053, 'eval_model_preparation_time': 0.0014, 'eval_accuracy': 0.9704142011834319, 'eval_precision': 0.943502824858757, 'eval_recall': 1.0, 'eval_f1': 0.9709302325581395, 'eval_runtime': 27.0463, 'eval_samples_per_second': 12.497, 'eval_steps_per_second': 1.59}
Accuracy: 0.9704
Precision: 0.9435
Recall: 1.0000
F1 Score: 0.9709


In [2]:
import json
import math
import nltk
import torch
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,  # <-- use seq2seq classes
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util

nltk.download('punkt')

# ---------------------------
# Device
# ---------------------------
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
)
print(f"[QG] Using device: {device}")

# ---------------------------
# Load + dedupe data
# ---------------------------
with open("enhanced_dataset_clean.json", "r") as f:
    raw = json.load(f)

seen = set()
examples = []
for e in raw:
    key = (e.get("input", "").strip(), e.get("output", "").strip())
    if key not in seen and e.get("output", "").strip() and "label" not in e:
        seen.add(key)
        examples.append({"input": e["input"], "target": e["output"]})

print(f"[QG] Unique examples: {len(examples)}")


def make_prompt(entry):
    return (
        "Generate exactly ONE concise interview question from the CV snippet. "
        "Keep it natural (9–20 words) and end with '?'.\n"
        f"Snippet: {entry['input']}\n"
        "Question:"
    )


dataset = Dataset.from_dict({
    "input_text": [make_prompt(e) for e in examples],
    "target_text": [e["target"] for e in examples]
})

# ---------------------------
# Tokenizer + tokenize
# ---------------------------
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)


def tok_fn(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        padding="max_length",
        truncation=True,
        max_length=192
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["target_text"],
            padding="max_length",
            truncation=True,
            max_length=40
        )
    # Replace pad in labels with -100 for CE loss masking
    labels_ids = []
    for ids in labels["input_ids"]:
        labels_ids.append([(-100 if t == tokenizer.pad_token_id else t) for t in ids])
    model_inputs["labels"] = labels_ids
    return model_inputs


tokenized = dataset.map(tok_fn, batched=True, remove_columns=["input_text", "target_text"])

# ---------------------------
# Split
# ---------------------------
splits = tokenized.train_test_split(test_size=0.2, seed=42)
train_valid = splits["train"].train_test_split(test_size=0.25, seed=42)
train_ds = train_valid["train"]
val_ds = train_valid["test"]
test_ds = splits["test"]
print(f"[QG] train={len(train_ds)} val={len(val_ds)} test={len(test_ds)}")

# ---------------------------
# Model
# ---------------------------
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Correct T5 dropout knobs:
model.config.dropout_rate = 0.10
model.config.attention_dropout_rate = 0.10
model.to(device)

# ---------------------------
# Trainer
# ---------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

args = Seq2SeqTrainingArguments(  # <-- seq2seq args
    output_dir="./results_gen",
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.06,
    logging_dir="./logs_gen",
    logging_steps=50,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    predict_with_generate=True,  # <-- now valid
    generation_max_length=32,  # <-- add generation params
    generation_num_beams=6,
    report_to="none",
    save_safetensors=True
)

trainer = Seq2SeqTrainer(  # <-- seq2seq trainer
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

# ---------------------------
# Save
# ---------------------------
model.save_pretrained("./question_generation_model")
tokenizer.save_pretrained("./question_generation_model")
print("[QG] Saved to ./question_generation_model")


# ---------------------------
# Evaluation (BLEU + semantic)
# ---------------------------
@torch.inference_mode()
def evaluate(val_dataset, samples=200):
    model.eval()
    sem = SentenceTransformer("all-MiniLM-L6-v2")
    subset = val_dataset.shuffle(seed=42).select(range(min(samples, len(val_dataset))))

    total_bleu = 0.0
    total_sim = 0.0
    n = 0

    for ex in subset:
        ids = torch.tensor(ex["input_ids"]).unsqueeze(0).to(device)
        att = torch.tensor(ex["attention_mask"]).unsqueeze(0).to(device)

        # Rebuild target text correctly (replace -100 with pad before decode)
        label_ids = [(tid if tid != -100 else tokenizer.pad_token_id) for tid in ex["labels"]]
        target = tokenizer.decode(label_ids, skip_special_tokens=True)

        out = model.generate(
            input_ids=ids,
            attention_mask=att,
            num_beams=6,  # stable
            max_new_tokens=32,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        gen = tokenizer.decode(out[0], skip_special_tokens=True)

        ref = [target.split()]
        cand = gen.split()
        bleu = sentence_bleu(ref, cand) if cand and ref[0] else 0.0
        s = util.pytorch_cos_sim(
            sem.encode([target], convert_to_tensor=True),
            sem.encode([gen], convert_to_tensor=True),
        ).item()

        if not math.isnan(bleu) and not math.isnan(s):
            total_bleu += bleu
            total_sim += s
            n += 1

    if n == 0:
        print("[QG] No valid eval samples.")
        return 0.0, 0.0

    print(f"[QG] Avg BLEU: {total_bleu / n:.4f} | Avg Semantic: {total_sim / n:.4f}")
    return total_bleu / n, total_sim / n


evaluate(val_ds)


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


[QG] Using device: mps
[QG] Unique examples: 428


Map:   0%|          | 0/428 [00:00<?, ? examples/s]



[QG] train=256 val=86 test=86


  trainer = Seq2SeqTrainer(  # <-- seq2seq trainer
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


[QG] Saved to ./question_generation_model


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[QG] Avg BLEU: 0.7824 | Avg Semantic: 0.9692


(0.7824081357018974, 0.9691774110461391)

In [5]:
# train_answer_evaluation_hardneg.py
import json, numpy as np, torch
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

device = torch.device("cuda" if torch.cuda.is_available()
                      else "mps" if torch.backends.mps.is_available()
                      else "cpu")
print(f"[CLS] device={device}")

# ----- load augmented -----
with open("converted_dataset_hardneg.json","r") as f:
    rows = json.load(f)

texts  = [r["input"] for r in rows]
labels = [1 if r["label"] == "correct" else 0 for r in rows]
ds_all = Dataset.from_dict({"text": texts, "labels": labels})

splits = ds_all.train_test_split(test_size=0.2, seed=42)
train_valid = splits["train"].train_test_split(test_size=0.25, seed=42)
train_ds, val_ds = train_valid["train"], train_valid["test"]
test_ds = splits["test"]
print(f"train={len(train_ds)} val={len(val_ds)} test={len(test_ds)}")

tok = BertTokenizer.from_pretrained("bert-base-uncased")

def tok_fn(ex):
    return tok(ex["text"], padding=False, truncation=True, max_length=512)

train_ds = train_ds.map(tok_fn, batched=True)
val_ds   = val_ds.map(tok_fn, batched=True)
test_ds  = test_ds.map(tok_fn, batched=True)

collator = DataCollatorWithPadding(tokenizer=tok)

y = np.array(train_ds["labels"])
w = compute_class_weight(class_weight="balanced", classes=np.array([0,1]), y=y)
class_weights = torch.tensor(w, dtype=torch.float).to(device)
print("[CLS] class weights:", w)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.config.id2label = {0:"incorrect",1:"correct"}
model.config.label2id = {"incorrect":0,"correct":1}
model.to(device)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits  = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def metrics_fn(p):
    preds = p.predictions.argmax(axis=1)
    acc = accuracy_score(p.label_ids, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average="binary", zero_division=0)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

args = TrainingArguments(
    output_dir="./results_cls_hardneg",
    num_train_epochs=6,                    # start a bit lower; dataset is harder
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    logging_dir="./logs_cls_hardneg",
    logging_steps=50,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    save_safetensors=True
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=metrics_fn
)

trainer.train()

# Save for the API
model.save_pretrained("./answer_evaluation_model")
tok.save_pretrained("./answer_evaluation_model")
print("[CLS] Saved to ./answer_evaluation_model")

# Final test set report
print("[CLS] Test:", trainer.evaluate(test_ds))


[CLS] device=mps
train=1512 val=504 test=504


Map:   0%|          | 0/1512 [00:00<?, ? examples/s]

Map:   0%|          | 0/504 [00:00<?, ? examples/s]

Map:   0%|          | 0/504 [00:00<?, ? examples/s]

[CLS] class weights: [0.74556213 1.51807229]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.1077,0.36964,0.884921,0.80663,0.863905,0.834286


[CLS] Saved to ./answer_evaluation_model


[CLS] Test: {'eval_loss': 0.3476049304008484, 'eval_accuracy': 0.8809523809523809, 'eval_precision': 0.8021390374331551, 'eval_recall': 0.8670520231213873, 'eval_f1': 0.8333333333333334, 'eval_runtime': 9.599, 'eval_samples_per_second': 52.506, 'eval_steps_per_second': 3.334, 'epoch': 6.0}
