In [4]:
import os
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"


In [12]:
!pip install sentence-transformers

Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/6f/ff/178f08ea5ebc1f9193d9de7f601efe78c01748347875c8438f66f5cecc19/sentence_transformers-5.0.0-py3-none-any.whl.metadata
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-5.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
!pip install transformers datasets torch nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import json
import os
import shutil
import time
import random
import nltk
import torch
import numpy as np
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# Set seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

set_seed(42)

# NLTK setup
nltk_data_path = os.path.expanduser("~/nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
try:
    nltk.data.find('tokenizers/punkt_tab')
    nltk.data.find('tokenizers/punkt')
    print("[QG] NLTK resources already downloaded")
except LookupError:
    print("[QG] Downloading NLTK resources...")
    nltk.download('punkt', download_dir=nltk_data_path, quiet=True)
    nltk.download('punkt_tab', download_dir=nltk_data_path, quiet=True)
    print("[QG] NLTK resources downloaded")

# Device setup with MPS handling
device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
print(f"[QG] Using device: {device}")

# Handle MPS limitations
use_fp16 = torch.cuda.is_available()  # Only use fp16 on CUDA
if device.type == "mps":
    print("[QG] MPS detected - using fp32 and optimized settings")

# Clean output directory
output_dir = "./results_technical_5986"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)

# Check disk space
try:
    total, used, free = shutil.disk_usage(output_dir)
    free_gb = free / (2**30)
    print(f"[QG] Available disk space: {free_gb:.2f} GB")
except Exception as e:
    print(f"[QG] Error checking disk space: {e}")

# Load the expanded dataset
dataset_file = "extended_technical_dataset.json"
with open(dataset_file, "r", encoding='utf-8') as f:
    examples = json.load(f)

print(f"[QG] Loaded {len(examples)} examples from {dataset_file}")

# Enhanced validation for larger dataset
def is_valid_example(example):
    """Robust validation for training examples"""
    if not isinstance(example, dict) or 'input' not in example or 'output' not in example:
        return False

    input_text = str(example['input']).strip()
    output_text = str(example['output']).strip()

    if len(input_text) < 8 or len(output_text) < 8:
        return False

    if not output_text.endswith('?'):
        return False

    if len(input_text) > 600 or len(output_text) > 250:
        return False

    if input_text.lower() == output_text.lower():
        return False

    return True

# Filter examples
valid_examples = [ex for ex in examples if is_valid_example(ex)]
print(f"[QG] Using {len(valid_examples)} valid examples (filtered {len(examples) - len(valid_examples)})")

# Advanced prompt engineering
def create_contextual_prompt(entry):
    """Create highly contextual prompts for better training"""
    input_text = entry['input'].strip()

    if "CV skill:" in input_text:
        skill = input_text.replace("CV skill:", "").strip()
        return f"Create a technical interview question about {skill}:\nQuestion:"

    elif "CV project:" in input_text:
        project = input_text.replace("CV project:", "").strip()
        return f"Generate a project-specific interview question for: {project}\nQuestion:"

    elif "CV work:" in input_text:
        work = input_text.replace("CV work:", "").strip()
        return f"Create an experience-based question about: {work}\nQuestion:"

    elif "CV section:" in input_text:
        section = input_text.replace("CV section:", "").strip()
        return f"Generate a comprehensive question about: {section}\nQuestion:"

    elif "CV achievement:" in input_text:
        achievement = input_text.replace("CV achievement:", "").strip()
        return f"Create a question about this achievement: {achievement}\nQuestion:"

    elif "CV experience:" in input_text:
        exp = input_text.replace("CV experience:", "").strip()
        return f"Generate an {exp} level interview question:\nQuestion:"

    elif "CV education:" in input_text:
        edu = input_text.replace("CV education:", "").strip()
        return f"Create an education-based question about: {edu}\nQuestion:"

    elif "CV certification:" in input_text:
        cert = input_text.replace("CV certification:", "").strip()
        return f"Generate a certification question about: {cert}\nQuestion:"

    else:
        return f"Generate a technical interview question for: {input_text}\nQuestion:"

# Smart data augmentation
def augment_dataset_smart(examples, target_size=4000):
    """Intelligent augmentation to reach target size"""
    current_size = len(examples)
    if current_size >= target_size:
        return examples

    augmented = examples.copy()
    augmentations_needed = target_size - current_size

    skill_examples = [ex for ex in examples if "CV skill:" in ex['input']]
    project_examples = [ex for ex in examples if "CV project:" in ex['input']]
    work_examples = [ex for ex in examples if "CV work:" in ex['input']]

    augmentation_strategies = []

    for ex in skill_examples[:min(200, len(skill_examples))]:
        if random.random() < 0.3:
            variations = [
                ex['input'].replace("CV skill:", "CV skill (Advanced):"),
                ex['input'].replace("CV skill:", "CV skill (Intermediate):"),
                ex['input'].replace("CV skill:", "CV skill (Expert):"),
            ]
            for var in variations:
                if len(augmentation_strategies) < augmentations_needed // 3:
                    augmentation_strategies.append({'input': var, 'output': ex['output']})

    for ex in project_examples[:min(150, len(project_examples))]:
        if random.random() < 0.4:
            contexts = [
                ex['input'] + ", team lead role",
                ex['input'] + ", full-stack development",
                ex['input'] + ", agile methodology"
            ]
            for ctx in contexts:
                if len(augmentation_strategies) < 2 * augmentations_needed // 3:
                    augmentation_strategies.append({'input': ctx, 'output': ex['output']})

    for ex in work_examples[:min(100, len(work_examples))]:
        if random.random() < 0.3:
            exp_vars = [
                ex['input'] + ", technical leadership",
                ex['input'] + ", cross-functional collaboration",
                ex['input'] + ", performance optimization focus"
            ]
            for exp_var in exp_vars:
                if len(augmentation_strategies) < augmentations_needed:
                    augmentation_strategies.append({'input': exp_var, 'output': ex['output']})

    augmented.extend(augmentation_strategies[:augmentations_needed])
    random.shuffle(augmented)

    print(f"[QG] Augmented dataset from {current_size} to {len(augmented)} examples")
    return augmented

# Apply smart augmentation
print("[QG] Applying intelligent data augmentation...")
augmented_examples = augment_dataset_smart(valid_examples, target_size=4000)

# Create dataset
dataset = Dataset.from_dict({
    "input_text": [create_contextual_prompt(e) for e in augmented_examples],
    "target_text": [e['output'] for e in augmented_examples]
})

# Load the model
model_name = "google/flan-t5-base"
print(f"[QG] Loading model: {model_name}")

try:
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        device_map=None
    )

    special_tokens = ["<TECHNICAL>", "<SKILL>", "<PROJECT>", "<EXPERIENCE>", "<ACHIEVEMENT>"]
    tokenizer.add_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))

    model.gradient_checkpointing_enable()
    print("[QG] Gradient checkpointing enabled")

    model = model.to(device)
    print(f"[QG] Model loaded successfully on {device}")

except Exception as e:
    print(f"[QG] Error loading model: {e}")
    raise

# Enhanced preprocessing
def preprocess_function(examples):
    """Enhanced preprocessing for larger dataset"""
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        truncation=True,
        padding=False
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_text"],
            max_length=32,
            truncation=True,
            padding=False
        )

    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Process dataset
print("[QG] Tokenizing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing"
)

# Balanced splits
def create_balanced_splits(dataset, train_ratio=0.75, val_ratio=0.15):
    """Create balanced splits ensuring good distribution"""
    train_test = dataset.train_test_split(test_size=(1-train_ratio), seed=42)
    val_test = train_test['test'].train_test_split(test_size=(1-train_ratio-val_ratio)/(1-train_ratio), seed=42)

    return {
        'train': train_test['train'],
        'validation': val_test['train'],
        'test': val_test['test']
    }

splits = create_balanced_splits(tokenized_dataset)
train_dataset = splits['train']
val_dataset = splits['validation']
test_dataset = splits['test']

print(f"[QG] Dataset splits: train={len(train_dataset)}, val={len(val_dataset)}, test={len(test_dataset)}")

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=128,
    label_pad_token_id=-100
)

# FIXED: Enhanced metrics computation with proper tensor handling
def compute_comprehensive_metrics(eval_pred):
    """Comprehensive metrics with better error handling and MPS compatibility"""
    predictions, labels = eval_pred

    # CRITICAL FIX: Convert tensors to CPU numpy arrays properly
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    # Handle nested lists/arrays
    if isinstance(predictions, (list, tuple)):
        predictions = np.array([pred.cpu().numpy() if isinstance(pred, torch.Tensor) else pred for pred in predictions])
    if isinstance(labels, (list, tuple)):
        labels = np.array([label.cpu().numpy() if isinstance(label, torch.Tensor) else label for label in labels])

    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    try:
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    except Exception as e:
        print(f"[QG] Decoding error: {e}")
        return {"bleu": 0.0, "semantic_similarity": 0.0, "rouge1": 0.0, "question_validity": 0.0}

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # BLEU Score
    bleu_scores = []
    for pred, label in zip(decoded_preds, decoded_labels):
        try:
            if pred and label:
                pred_tokens = word_tokenize(pred.lower())
                label_tokens = word_tokenize(label.lower())
                if pred_tokens and label_tokens:
                    bleu = sentence_bleu([label_tokens], pred_tokens)
                else:
                    bleu = 0.0
            else:
                bleu = 0.0
        except Exception:
            bleu = 0.0
        bleu_scores.append(bleu)

    avg_bleu = np.mean(bleu_scores) if bleu_scores else 0.0

    # Semantic Similarity with memory management
    try:
        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        pred_embeddings = sentence_model.encode(decoded_preds, batch_size=16)  # Reduced batch size
        label_embeddings = sentence_model.encode(decoded_labels, batch_size=16)
        similarities = util.cos_sim(pred_embeddings, label_embeddings)
        avg_similarity = similarities.diagonal().mean().item()
        del sentence_model  # Free memory
        if device.type == "mps":
            torch.mps.empty_cache()
    except Exception as e:
        print(f"[QG] Similarity error: {e}")
        avg_similarity = 0.0

    # ROUGE Score
    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge1_scores = []
        rougeL_scores = []

        for pred, label in zip(decoded_preds, decoded_labels):
            if pred.strip() and label.strip():
                score = scorer.score(label, pred)
                rouge1_scores.append(score['rouge1'].fmeasure)
                rougeL_scores.append(score['rougeL'].fmeasure)
            else:
                rouge1_scores.append(0.0)
                rougeL_scores.append(0.0)

        avg_rouge1 = np.mean(rouge1_scores)
        avg_rougeL = np.mean(rougeL_scores)
    except Exception as e:
        print(f"[QG] ROUGE error: {e}")
        avg_rouge1 = avg_rougeL = 0.0

    # Question validity
    valid_questions = sum(1 for pred in decoded_preds if pred.strip().endswith('?'))
    question_validity = valid_questions / len(decoded_preds) if decoded_preds else 0.0

    return {
        "bleu": avg_bleu,
        "semantic_similarity": avg_similarity,
        "rouge1": avg_rouge1,
        "rougeL": avg_rougeL,
        "question_validity": question_validity
    }

# FIXED: Custom prediction function with proper tensor handling
def safe_predict_and_evaluate(trainer, dataset):
    """Safe prediction and evaluation with proper MPS handling"""
    try:
        # Use trainer's built-in predict method
        results = trainer.predict(dataset)

        # Extract and properly handle predictions and labels
        predictions = results.predictions
        labels = results.label_ids

        # Convert to CPU if needed
        if isinstance(predictions, torch.Tensor):
            predictions = predictions.cpu().numpy()
        if isinstance(labels, torch.Tensor):
            labels = labels.cpu().numpy()

        # Compute metrics
        metrics = compute_comprehensive_metrics((predictions, labels))

        # Clear cache if MPS
        if device.type == "mps":
            torch.mps.empty_cache()

        return metrics, results

    except Exception as e:
        print(f"[QG] Prediction error: {e}")
        import traceback
        traceback.print_exc()
        return {
            "bleu": 0.0,
            "semantic_similarity": 0.0,
            "rouge1": 0.0,
            "rougeL": 0.0,
            "question_validity": 0.0
        }, None

# Optimized training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=15,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_ratio=0.15,
    lr_scheduler_type="cosine",
    logging_strategy="steps",
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=400,
    save_strategy="steps",
    save_steps=800,
    save_total_limit=4,
    load_best_model_at_end=True,
    metric_for_best_model="eval_semantic_similarity",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=32,
    generation_num_beams=4,
    report_to="none",
    save_safetensors=True,
    fp16=False,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    ignore_data_skip=True,
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_comprehensive_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
)

# Training with progress tracking
def train_enhanced_model():
    """Train with enhanced monitoring"""
    try:
        print(f"[QG] Starting enhanced training at {time.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"[QG] Model: {model_name}")
        print(f"[QG] Training examples: {len(train_dataset)}")
        print(f"[QG] Validation examples: {len(val_dataset)}")
        print(f"[QG] Expected training time: 1-2 hours")

        start_time = time.time()

        train_result = trainer.train()

        end_time = time.time()
        training_time = end_time - start_time
        hours, remainder = divmod(training_time, 3600)
        minutes, seconds = divmod(remainder, 60)

        print(f"[QG] Training completed in {int(hours)}h {int(minutes)}m {int(seconds)}s")
        print(f"[QG] Final training loss: {train_result.training_loss:.4f}")

        if device.type == "mps":
            torch.mps.empty_cache()
        return True, training_time

    except Exception as e:
        print(f"[QG] Training failed: {e}")
        import traceback
        traceback.print_exc()
        if device.type == "mps":
            torch.mps.empty_cache()
        return False, 0

# Execute training
success, training_time = train_enhanced_model()

if success:
    final_model_path = "./technical_qg_enhanced_5986"
    print(f"[QG] Saving model to {final_model_path}")

    trainer.save_model(final_model_path)
    tokenizer.save_pretrained(final_model_path)

    print("[QG] Running comprehensive evaluation...")

    # FIXED: Use safe prediction method
    test_metrics, test_results = safe_predict_and_evaluate(trainer, test_dataset)

    print("\n[QG] === FINAL RESULTS ===")
    for key, value in test_metrics.items():
        print(f"  {key.upper()}: {value:.4f}")

    # Save results
    results_summary = {
        "model_name": model_name,
        "dataset_size": len(augmented_examples),
        "training_time": training_time,
        "metrics": test_metrics,
        "training_config": {
            "epochs": training_args.num_train_epochs,
            "batch_size": training_args.per_device_train_batch_size,
            "learning_rate": training_args.learning_rate,
            "max_length": 128
        }
    }

    with open(f"{final_model_path}/training_summary.json", "w") as f:
        json.dump(results_summary, f, indent=2)

    # Quality assessment
    expected_bleu = 0.15
    expected_similarity = 0.45

    actual_bleu = test_metrics.get('bleu', 0)
    actual_similarity = test_metrics.get('semantic_similarity', 0)

    print(f"\n[QG] === QUALITY ASSESSMENT ===")
    print(f"BLEU Score: {actual_bleu:.4f} (target: >{expected_bleu:.3f}) {'✓' if actual_bleu > expected_bleu else '✗'}")
    print(f"Semantic Similarity: {actual_similarity:.4f} (target: >{expected_similarity:.3f}) {'✓' if actual_similarity > expected_similarity else '✗'}")

    if actual_bleu > expected_bleu and actual_similarity > expected_similarity:
        print("🎉 Model meets quality targets for demo/prototype use!")
    else:
        print("⚠️  Model shows improvement but may need further training for optimal results")

    print(f"\n[QG] Model saved to: {final_model_path}")

else:
    print("[QG] Training failed. Check error messages above.")

# Final cleanup
if device.type == "mps":
    torch.mps.empty_cache()
print("[QG] Training script completed!")

[QG] NLTK resources already downloaded
[QG] Using device: mps
[QG] MPS detected - using fp32 and optimized settings
[QG] Available disk space: 47.95 GB
[QG] Loaded 5986 examples from extended_technical_dataset.json
[QG] Using 5985 valid examples (filtered 1)
[QG] Applying intelligent data augmentation...
[QG] Loading model: google/flan-t5-base
[QG] Gradient checkpointing enabled
[QG] Model loaded successfully on mps
[QG] Tokenizing dataset...


Tokenizing:   0%|          | 0/5985 [00:00<?, ? examples/s]



[QG] Dataset splits: train=4488, val=898, test=599
[QG] Starting enhanced training at 2025-09-06 16:02:08
[QG] Model: google/flan-t5-base
[QG] Training examples: 4488
[QG] Validation examples: 898
[QG] Expected training time: 1-2 hours


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss,Bleu,Semantic Similarity,Rouge1,Rougel,Question Validity
400,11.9122,2.541435,0.00464,0.474487,0.229764,0.228458,1.0
800,9.3008,1.950335,0.030655,0.484423,0.270777,0.236885,1.0
1200,8.0238,1.558015,0.030917,0.485982,0.264527,0.227127,1.0
1600,7.8923,1.391191,0.048163,0.464462,0.251845,0.236619,1.0
2000,6.9592,1.319467,0.047574,0.462591,0.24634,0.231721,1.0
2400,6.873,1.285872,0.042756,0.453392,0.238797,0.223742,1.0
2800,6.5694,1.258387,0.044933,0.458945,0.237846,0.224225,1.0
3200,6.3238,1.244298,0.039558,0.459723,0.224239,0.211545,1.0
3600,6.6311,1.226076,0.044343,0.459009,0.238892,0.22522,1.0


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

[QG] Training completed in 2h 34m 49s
[QG] Final training loss: 8.0599
[QG] Saving model to ./technical_qg_enhanced_5986
[QG] Running comprehensive evaluation...




Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr


[QG] === FINAL RESULTS ===
  BLEU: 0.0262
  SEMANTIC_SIMILARITY: 0.4882
  ROUGE1: 0.2640
  ROUGEL: 0.2290
  QUESTION_VALIDITY: 1.0000

[QG] === QUALITY ASSESSMENT ===
BLEU Score: 0.0262 (target: >0.150) ✗
Semantic Similarity: 0.4882 (target: >0.450) ✓
⚠️  Model shows improvement but may need further training for optimal results

[QG] Model saved to: ./technical_qg_enhanced_5986
[QG] Training script completed!


In [3]:
import json
import os
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import nltk
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# Set seeds for reproducibility
def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

set_seed(42)

# NLTK setup
nltk_data_path = os.path.expanduser("~/nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
try:
    nltk.data.find('tokenizers/punkt_tab')
    nltk.data.find('tokenizers/punkt')
    print("[QG-Test] NLTK resources already downloaded")
except LookupError:
    print("[QG-Test] Downloading NLTK resources...")
    nltk.download('punkt', download_dir=nltk_data_path, quiet=True)
    nltk.download('punkt_tab', download_dir=nltk_data_path, quiet=True)
    print("[QG-Test] NLTK resources downloaded")

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
print(f"[QG-Test] Using device: {device}")

# Load model and tokenizer
model_path = "./technical_qg_enhanced_5986"
try:
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.float32,
        device_map=None
    ).to(device)
    print(f"[QG-Test] Model and tokenizer loaded from {model_path}")
except Exception as e:
    print(f"[QG-Test] Error loading model: {e}")
    raise

# Sample test inputs (replace with your own test data)
test_examples = [
    {
        "input": "CV skill: Python programming",
        "reference": "What are the key differences between Python 2 and Python 3?"
    },
    {
        "input": "CV project: Developed a machine learning model for sentiment analysis",
        "reference": "Can you explain the architecture of the sentiment analysis model you developed?"
    },
    {
        "input": "CV work: Led a team in developing a RESTful API",
        "reference": "How did you ensure the scalability and security of the RESTful API you developed?"
    }
]

# Create contextual prompts (same as training)
def create_contextual_prompt(entry):
    input_text = entry['input'].strip()
    if "CV skill:" in input_text:
        skill = input_text.replace("CV skill:", "").strip()
        return f"Create a technical interview question about {skill}:\nQuestion:"
    elif "CV project:" in input_text:
        project = input_text.replace("CV project:", "").strip()
        return f"Generate a project-specific interview question for: {project}\nQuestion:"
    elif "CV work:" in input_text:
        work = input_text.replace("CV work:", "").strip()
        return f"Create an experience-based question about: {work}\nQuestion:"
    else:
        return f"Generate a technical interview question for: {input_text}\nQuestion:"

# Preprocess inputs
test_inputs = [create_contextual_prompt(ex) for ex in test_examples]
test_references = [ex['reference'] for ex in test_examples]

# Tokenize inputs
inputs = tokenizer(
    test_inputs,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt"
).to(device)

# Generate questions
model.eval()
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=32,
        num_beams=4,
        length_penalty=1.0,
        early_stopping=True
    )

# Decode generated outputs
generated_questions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
generated_questions = [q.strip() for q in generated_questions]

# Compute comprehensive metrics
def compute_comprehensive_metrics(predictions, references):
    try:
        # BLEU Score
        bleu_scores = []
        for pred, ref in zip(predictions, references):
            if pred and ref:
                pred_tokens = word_tokenize(pred.lower())
                ref_tokens = word_tokenize(ref.lower())
                bleu = sentence_bleu([ref_tokens], pred_tokens) if pred_tokens and ref_tokens else 0.0
            else:
                bleu = 0.0
            bleu_scores.append(bleu)
        avg_bleu = np.mean(bleu_scores) if bleu_scores else 0.0

        # Semantic Similarity
        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        pred_embeddings = sentence_model.encode(predictions, batch_size=16)
        ref_embeddings = sentence_model.encode(references, batch_size=16)
        similarities = util.cos_sim(pred_embeddings, ref_embeddings)
        avg_similarity = similarities.diagonal().mean().item()
        del sentence_model
        if device.type == "mps":
            torch.mps.empty_cache()

        # ROUGE Score
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge1_scores = []
        rougeL_scores = []
        for pred, ref in zip(predictions, references):
            if pred.strip() and ref.strip():
                score = scorer.score(ref, pred)
                rouge1_scores.append(score['rouge1'].fmeasure)
                rougeL_scores.append(score['rougeL'].fmeasure)
            else:
                rouge1_scores.append(0.0)
                rougeL_scores.append(0.0)
        avg_rouge1 = np.mean(rouge1_scores)
        avg_rougeL = np.mean(rougeL_scores)

        # Question Validity
        valid_questions = sum(1 for pred in predictions if pred.strip().endswith('?'))
        question_validity = valid_questions / len(predictions) if predictions else 0.0

        return {
            "bleu": avg_bleu,
            "semantic_similarity": avg_similarity,
            "rouge1": avg_rouge1,
            "rougeL": avg_rougeL,
            "question_validity": question_validity
        }
    except Exception as e:
        print(f"[QG-Test] Metric computation error: {e}")
        return {
            "bleu": 0.0,
            "semantic_similarity": 0.0,
            "rouge1": 0.0,
            "rougeL": 0.0,
            "question_validity": 0.0
        }

# Evaluate generated questions
metrics = compute_comprehensive_metrics(generated_questions, test_references)

# Print results
print("\n[QG-Test] === GENERATED QUESTIONS ===")
for i, (input_text, gen_q, ref_q) in enumerate(zip(test_inputs, generated_questions, test_references)):
    print(f"\nTest Example {i+1}:")
    print(f"Input: {input_text}")
    print(f"Generated Question: {gen_q}")
    print(f"Reference Question: {ref_q}")

print("\n[QG-Test] === TEST METRICS ===")
for key, value in metrics.items():
    print(f"  {key.upper()}: {value:.4f}")

# Save results
output_dir = "./technical_qg_test_results"
os.makedirs(output_dir, exist_ok=True)
results_summary = {
    "model_path": model_path,
    "test_examples": [
        {
            "input": ex["input"],
            "generated_question": gen_q,
            "reference_question": ex["reference"]
        } for ex, gen_q in zip(test_examples, generated_questions)
    ],
    "metrics": metrics
}
with open(f"{output_dir}/test_summary.json", "w") as f:
    json.dump(results_summary, f, indent=2)
print(f"[QG-Test] Results saved to {output_dir}/test_summary.json")

# Clean up
if device.type == "mps":
    torch.mps.empty_cache()
print("[QG-Test] Testing completed!")

[QG-Test] NLTK resources already downloaded
[QG-Test] Using device: mps
[QG-Test] Model and tokenizer loaded from ./technical_qg_enhanced_5986

[QG-Test] === GENERATED QUESTIONS ===

Test Example 1:
Input: Create a technical interview question about Python programming:
Question:
Generated Question: How would you implement a python script in a production environment?
Reference Question: What are the key differences between Python 2 and Python 3?

Test Example 2:
Input: Generate a project-specific interview question for: Developed a machine learning model for sentiment analysis
Question:
Generated Question: How did you implement sentiment analysis?
Reference Question: Can you explain the architecture of the sentiment analysis model you developed?

Test Example 3:
Input: Create an experience-based question about: Led a team in developing a RESTful API
Question:
Generated Question: What is RESTful API and how does it work?
Reference Question: How did you ensure the scalability and security

In [7]:
import json
import os
import random
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import nltk
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
from tqdm import tqdm

# Set seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

set_seed(42)

# NLTK setup
nltk_data_path = os.path.expanduser("~/nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
try:
    nltk.data.find('tokenizers/punkt_tab')
    nltk.data.find('tokenizers/punkt')
    print("[QG-Test] NLTK resources already downloaded")
except LookupError:
    print("[QG-Test] Downloading NLTK resources...")
    nltk.download('punkt', download_dir=nltk_data_path, quiet=True)
    nltk.download('punkt_tab', download_dir=nltk_data_path, quiet=True)
    print("[QG-Test] NLTK resources downloaded")

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
print(f"[QG-Test] Using device: {device}")

# Load model and tokenizer
model_path = "./technical_qg_enhanced_5986"
try:
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.float32,
        device_map=None
    ).to(device)
    print(f"[QG-Test] Model and tokenizer loaded from {model_path}")
except Exception as e:
    print(f"[QG-Test] Error loading model: {e}")
    raise

# Load the dataset
dataset_file = "extended_technical_dataset.json"
try:
    with open(dataset_file, "r", encoding='utf-8') as f:
        examples = json.load(f)
    print(f"[QG-Test] Loaded {len(examples)} examples from {dataset_file}")
except Exception as e:
    print(f"[QG-Test] Error loading dataset: {e}")
    raise

# Validation function (from training script)
def is_valid_example(example):
    if not isinstance(example, dict) or 'input' not in example or 'output' not in example:
        return False
    input_text = str(example['input']).strip()
    output_text = str(example['output']).strip()
    if len(input_text) < 8 or len(output_text) < 8:
        return False
    if not output_text.endswith('?'):
        return False
    if len(input_text) > 600 or len(output_text) > 250:
        return False
    if input_text.lower() == output_text.lower():
        return False
    return True

# Filter valid examples
valid_examples = [ex for ex in examples if is_valid_example(ex)]
print(f"[QG-Test] Using {len(valid_examples)} valid examples (filtered {len(examples) - len(valid_examples)})")

# Select a test set (e.g., 100 random examples for testing)
test_size = min(100, len(valid_examples))  # Adjustable; use 100 for a good sample
test_examples = random.sample(valid_examples, test_size)
print(f"[QG-Test] Selected {test_size} test examples")

# Create contextual prompts (from training script)
def create_contextual_prompt(entry):
    input_text = entry['input'].strip()
    if "CV skill:" in input_text:
        skill = input_text.replace("CV skill:", "").strip()
        return f"Create a technical interview question about {skill}:\nQuestion:"
    elif "CV project:" in input_text:
        project = input_text.replace("CV project:", "").strip()
        return f"Generate a project-specific interview question for: {project}\nQuestion:"
    elif "CV work:" in input_text:
        work = input_text.replace("CV work:", "").strip()
        return f"Create an experience-based question about: {work}\nQuestion:"
    elif "CV section:" in input_text:
        section = input_text.replace("CV section:", "").strip()
        return f"Generate a comprehensive question about: {section}\nQuestion:"
    elif "CV achievement:" in input_text:
        achievement = input_text.replace("CV achievement:", "").strip()
        return f"Create a question about this achievement: {achievement}\nQuestion:"
    elif "CV experience:" in input_text:
        exp = input_text.replace("CV experience:", "").strip()
        return f"Generate an {exp} level interview question:\nQuestion:"
    elif "CV education:" in input_text:
        edu = input_text.replace("CV education:", "").strip()
        return f"Create an education-based question about: {edu}\nQuestion:"
    elif "CV certification:" in input_text:
        cert = input_text.replace("CV certification:", "").strip()
        return f"Generate a certification question about: {cert}\nQuestion:"
    else:
        return f"Generate a technical interview question for: {input_text}\nQuestion:"

# Preprocess test inputs
test_inputs = [create_contextual_prompt(ex) for ex in test_examples]
test_references = [ex['output'] for ex in test_examples]

# Batch generation (for efficiency)
batch_size = 8  # Adjustable based on device memory
generated_questions = []

model.eval()
with torch.no_grad():
    for i in tqdm(range(0, len(test_inputs), batch_size), desc="[QG-Test] Generating questions"):
        batch_inputs = test_inputs[i:i+batch_size]
        inputs = tokenizer(
            batch_inputs,
            max_length=128,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(device)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=32,
            num_beams=4,
            length_penalty=1.0,
            early_stopping=True
        )
        batch_generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        generated_questions.extend([q.strip() for q in batch_generated])
        if device.type == "mps":
            torch.mps.empty_cache()

# Compute comprehensive metrics (from training script, adapted)
def compute_comprehensive_metrics(predictions, references):
    try:
        # BLEU Score
        bleu_scores = []
        for pred, ref in zip(predictions, references):
            if pred and ref:
                pred_tokens = word_tokenize(pred.lower())
                ref_tokens = word_tokenize(ref.lower())
                bleu = sentence_bleu([ref_tokens], pred_tokens) if pred_tokens and ref_tokens else 0.0
            else:
                bleu = 0.0
            bleu_scores.append(bleu)
        avg_bleu = np.mean(bleu_scores) if bleu_scores else 0.0

        # Semantic Similarity
        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        pred_embeddings = sentence_model.encode(predictions, batch_size=16)
        ref_embeddings = sentence_model.encode(references, batch_size=16)
        similarities = util.cos_sim(pred_embeddings, ref_embeddings)
        avg_similarity = similarities.diagonal().mean().item()
        del sentence_model
        if device.type == "mps":
            torch.mps.empty_cache()

        # ROUGE Score
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge1_scores = []
        rougeL_scores = []
        for pred, ref in zip(predictions, references):
            if pred.strip() and ref.strip():
                score = scorer.score(ref, pred)
                rouge1_scores.append(score['rouge1'].fmeasure)
                rougeL_scores.append(score['rougeL'].fmeasure)
            else:
                rouge1_scores.append(0.0)
                rougeL_scores.append(0.0)
        avg_rouge1 = np.mean(rouge1_scores)
        avg_rougeL = np.mean(rougeL_scores)

        # Question Validity
        valid_questions = sum(1 for pred in predictions if pred.strip().endswith('?'))
        question_validity = valid_questions / len(predictions) if predictions else 0.0

        return {
            "bleu": avg_bleu,
            "semantic_similarity": avg_similarity,
            "rouge1": avg_rouge1,
            "rougeL": avg_rougeL,
            "question_validity": question_validity
        }
    except Exception as e:
        print(f"[QG-Test] Metric computation error: {e}")
        return {
            "bleu": 0.0,
            "semantic_similarity": 0.0,
            "rouge1": 0.0,
            "rougeL": 0.0,
            "question_validity": 0.0
        }

# Evaluate
metrics = compute_comprehensive_metrics(generated_questions, test_references)

# Print sample results (first 10)
print("\n[QG-Test] === SAMPLE GENERATED QUESTIONS (First 10) ===")
for i in range(min(10, len(test_examples))):
    print(f"\nTest Example {i+1}:")
    print(f"Input: {test_inputs[i]}")
    print(f"Generated Question: {generated_questions[i]}")
    print(f"Reference Question: {test_references[i]}")

# Print metrics
print("\n[QG-Test] === AVERAGE TEST METRICS ===")
for key, value in metrics.items():
    print(f"  {key.upper()}: {value:.4f}")

# Save full results
output_dir = "./technical_qg_test_results"
os.makedirs(output_dir, exist_ok=True)
results_summary = {
    "model_path": model_path,
    "dataset_file": dataset_file,
    "test_size": test_size,
    "test_examples": [
        {
            "input": test_inputs[j],
            "generated_question": generated_questions[j],
            "reference_question": test_references[j]
        } for j in range(len(test_examples))
    ],
    "metrics": metrics
}
with open(f"{output_dir}/test_summary.json", "w") as f:
    json.dump(results_summary, f, indent=2)
print(f"[QG-Test] Full results saved to {output_dir}/test_summary.json")

# Clean up
if device.type == "mps":
    torch.mps.empty_cache()
print("[QG-Test] Testing completed!")

[QG-Test] NLTK resources already downloaded
[QG-Test] Using device: mps
[QG-Test] Model and tokenizer loaded from ./technical_qg_enhanced_5986
[QG-Test] Loaded 5986 examples from extended_technical_dataset.json
[QG-Test] Using 5985 valid examples (filtered 1)
[QG-Test] Selected 100 test examples


[QG-Test] Generating questions: 100%|██████████| 13/13 [00:22<00:00,  1.70s/it]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



[QG-Test] === SAMPLE GENERATED QUESTIONS (First 10) ===

Test Example 1:
Input: Create a technical interview question about OCaml:
Question:
Generated Question: What is OCaml and how does it work in practice?
Reference Question: What is OCaml and where does it shine in production systems?

Test Example 2:
Input: Create a technical interview question about Java:
Question:
Generated Question: How would you implement Java in a production environment?
Reference Question: What is the Java Native Interface (JNI)?

Test Example 3:
Input: Create a technical interview question about Flask:
Question:
Generated Question: What is Flask and how does it work in practice?
Reference Question: How do you implement templates?

Test Example 4:
Input: Generate a comprehensive question about: Mobile Development with Swift and Kotlin
Question:
Generated Question: What are Swift and Kotlin?
Reference Question: What are app distribution methods?

Test Example 5:
Input: Create a technical interview question a