In [None]:
# Install Libraries
!pip install ninja libaio triton transformers datasets accelerate peft bitsandbytes trl sentencepiece deepspeed loralib wandb huggingface_hub git-lfs auto-gptq optimum

In [None]:
# Installing flash-attention
# MAX_JOBS=8 is exported only for this single pip invocation (presumably to cap
# the number of parallel compile jobs during the source build — confirm).
# --no-build-isolation makes pip build against the packages already installed
# in this environment instead of a fresh isolated one.
# NOTE(review): the trailing "#2.2 12.1.1" looks like a version note for the
# build environment — confirm what it refers to.
!MAX_JOBS=8 pip install flash-attn --no-build-isolation #2.2 12.1.1

In [None]:
# Make directory required for caching kernels
# `-p` creates missing parent directories and does not fail when the path
# already exists, so this cell is safe to re-run.
# NOTE(review): the path is hard-coded under /root — assumes the kernel runs as
# root; confirm for other environments.
!mkdir -p /root/.triton/autotune

In [None]:
# Import Libraries
import os
import math
import torch
from trl import SFTTrainer
from datasets import load_dataset
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Runtime settings: allow TF32 matmuls on CUDA, turn off tokenizers
# parallelism, and select expandable segments for the CUDA allocator.
torch.backends.cuda.matmul.allow_tf32 = True
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Model Name (Qwen2.5-Coder, 1.5B-parameter base checkpoint)
MODEL_NAME = "Qwen/Qwen2.5-Coder-1.5B"

# Fixed seed so the train/test split is identical after Restart & Run All
SPLIT_SEED = 42

# Load dataset and hold out 10% of the records as the test split
dataset = load_dataset("json", data_files="verilog_autocomplete.jsonl")["train"].train_test_split(
    test_size=0.1,
    seed=SPLIT_SEED,
)

In [None]:
# 4-bit Quantization Configuration: NF4 weights with double quantization,
# bfloat16 used as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load Model with 4-bit Quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
)

# Disable the generation KV cache for training
model.config.use_cache = False
# Qwen2 configs name this flag `use_sliding_window`; assigning to any other
# attribute name is silently accepted by the config object but never read.
model.config.use_sliding_window = False

# Load tokenizer and set padding to left
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "left"  # Required for FlashAttention
tokenizer.pad_token = tokenizer.eos_token  # Ensure a valid pad token

In [None]:
# Tokenize dataset
MAX_SEQ_LENGTH = 2048  # tokens kept per example; tune to the available GPU memory


def tokenize_function(examples):
    """Tokenize a batch of records into fixed-length model inputs.

    Args:
        examples: Batch mapping with a "messages" entry holding one item per
            record. Each item may be a plain string, a list of strings, or a
            list of chat turns (dicts with a "content" key).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the flattened
        texts, padded and truncated to ``MAX_SEQ_LENGTH`` tokens.
    """

    def _to_text(messages):
        # Flatten one record to plain text; tokenizing str(list_of_dicts)
        # would feed the Python repr (braces, quotes, key names) to the model.
        if messages is None:
            return ""
        if isinstance(messages, str):
            return messages
        return "\n".join(
            str(turn.get("content", "")) if isinstance(turn, dict) else str(turn)
            for turn in messages
        )

    texts = [_to_text(msg) for msg in examples["messages"]]
    # An explicit max_length keeps padding="max_length" from padding every
    # example up to the tokenizer's model maximum.
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

# Run the tokenizer batch-wise over every split held in `dataset`.
# NOTE(review): `tokenized_datasets` is not referenced by the later cells shown
# here — the trainer is built from the raw `dataset` splits — confirm whether
# this cell is still needed.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Apply QLoRA
# Adapter hyperparameters: rank 8, alpha 16, dropout 0.1, applied to the
# q_proj and v_proj modules, with bias="none", for a causal-LM task.
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["q_proj", "v_proj"],
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
peft_config = LoraConfig(**lora_settings)

# Prepare the quantized model for k-bit training, then attach the LoRA adapter.
# NOTE(review): prepare_model_for_kbit_training may enable gradient
# checkpointing by default — confirm this matches the training arguments.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:
# Training Arguments
# Each optimizer step consumes 8 sequences x 16 accumulation steps = 128
# sequences per device.
# NOTE(review): eval_steps / logging_steps / warmup_steps are counted in
# optimizer steps — confirm the dataset is large enough for them to trigger.
training_args = TrainingArguments(
    output_dir="qwen_finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    eval_strategy="steps",
    eval_steps=2500,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="logs",
    logging_steps=500,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    bf16=True,
    optim="adamw_bnb_8bit",
    warmup_steps=250,
    lr_scheduler_type="cosine",
    save_safetensors=True,
    gradient_checkpointing=False
)

# Trainer
# `model` is already wrapped with the LoRA adapter (get_peft_model in the QLoRA
# cell), so no `peft_config` is passed here: handing the trainer both a
# PEFT-wrapped model and a config asks it to add an adapter to a model that
# already has one.
# NOTE(review): the tokenizer configured above is not passed to the trainer —
# confirm the trainer's default tokenizer handling is acceptable.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
)

In [None]:
# Run the fine-tuning loop
trainer.train()

# Write the trained model and the tokenizer into the same output directory
save_dir = "qwen_finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
def _flatten_messages(messages):
    """Return one record's `messages` field as a single newline-joined string."""
    if messages is None:
        return ""
    if isinstance(messages, str):
        return messages
    # Accept both chat turns (dicts with a "content" key) and plain strings.
    return "\n".join(
        str(turn.get("content", "")) if isinstance(turn, dict) else str(turn)
        for turn in messages
    )


def calculate_perplexity(model_name, tokenizer_name, dataset_path, max_length=2048):
    """Calculates perplexity for a given model on a dataset.

    Every record is scored on its own and the per-record losses are combined
    weighted by the number of predicted tokens, so the result covers the whole
    file rather than only the first context window of the concatenated text.

    Args:
        model_name: Hub id or local directory passed to
            ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_name: Hub id or local directory passed to
            ``AutoTokenizer.from_pretrained``.
        dataset_path: Path to a JSON/JSONL file whose records carry a
            "messages" field (string, list of strings, or list of chat turns).
        max_length: Maximum number of tokens kept per record.

    Returns:
        The perplexity as a float (exp of the mean per-token loss).

    Raises:
        ValueError: If no record yields at least two tokens, so there is
            nothing to score.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Load dataset and flatten every record to plain text
    dataset = load_dataset("json", data_files=dataset_path)["train"]
    texts = [_flatten_messages(messages) for messages in dataset["messages"]]

    total_nll = 0.0
    total_tokens = 0
    with torch.no_grad():
        for text in texts:
            encodings = tokenizer(
                text, return_tensors="pt", truncation=True, max_length=max_length
            )
            input_ids = encodings.input_ids.to(device)
            # A causal LM predicts every token except the first one.
            predicted = input_ids.size(1) - 1
            if predicted < 1:
                continue
            attention_mask = encodings.attention_mask.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            # outputs.loss is a mean over the predicted tokens; undo the mean
            # so long and short records are weighted by their token counts.
            total_nll += outputs.loss.item() * predicted
            total_tokens += predicted

    if total_tokens == 0:
        raise ValueError(f"No scorable text found in {dataset_path}")

    perplexity = math.exp(total_nll / total_tokens)
    print(f"Model: {model_name} - Perplexity: {perplexity}")
    return perplexity

# Evaluation corpus
# NOTE(review): this file has the same name as the one the training split was
# drawn from, so the score presumably includes training examples — confirm.
dataset_path = "/workspace/verilog_autocomplete.jsonl"

# Base checkpoint: weights and tokenizer come from the same hub repository
original_model = original_tokenizer = "Qwen/Qwen2.5-Coder-1.5B"

# Fine-tuned checkpoint: model and tokenizer were saved to the same local directory
fine_tuned_model = fine_tuned_tokenizer = "qwen_finetuned"

# Score both checkpoints on the same file (base first, then fine-tuned)
original_ppl = calculate_perplexity(original_model, original_tokenizer, dataset_path)
fine_tuned_ppl = calculate_perplexity(fine_tuned_model, fine_tuned_tokenizer, dataset_path)

# Report whether fine-tuning lowered the perplexity
improved = fine_tuned_ppl < original_ppl
print(
    "✅ Fine-tuned model has lower perplexity (better)."
    if improved
    else "⚠️ Fine-tuned model has higher perplexity (may need more tuning)."
)