In [None]:
!pip install -q transformers torch peft bitsandbytes datasets tqdm pandas
import torch
import pandas as pd
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"

print(f"âœ… GPU: {torch.cuda.get_device_name(0)}")
print(f"âœ… VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


âœ… GPU: NVIDIA A100-SXM4-40GB
âœ… VRAM: 42.4 GB


In [None]:
from huggingface_hub import notebook_login

# --- Configuration -----------------------------------------------------------
DATA_CSV = "adobe/output/instructions_optimized.csv"  # instruction/response pairs
OUTPUT_DIR = "lora"                                   # checkpoints and final adapter
REQUIRED_COLUMNS = ("instruction", "response")        # columns read by later cells

os.makedirs(OUTPUT_DIR, exist_ok=True)
df = pd.read_csv(DATA_CSV)

# Fail here, with the file name in the message, rather than with a bare
# KeyError/IndexError further down the notebook.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{DATA_CSV} is missing required column(s): {missing_columns}")
if df.empty:
    raise ValueError(f"{DATA_CSV} contains no rows")

print(f"âœ… Data loaded: {len(df)} samples")
print(f"Columns: {df.columns.tolist()}")
print(f"\nSample:")
# str(...) keeps the preview from crashing if the first row holds a non-string (e.g. NaN).
print(f"Instruction: {str(df['instruction'].iloc[0])[:80]}...")
print(f"Response: {str(df['response'].iloc[0])[:80]}...")

# Interactive Hugging Face login (renders a widget asking for an access token).
notebook_login()

âœ… Data loaded: 13864 samples
Columns: ['instruction', 'response']

Sample:
Instruction: Generate tweet for mcafee (@McAfee) targeting 2 likes:
No media

Tweet:...
Response: To give our worldwide customers better #cybersecurity, we've extended our partne...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
import torch

# Single source of truth for the base checkpoint (used for tokenizer and model).
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding must NOT share an id with EOS. The language-modeling collator used for
# training sets the label to -100 wherever the input id equals the pad id; with
# pad == EOS that also masks the real end-of-sequence token, so the model is
# never trained to stop. Use <unk> for padding, falling back to EOS only if the
# tokenizer defines no unknown token.
tokenizer.pad_token = (
    tokenizer.unk_token if tokenizer.unk_token is not None else tokenizer.eos_token
)

# 4-bit NF4 quantization with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
print("âœ… Both loaded")

# Trade compute for memory during backprop.
model.gradient_checkpointing_enable()
print(f"âœ… Model loaded")
print(f"GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")



config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

âœ… Both loaded
âœ… Model loaded
GPU Memory: 4.45 GB


In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

print("Setting up LoRA...")

# Get the quantized base model ready for adapter training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Adapter hyper-parameters: rank-8 adapters on the four attention projections.
attention_projections = ["q_proj", "v_proj", "k_proj", "o_proj"]
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.2,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": attention_projections,
}
lora_config = LoraConfig(**lora_settings)

# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("âœ… LoRA applied with gradient checkpointing")

Setting up LoRA...
trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940
âœ… LoRA applied with gradient checkpointing


In [None]:
from datasets import Dataset

print("Preparing dataset...")

# Rows with a missing instruction or response would otherwise be stringified to
# the literal text "nan" and end up in the training data, so leave them out.
complete_rows = df[["instruction", "response"]].dropna()
n_skipped = len(df) - len(complete_rows)
if n_skipped:
    print(f"Skipping {n_skipped} rows with a missing instruction/response")

# Wrap each pair as "[INST] <instruction> [/INST] <response></s>".
# Iterating the two columns directly avoids the per-row overhead of iterrows().
texts = [
    f"[INST] {instruction} [/INST] {response}</s>"
    for instruction, response in zip(
        complete_rows["instruction"].astype(str),
        complete_rows["response"].astype(str),
    )
]

raw_dataset = Dataset.from_dict({"text": texts})
print(f"Dataset size: {len(raw_dataset)}")
def tokenize_function(examples):
    """Tokenize a batch of formatted training strings for causal-LM training.

    Args:
        examples: Batch mapping passed by ``Dataset.map(batched=True)``; its
            ``"text"`` entry is a list of strings.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with every
        sequence truncated/padded to 512 tokens, plus a ``labels`` entry that
        is an independent copy of ``input_ids``.
    """
    # Plain Python lists are returned on purpose: the result is stored by
    # Dataset.map, so building torch tensors here would only add a conversion.
    tokenized = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # Copy rather than alias, so labels and inputs can be edited independently.
    tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]

    return tokenized

# Tokenize the whole corpus in batches; the raw "text" column is dropped afterwards
tokenized_dataset = raw_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
    desc="Tokenizing...",
)

# Hold out 10% of the examples for evaluation (fixed seed => repeatable split)
split_dataset = tokenized_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

print(f"âœ… Dataset prepared:")
print(f"   Train: {len(train_dataset)}")
print(f"   Val: {len(eval_dataset)}")

# Peek at the first training example as tensors to confirm the columns and lengths
first_row = train_dataset[0]
sample_example = {name: torch.tensor(values) for name, values in first_row.items()}

print(f"\nSample example keys: {sample_example.keys()}")
print(f"input_ids shape: {sample_example['input_ids'].shape}")
print(f"labels shape: {sample_example['labels'].shape}")
print(f"âœ… Labels present on single example!")


Preparing dataset...
Dataset size: 13864


Tokenizing...:   0%|          | 0/13864 [00:00<?, ? examples/s]

âœ… Dataset prepared:
   Train: 12477
   Val: 1387

Sample example keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape: torch.Size([512])
labels shape: torch.Size([512])
âœ… Labels present on single example!


In [None]:
from transformers import TrainingArguments

print("Setting up training arguments...")

# Batching: 16 sequences per device x 2 accumulation steps per optimizer update.
batching = dict(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
)

# Optimizer and learning-rate schedule.
optimisation = dict(
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    optim="paged_adamw_8bit",
)

# Logging / evaluation / checkpoint cadence; the checkpoint with the best
# eval_loss is reloaded when training ends.
checkpointing = dict(
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Memory / precision settings.
memory = dict(
    fp16=True,
    gradient_checkpointing=True,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    report_to=[],  # no external experiment trackers
    **batching,
    **optimisation,
    **checkpointing,
    **memory,
)

print("âœ… Training args ready")



Setting up training arguments...
âœ… Training args ready


In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

print("Creating trainer...")

# Causal-LM collation (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# The Trainer builds its own data loaders from the datasets and the batch sizes
# in `training_args`, so no separate DataLoader is created here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

print("âœ… Trainer created successfully")
print("\nVerifying batch format...")
batch = data_collator([train_dataset[i] for i in range(2)])
# Print only the key names; printing the keys view itself dumps every tensor.
print(f"Batch keys: {list(batch.keys())}")
print(f"input_ids shape: {batch['input_ids'].shape}")
print(f"labels shape: {batch['labels'].shape if 'labels' in batch else 'NOT PRESENT'}")

# Actually inspect the collated batch instead of reporting success unconditionally.
problems = []
if 'labels' not in batch:
    problems.append("collated batch has no 'labels' entry")
else:
    real_token = batch['attention_mask'].bool()   # positions that are not padding
    supervised = batch['labels'] != -100          # positions that contribute to the loss
    real_eos = real_token & (batch['input_ids'] == tokenizer.eos_token_id)
    if not (real_token & supervised).any():
        problems.append("no real token is supervised (every label is -100)")
    if (real_eos & ~supervised).any():
        problems.append(
            "end-of-sequence labels are masked (-100), so the model is not trained "
            "to stop; check that tokenizer.pad_token differs from tokenizer.eos_token"
        )

if problems:
    for problem in problems:
        print(f"WARNING: {problem}")
else:
    print(f"âœ… Batch format correct!")



Creating trainer...
âœ… Trainer created successfully

Verifying batch format...
Batch keys: KeysView({'input_ids': tensor([[    2,     2,     2,  ...,  2403, 28767,     2],
        [    2,     2,     2,  ...,  2403, 28767,     2]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  2403, 28767,  -100],
        [ -100,  -100,  -100,  ...,  2403, 28767,  -100]])})
input_ids shape: torch.Size([2, 512])
labels shape: torch.Size([2, 512])
âœ… Batch format correct!


In [None]:
import time

banner = "=" * 70

print(banner)
print("ðŸš€ STARTING TRAINING")
print(banner)

# Wall-clock the full fine-tuning run; the duration is reported in hours.
start_time = time.time()
trainer.train()
elapsed = (time.time() - start_time) / 3600

print("\n" + banner)
print("âœ… TRAINING COMPLETE")
print(banner)
print(f"Training time: {elapsed:.2f} hours")


ðŸš€ STARTING TRAINING


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
200,1.4368,1.439679
400,1.393,1.401593
600,1.322,1.386197



âœ… TRAINING COMPLETE
Training time: 1.37 hours


In [None]:
import os

print("Saving model...")

# Model and tokenizer are written side by side into <OUTPUT_DIR>/final_model.
final_model_dir = os.path.join(OUTPUT_DIR, "final_model")
os.makedirs(final_model_dir, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)

print(f"âœ… Model saved to: {final_model_dir}")

# List what was written, with sizes in megabytes.
for file_name in os.listdir(final_model_dir):
    size_mb = os.path.getsize(os.path.join(final_model_dir, file_name)) / 1e6
    print(f"   {file_name} ({size_mb:.1f} MB)")


Saving model...
âœ… Model saved to: lora/final_model
   README.md (0.0 MB)
   chat_template.jinja (0.0 MB)
   tokenizer_config.json (0.0 MB)
   tokenizer.json (3.5 MB)
   special_tokens_map.json (0.0 MB)
   adapter_config.json (0.0 MB)
   adapter_model.safetensors (27.3 MB)


In [None]:
from peft import AutoPeftModelForCausalLM

# Directory written by the save step.
model_path = os.path.join(OUTPUT_DIR, "final_model")

# Reload the tokenizer and the fine-tuned model from disk; from here on
# `model` / `tokenizer` refer to the reloaded objects.
tokenizer = AutoTokenizer.from_pretrained(model_path)

load_options = {
    "torch_dtype": torch.float16,  # half-precision weights
    "device_map": "auto",
}
model = AutoPeftModelForCausalLM.from_pretrained(model_path, **load_options)

print("âœ… Fine-tuned model loaded")


`torch_dtype` is deprecated! Use `dtype` instead!


Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

âœ… Fine-tuned model loaded


In [None]:
from tqdm import tqdm
from transformers import pipeline
from transformers import logging as hf_logging

hf_logging.set_verbosity_error()

print("Creating generation pipeline...")

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=400,
    num_beams=1,
    do_sample=True,   # temperature only takes effect when sampling is enabled
    temperature=0.7,
)

INSTRUCTION_END = "[/INST]"  # marker that closes the instruction in the training format
n_generate = min(20, len(eval_dataset))
print(f"Generating predictions for {n_generate} of {len(eval_dataset)} samples...")

generated_text = []
n_failed = 0

for i in tqdm(range(n_generate)):
    sample = eval_dataset[i]

    # Rebuild the prompt from the decoded example rather than from the first
    # tokens of the padded sequence: the tokenized examples are padded to a
    # fixed length, so a fixed-size slice can consist of padding only.
    full_text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
    instruction_part, marker, _ = full_text.partition(INSTRUCTION_END)
    prompt = (instruction_part + marker).strip()

    try:
        if not prompt:
            raise ValueError("decoded prompt is empty")
        # return_full_text=False returns only the continuation, so the prompt
        # does not need to be stripped off the result by string splitting.
        result = pipe(prompt, num_return_sequences=1, return_full_text=False)
        generated_text.append(result[0]['generated_text'].strip())
    except Exception as e:
        # Best effort: keep row alignment with an empty prediction, but report
        # the failure instead of hiding it.
        n_failed += 1
        tqdm.write(f"WARNING: generation failed for sample {i}: {type(e).__name__}: {e}")
        generated_text.append("")

print(f"âœ… Generated {len(generated_text)} predictions")
if n_failed:
    print(f"WARNING: {n_failed} of {n_generate} generations failed and were left empty")


Creating generation pipeline...
Generating predictions for 1387 samples...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20/20 [06:15<00:00, 18.78s/it]

âœ… Generated 20 predictions





In [None]:
INSTRUCTION_END = "[/INST]"  # marker that closes the instruction in the training format

instructions = []
references = []

# One row per prediction, so the three columns always line up.
for i in range(len(generated_text)):
    sample = eval_dataset[i]
    # input_ids hold the instruction AND the reference response (labels are a
    # copy of them), so decode once and split at the marker. Decoding both
    # columns in full would give two identical columns.
    full_text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
    instruction_part, marker, response_part = full_text.partition(INSTRUCTION_END)
    instructions.append((instruction_part + marker).strip())
    # NOTE: `reference` now holds the response only; anything downstream that
    # reads this column scores the response text without its instruction.
    references.append(response_part.strip())

results_df = pd.DataFrame({
    'instruction': instructions,
    'reference': references,
    'generated': generated_text,
})

print(f"Results shape: {results_df.shape}")
results_df.head()

Results shape: (20, 3)


Unnamed: 0,instruction,reference,generated
0,[INST] Generate tweet for emerson (@AnneTEmers...,[INST] Generate tweet for emerson (@AnneTEmers...,
1,[INST] Generate tweet for independent (@Indepe...,[INST] Generate tweet for independent (@Indepe...,
2,[INST] Generate tweet for aaa (@AAASouthPenn) ...,[INST] Generate tweet for aaa (@AAASouthPenn) ...,
3,[INST] Generate tweet for williams (@Rtreatwil...,[INST] Generate tweet for williams (@Rtreatwil...,
4,[INST] Generate tweet for hp (@HP) targeting 4...,[INST] Generate tweet for hp (@HP) targeting 4...,


In [None]:
import math

# Score each reference text with the model: perplexity = exp(mean token loss).
model.eval()
perplexities = []

with torch.no_grad():
    for reference in tqdm(results_df['reference']):
        encoded = tokenizer(reference, return_tensors="pt").to(model.device)
        # Passing the input ids as labels makes the model return the LM loss.
        nll = model(**encoded, labels=encoded['input_ids']).loss
        perplexities.append(math.exp(nll.item()))

banner = "=" * 70
print("\n" + banner)
print("ðŸ“ˆ PERPLEXITY")
print(banner)
print(f"Mean Perplexity: {sum(perplexities)/len(perplexities):.4f}")

# Keep the per-example scores next to the texts they belong to.
results_df['perplexity'] = perplexities


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20/20 [00:00<00:00, 20.02it/s]


ðŸ“ˆ PERPLEXITY
Mean Perplexity: 3.6349



