In [41]:
#
!pip install transformers datasets peft accelerate bitsandbytes trl



In [42]:
!pip install evaluate



In [43]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the dataset from Hugging Face
dataset = load_dataset("tatsu-lab/alpaca")
'''
data = {
    "question": [
        "What is artificial intelligence?",
        "Who wrote the book '1984'?",
        "What is the capital of France?"
    ],
    "answer": [
        "Artificial intelligence is the simulation of human intelligence in machines.",
        "George Orwell wrote the book '1984'.",
        "The capital of France is Paris."
    ]
}
'''
print(dataset)
print(dataset["train"][0])  # Show first example
'''
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 10570
    })
})

'''



DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


"\nDatasetDict({\n    train: Dataset({\n        features: ['instruction', 'input', 'output', 'text'],\n        num_rows: 87599\n    })\n    validation: Dataset({\n        features: ['instruction', 'input', 'output', 'text'],\n        num_rows: 10570\n    })\n})\n\n"

In [44]:
# Cell 1
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq

MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
model_name = MODEL_NAME  # For later use (PDF report)


In [45]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})

In [46]:
# Cell 2
from datasets import load_dataset

# Load Alpaca dataset
dataset = load_dataset("yahma/alpaca-cleaned")

# Create train/test split if not already present
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.3)


In [47]:
dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 36232
    })
    test: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 15528
    })
})

In [48]:
# Cell 3
prefix = "Please answer this question: "

def preprocess_function(examples):
    """Add prefix, tokenize input and output."""
    if "input" in examples:
        inputs = [
            prefix + instr + (" " + inp if inp else "")
            for instr, inp in zip(examples["instruction"], examples["input"])
        ]
    else:
        inputs = [prefix + instr for instr in examples["instruction"]]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    labels = tokenizer(
        text_target=examples["output"],
        max_length=512,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [49]:
print(dataset.column_names)


{'train': ['output', 'input', 'instruction'], 'test': ['output', 'input', 'instruction']}


In [50]:
# Cell 4
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/36232 [00:00<?, ? examples/s]

Map:   0%|          | 0/15528 [00:00<?, ? examples/s]

In [51]:
# Cell 5
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


In [52]:
!pip install reportlab



In [53]:
# Cell 6
!pip install -q transformers datasets peft accelerate bitsandbytes trl reportlab

import os
import torch
from transformers import Trainer, TrainingArguments, set_seed
from datetime import datetime
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

set_seed(42)

if torch.cuda.is_available():
    model = model.to("cuda")
    torch.backends.cuda.matmul.allow_tf32 = True

output_dir = "./flan_t5_lora_alpaca"
batch_size = 4
num_epochs = 3

args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=num_epochs,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    bf16=torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(2000)),
    eval_dataset=tokenized_dataset["test"].select(range(min(200, len(tokenized_dataset["test"])))),
    data_collator=data_collator,
    tokenizer=tokenizer
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# cell 7
# Cell 7
train_result = trainer.train()
trainer.save_model(output_dir)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,2.6935
100,2.5601
150,2.5463
200,2.538


In [None]:
# Cell 8
def generate(model, tokenizer, instruction, input_text=None, max_new_tokens=128):
    if input_text and len(input_text.strip()) > 0:
        prompt = f"{prefix}{instruction}\n\nInput: {input_text}"
    else:
        prompt = f"{prefix}{instruction}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_instructions = [
    ("Give three tips for staying healthy.", None),
    ("Explain overfitting in simple terms.", None),
    ("Translate to French:", "How are you today?"),
    ("Summarize:", "Large language models are trained on vast amounts of data to predict the next token."),
    ("Write a Python function to compute Fibonacci numbers.", None),
]

model.eval()
samples = []
for instr, inp in test_instructions:
    out = generate(model, tokenizer, instr, inp)
    samples.append((instr, inp, out))
    print("Instruction:", instr)
    if inp:
        print("Input:", inp)
    print("Model output:", out)
    print("=" * 80)


In [None]:
# Cell 9
pdf_path = os.path.join(output_dir, "report.pdf")

def create_pdf(
    pdf_path,
    model_name,
    dataset_name,
    lora_cfg: LoraConfig,
    epochs,
    train_samples,
    eval_samples,
    sample_generations
):
    c = canvas.Canvas(pdf_path, pagesize=A4)
    width, height = A4
    margin = 40
    y = height - margin

    def write_line(text, size=10, lead=14):
        nonlocal y
        if y < margin + 50:
            c.showPage()
            y = height - margin
        c.setFont("Helvetica", size)
        c.drawString(margin, y, text)
        y -= lead

    write_line("Instruction Tuning Report", size=16, lead=20)
    write_line(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    write_line("")
    write_line("=== Model & Dataset ===", size=12, lead=16)
    write_line(f"Base model: {model_name}")
    write_line(f"Dataset: {dataset_name}")
    write_line("")
    write_line("=== LoRA Config ===", size=12, lead=16)
    write_line(f"r={lora_cfg.r}, alpha={lora_cfg.lora_alpha}, dropout={lora_cfg.lora_dropout}, bias={lora_cfg.bias}")
    write_line(f"task_type={lora_cfg.task_type}")
    write_line("")
    write_line("=== Training ===", size=12, lead=16)
    write_line(f"Epochs: {epochs}")
    write_line(f"Train samples (used): {train_samples}")
    write_line(f"Eval samples (used): {eval_samples}")
    write_line("")
    write_line("=== Sample Generations ===", size=12, lead=16)

    for i, (instr, inp, out) in enumerate(sample_generations, 1):
        write_line(f"{i}. Instruction: {instr}")
        if inp:
            write_line(f"   Input: {inp}")
        write_line(f"   Output: {out[:500]}")
        write_line("")

    c.save()

create_pdf(
    pdf_path=pdf_path,
    model_name=model_name,
    dataset_name="yahma/alpaca-cleaned",
    lora_cfg=lora_config,
    epochs=num_epochs,
    train_samples=2000,
    eval_samples=min(200, len(tokenized_dataset["test"])),
    sample_generations=samples
)

print(f"\nPDF report saved to: {pdf_path}")
