In [1]:
# --- 1. Install necessary libraries ---
!pip install -q datasets transformers accelerate peft bitsandbytes torch
# Make sure to upgrade if you had issues with older versions
!pip install --upgrade transformers peft accelerate datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments # Import Trainer and TrainingArguments

In [4]:
# --- 4. Define paths and constants ---
# IMPORTANT: This path must match exactly where your checkpoints are saved in Google Drive.
# Ensure 'flan-t5-lora-checkpoints' was shared with this new account.
BASE_CHECKPOINT_DIR = "/content/drive/MyDrive/flan-t5-lora-checkpoints"
RESUME_CHECKPOINT_PATH = os.path.join(BASE_CHECKPOINT_DIR, "checkpoint-1000")

In [5]:
# Define the base model name (must be the same as the one you originally fine-tuned)
MODEL_NAME = "google/flan-t5-base"

# Special tokens added during the initial training
SPECIAL_TOKENS = ["<summarize>", "<mcq>"]

# Max lengths used during initial tokenization
MAX_INPUT_LENGTH = 512
MAX_OUTPUT_LENGTH = 256

In [6]:
# --- 5. Load Dataset (necessary to build the Trainer's internal structure) ---
# IMPORTANT: Ensure your 'flan_combined_30k.jsonl' file is accessible from this account too.
# If it's in the same Drive, it should be fine.
JSONL_FILE_PATH = '/content/drive/My Drive/combined_mcq_summarization_finetune.jsonl'

try:
    df = pd.read_json(JSONL_FILE_PATH, lines=True)
    dataset = Dataset.from_pandas(df)
    print("Dataset loaded and converted to Hugging Face Dataset.")
except FileNotFoundError:
    print(f"Error: Dataset file not found at {JSONL_FILE_PATH}. Please check the path and sharing settings.")
    exit() # Exit if dataset can't be loaded, as training depends on it
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    exit()

Dataset loaded and converted to Hugging Face Dataset.


In [7]:
# --- 6. Initialize Tokenizer and Model (Crucial for LoRA and resuming) ---
print("\n--- Initializing Tokenizer and Model ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Re-add special tokens and resize embeddings (this must match original training)
tokenizer.add_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(len(tokenizer))
print(f"Tokenizer vocab size after adding tokens: {len(tokenizer)}")

# Re-apply LoRA configuration (this must match original training)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


--- Initializing Tokenizer and Model ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizer vocab size after adding tokens: 32102
trainable params: 884,736 || all params: 248,422,656 || trainable%: 0.3561


In [12]:
# --- 7. Define Tokenization Function (must be identical to original) ---
def tokenize(example):
    input_text = example["input_text"].strip()
    output_text = example["target_text"].strip()

    if input_text.lower().startswith("summarize"):
        prefix = "<summarize> "
    elif input_text.lower().startswith("generate mcqs"):
        prefix = "<mcq> "
    else:
        prefix = ""

    model_inputs = tokenizer(
        prefix + input_text,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        output_text,
        max_length=MAX_OUTPUT_LENGTH,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenizer to the dataset
print("\n--- Tokenizing Dataset ---")
# Ensure columns to remove match your original dataset to avoid issues
columns_to_remove = [col for col in dataset.column_names if col not in ['input_ids', 'attention_mask', 'labels']]
tokenized_dataset = dataset.map(tokenize, remove_columns=columns_to_remove)
print(f"Tokenized dataset size: {len(tokenized_dataset)}")
print(f"Tokenized dataset columns: {tokenized_dataset.column_names}")


--- Tokenizing Dataset ---


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Tokenized dataset size: 20000
Tokenized dataset columns: ['input_ids', 'attention_mask', 'labels']


In [13]:
# --- 8. Initialize Data Collator ---
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/flan-t5-lora-checkpoints",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5, # Keep the lower learning rate for now, it's safer
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=20,
    logging_strategy="steps", # Explicitly set logging strategy
    fp16=False,
    do_train=True,
    report_to="tensorboard" # <--- CHANGE THIS
)

In [15]:
# --- 10. Initialize the Trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
# --- 11. Resume Training ---
# Crucial step: Pass the resume_from_checkpoint argument
print(f"\n--- Attempting to resume training from: {RESUME_CHECKPOINT_PATH} ---")

if os.path.exists(RESUME_CHECKPOINT_PATH):
    trainer.train(resume_from_checkpoint=RESUME_CHECKPOINT_PATH)
    print("\nTraining resumed successfully!")
else:
    print(f"\nError: Checkpoint path '{RESUME_CHECKPOINT_PATH}' not found. "
          "Please ensure the folder is correctly shared and accessible, "
          "and the path is accurate. Starting training from scratch if no checkpoint found.")
    # If the checkpoint path is not found, it will start training from epoch 0.
    # You might want to remove the `resume_from_checkpoint` argument
    # or handle this case specifically if you *only* want to resume.
    trainer.train()

print("\n--- Training session ended ---")


--- Attempting to resume training from: /content/drive/MyDrive/flan-t5-lora-checkpoints/checkpoint-1000 ---


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1020,1.7695
1040,1.6936
1060,1.7161
1080,1.7235
1100,1.7141
1120,1.6816
1140,1.749
1160,1.7137
1180,1.7172
1200,1.7148





Training resumed successfully!

--- Training session ended ---


In [17]:
# Assuming you want to save the final model to a specific directory within your output_dir
final_model_save_path = "/content/drive/MyDrive/flan-t5-lora-checkpoints/last_final_model"

print(f"Saving final model to: {final_model_save_path}")
# Save the PEFT adapters
trainer.model.save_pretrained(final_model_save_path)
# Save the tokenizer
tokenizer.save_pretrained(final_model_save_path)

print("Final model and tokenizer saved successfully.")

Saving final model to: /content/drive/MyDrive/flan-t5-lora-checkpoints/last_final_model




Final model and tokenizer saved successfully.


In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel, PeftConfig # Make sure PeftConfig is imported

# --- Configuration (kept for context, assuming this part is correct) ---
model_name = "google/flan-t5-base"
fine_tuned_model_path = "/content/drive/My Drive/flan-t5-lora-checkpoints/last_final_model" # Double-check this path if it's 'last_final_model'

# --- 1. Load the Tokenizer (with special tokens) ---
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
special_tokens = ["<summarize>", "<mcq>"]
tokenizer.add_tokens(special_tokens, special_tokens=True) # Ensure they are re-added

# --- 2. Load the Original FLAN-T5 Base Model ---
print("--- Loading Original FLAN-T5 Base Model ---")
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
original_model.resize_token_embeddings(len(tokenizer))
original_model.eval()
original_model.to("cuda")
print("Original model loaded.")

# --- 3. Load the Fine-Tuned FLAN-T5 Model with LoRA ---
print(f"--- Loading Fine-Tuned LoRA Model from {fine_tuned_model_path} ---")
base_model_for_lora = AutoModelForSeq2SeqLM.from_pretrained(model_name)
base_model_for_lora.resize_token_embeddings(len(tokenizer)) # CRUCIAL FIX FROM LAST TIME
lora_model = PeftModel.from_pretrained(base_model_for_lora, fine_tuned_model_path)
lora_model.eval()
lora_model.to("cuda")
print("Fine-tuned LoRA model loaded.")


# --- 4. Define a Generation Function (FIXED HERE) ---
def generate_output(model, tokenizer, input_text, max_length=256, num_beams=4):
    """Generates text from the model given an input."""
    input_ids = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).input_ids.to(model.device)

    # Generate output - Changed to keyword argument
    output_ids = model.generate(
        input_ids=input_ids, # <--- THIS IS THE FIX
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# --- 5. Prepare Test Cases (same as before) ---
test_cases = [
    # --- Summarization Test Cases ---
    {
        "task": "summarize",
        "input_prefix": "<summarize> ",
        "text": "The discovery of penicillin by Alexander Fleming in 1928 marked a revolutionary turning point in medicine. Observing a mold contaminate a petri dish of Staphylococcus bacteria, Fleming noticed that the mold inhibited the bacteria's growth. This accidental discovery led to the development of antibiotics, which have since saved millions of lives by effectively treating bacterial infections that were once fatal. Though it took years of further research by scientists like Howard Florey and Ernst Chain to purify and mass-produce penicillin, its impact on public health remains unparalleled."
    },
    {
        "task": "summarize",
        "input_prefix": "<summarize> ",
        "text": "Climate change refers to long-term shifts in temperatures and weather patterns. These shifts may be natural, but since the 1800s, human activities have been the main driver of climate change, primarily due to the burning of fossil fuels (like coal, oil, and gas), which produces heat-trapping gases. The consequences of climate change include intense droughts, water scarcity, severe fires, rising sea levels, flooding, melting polar ice, catastrophic storms, and declining biodiversity. Addressing climate change requires global cooperation and a transition to sustainable energy sources."
    },
    {
        "task": "summarize",
        "input_prefix": "<summarize> ",
        "text": "Artificial neural networks (ANNs) are a subset of machine learning, inspired by the structure and function of the human brain. They consist of interconnected nodes (neurons) organized in layers: an input layer, one or more hidden layers, and an output layer. Each connection has a weight, which is adjusted during training to learn patterns from data. ANNs are widely used for tasks such as image recognition, natural language processing, and predictive analytics, forming the backbone of deep learning algorithms."
    },
    {
        "task": "summarize",
        "input_prefix": "<summarize> ",
        "text": "The Renaissance, a fervent period of European cultural, artistic, political, and scientific 'rebirth' after the Middle Ages, spanned roughly from the 14th to the 17th century. Originating in Florence, Italy, it was characterized by a renewed interest in classical Greek and Roman philosophy, art, and literature. This era produced masterpieces by artists like Leonardo da Vinci, Michelangelo, and Raphael, and saw significant advancements in science, leading to the Scientific Revolution. The invention of the printing press by Gutenberg also played a crucial role in disseminating knowledge and ideas."
    },

    # --- MCQ Generation Test Cases ---
    {
        "task": "mcq",
        "input_prefix": "<mcq> ",
        "text": "The Solar System consists of the Sun and everything bound to it by gravity, including the eight planets, dwarf planets, dozens of moons, and millions of asteroids, comets, and meteoroids. The inner planets—Mercury, Venus, Earth, and Mars—are terrestrial planets, meaning they are rocky. The outer planets—Jupiter, Saturn, Uranus, and Neptune—are gas giants, composed primarily of hydrogen and helium. Beyond Neptune is the Kuiper Belt, a region of icy bodies, and further out lies the theoretical Oort Cloud."
    },
    {
        "task": "mcq",
        "input_prefix": "<mcq> ",
        "text": "Water, with its chemical formula H2O, is a transparent, tasteless, odorless, and nearly colorless chemical substance that is the main constituent of Earth's hydrosphere and the fluids of all known living organisms. It is vital for all known forms of life, even though it provides no calories or organic nutrients. Water exists in various forms on Earth: in liquid state as rain, rivers, oceans; in solid state as ice; and in gaseous state as water vapor. Its unique properties, such as high heat capacity and solvent ability, are crucial for life."
    },
    {
        "task": "mcq",
        "input_prefix": "<mcq> ",
        "text": "The Internet is a global system of interconnected computer networks that uses the Internet protocol suite (TCP/IP) to communicate between networks and devices. It is a network of networks that consists of private, public, academic, business, and government networks of local to global scope, linked by a broad array of electronic, wireless, and optical networking technologies. The Internet carries a vast range of information resources and services, such as the inter-linked hypertext documents and applications of the World Wide Web (WWW), electronic mail, telephony, and file sharing."
    },
    {
        "task": "mcq",
        "input_prefix": "<mcq> ",
        "text": "The human eye is a complex optical instrument which collects light from the surrounding environment, focuses it, converts it into electrical impulses, and transmits these impulses to the brain for visual perception. Key parts include the cornea (outer transparent layer), iris (colored part that controls pupil size), pupil (opening that lets light in), lens (focuses light), and retina (light-sensitive tissue containing photoreceptor cells). The optic nerve then transmits signals from the retina to the brain."
    },
    {
        "task": "mcq",
        "input_prefix": "<mcq> ",
        "text": "Gravity is a fundamental force of nature that causes objects with mass or energy to be attracted to each other. It is the weakest of the four fundamental forces, but it is the most dominant force in the Universe on macroscopic scales, controlling the orbits of planets, the formation of stars and galaxies, and the large-scale structure of the cosmos. Isaac Newton's law of universal gravitation describes it as an attractive force between two objects that is proportional to their masses and inversely proportional to the square of the distance between their centers."
    }
]

# (The rest of your Python code for loading models and the generate_output function remains the same)

# Run the comparison script with this updated test_cases list.
# (The rest of your Python code for loading models and the generate_output function remains the same)

# --- 6. Compare Outputs (same as before) ---
print("\n--- Comparing Model Performance ---")

for i, case in enumerate(test_cases):
    print(f"\n===== Test Case {i+1}: {case['task'].upper()} =====")
    full_input = case['input_prefix'] + case['text']
    print(f"\nInput:\n{full_input}")

    # Generate with Original Model
    print("\n--- Original Model Output ---")
    original_output = generate_output(original_model, tokenizer, full_input)
    print(original_output)

    # Generate with Fine-Tuned LoRA Model
    print("\n--- Fine-Tuned LoRA Model Output ---")
    lora_output = generate_output(lora_model, tokenizer, full_input)
    print(lora_output)
    print("-" * 50) # Separator for clarity

--- Loading Original FLAN-T5 Base Model ---
Original model loaded.
--- Loading Fine-Tuned LoRA Model from /content/drive/My Drive/flan-t5-lora-checkpoints/last_final_model ---
Fine-tuned LoRA model loaded.

--- Comparing Model Performance ---

===== Test Case 1: SUMMARIZE =====

Input:
<summarize> The discovery of penicillin by Alexander Fleming in 1928 marked a revolutionary turning point in medicine. Observing a mold contaminate a petri dish of Staphylococcus bacteria, Fleming noticed that the mold inhibited the bacteria's growth. This accidental discovery led to the development of antibiotics, which have since saved millions of lives by effectively treating bacterial infections that were once fatal. Though it took years of further research by scientists like Howard Florey and Ernst Chain to purify and mass-produce penicillin, its impact on public health remains unparalleled.

--- Original Model Output ---
penicillin

--- Fine-Tuned LoRA Model Output ---
Observing a mold contaminate 