In [None]:
# Attach Google Drive to the Colab runtime. The dataset path and every output
# directory used by the cells below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


## Tokenization

In [None]:
import pandas as pd
import re
import html
import warnings
import os
import torch

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

# -------------------------------
# Suppress warnings
# -------------------------------
# clean_text() feeds free text to BeautifulSoup, which can trigger
# MarkupResemblesLocatorWarning; silence only that category.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# -------------------------------
# Paths
# -------------------------------
DATA_PATH = "/content/drive/MyDrive/NewsSumm Dataset.xlsx"
SAVE_DIR  = "/content/drive/MyDrive/tokenized_mistral_batches1"

TRAIN_DIR = os.path.join(SAVE_DIR, "train")
TEST_DIR  = os.path.join(SAVE_DIR, "test")

os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)

# -------------------------------
# Load dataset
# -------------------------------
df = pd.read_excel(DATA_PATH)

# Both columns are required: article_text is the model input and
# human_summary is the target, so rows missing either one are unusable.
df = df.dropna(subset=["human_summary", "article_text"])
print("Rows after cleaning:", len(df))

# -------------------------------
# Resample
# -------------------------------
SAMPLE_SIZE = 15000

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at what is available.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
print("Rows after resampling:", len(df))

# -------------------------------
# Clean text
# -------------------------------
def clean_text(text):
    """Return ``text`` as plain prose with markup and contact details removed.

    Non-string values (e.g. NaN cells) yield an empty string. For strings the
    steps are, in order: strip HTML tags, decode HTML entities, then delete
    URLs, e-mail addresses and phone-like digit runs, and finally collapse
    all whitespace runs to single spaces and trim the ends.
    """
    if not isinstance(text, str):
        return ""

    # Tags are replaced by a space so adjacent words do not get glued together.
    plain = html.unescape(BeautifulSoup(text, "html.parser").get_text(" "))

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"https?://\S+|www\.\S+", ""),          # web links
        (r"\b[\w\.-]+@[\w\.-]+\.\w+\b", ""),     # e-mail addresses
        (r"\+?\d[\d\s\-]{8,}\d", ""),            # digit runs of 10+ chars (phone-like)
        (r"\s+", " "),                           # whitespace normalisation
    )
    for pattern, replacement in substitutions:
        plain = re.sub(pattern, replacement, plain)

    return plain.strip()

df["human_summary"] = df["human_summary"].apply(clean_text)
df["article_text"]  = df["article_text"].apply(clean_text)

# clean_text() returns "" for non-string cells and for text that consisted
# only of markup/links. Such rows would be tokenized into pure padding and
# would later act as empty ROUGE references, so drop them before splitting.
has_text = (df["human_summary"] != "") & (df["article_text"] != "")
df = df[has_text]
print("Rows after dropping empty text:", len(df))

# -------------------------------
# Train / Test split
# -------------------------------
# Fixed random_state keeps the 80/20 partition reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Model input is the article; the target (label) is the human-written summary.
train_texts = train_df["article_text"].astype(str).tolist()
test_texts  = test_df["article_text"].astype(str).tolist()

train_labels = train_df["human_summary"].astype(str).tolist()
test_labels  = test_df["human_summary"].astype(str).tolist()

print("Train rows:", len(train_texts))
print("Test rows:", len(test_texts))

# -------------------------------
# Load Mistral tokenizer
# -------------------------------
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# trust_remote_code is intentionally not passed: it allows code shipped in the
# model repository to run locally, and it is not needed here -- the training
# and evaluation cells load this same tokenizer without it.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True
)

# Padding to a fixed length requires a pad token; fall back to EOS when the
# tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.padding_side = "right"

# -------------------------------
# Tokenization settings (SAFE)
# -------------------------------
BATCH_SIZE = 4    # examples stored per saved .pt file
MAX_LEN = 128     # articles and summaries are each truncated/padded to this many tokens

# -------------------------------
# Batch tokenization function
# -------------------------------
def tokenize_and_save(inputs, labels, out_dir, split_name):
    """Tokenize article/summary pairs and save them to ``out_dir`` batch by batch.

    The pairs are processed ``BATCH_SIZE`` at a time. Articles and summaries
    are tokenized separately, each truncated/padded to ``MAX_LEN`` tokens, and
    every batch is written to ``out_dir/batch_<n>.pt`` as a dict with keys
    ``input_ids``, ``attention_mask`` (both from the article) and ``labels``
    (summary token ids).

    Padding positions in ``labels`` are stored as -100 so that a loss computed
    on them ignores the padding. The evaluation cell maps -100 back to the pad
    token before decoding, so the stored references decode unchanged.

    NOTE(review): ``labels`` is tokenized independently of ``input_ids``;
    position i of ``labels`` is not the next token of ``input_ids``. Consumers
    must pair or concatenate the two rather than treat them as position-aligned
    causal-LM targets.

    Args:
        inputs: list of article strings.
        labels: list of summary strings, same length and order as ``inputs``.
        out_dir: existing directory that receives the ``.pt`` files.
        split_name: label used only in progress messages.

    Raises:
        ValueError: if ``inputs`` and ``labels`` differ in length.
    """
    if len(inputs) != len(labels):
        raise ValueError(
            f"{split_name}: got {len(inputs)} inputs but {len(labels)} labels"
        )

    # Label value skipped by the cross-entropy loss used for language modeling.
    ignore_index = -100

    print(f"\nTokenizing {split_name} set...")

    num_batches = 0

    for batch_id, start in enumerate(range(0, len(inputs), BATCH_SIZE)):

        batch_inputs = inputs[start:start + BATCH_SIZE]
        batch_labels = labels[start:start + BATCH_SIZE]

        enc_inputs = tokenizer(
            batch_inputs,
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        enc_labels = tokenizer(
            batch_labels,
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        # Mask by attention mask rather than by token id: the pad token may be
        # the EOS token, and a genuine EOS must not be hidden from the loss.
        label_ids = enc_labels["input_ids"].clone()
        label_ids[enc_labels["attention_mask"] == 0] = ignore_index

        batch_data = {
            "input_ids": enc_inputs["input_ids"],
            "attention_mask": enc_inputs["attention_mask"],
            "labels": label_ids
        }

        save_path = os.path.join(out_dir, f"batch_{batch_id}.pt")
        torch.save(batch_data, save_path)

        if batch_id % 50 == 0:
            print(f"{split_name}: saved batch {batch_id}")

        num_batches = batch_id + 1

    print(f"{split_name} tokenization done. Total batches: {num_batches}")

# -------------------------------
# Run tokenization
# -------------------------------
# Both splits go through the same routine; each one writes to its own folder.
split_jobs = (
    (train_texts, train_labels, TRAIN_DIR, "TRAIN"),
    (test_texts, test_labels, TEST_DIR, "TEST"),
)
for job in split_jobs:
    tokenize_and_save(*job)

print("\nMistral-7B article→summary batch-wise tokenization completed successfully")


In [None]:
!pip install -q bitsandbytes accelerate peft transformers


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h

## Model training

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model

# --------------------
# Paths
# --------------------
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
TRAIN_DIR = "/content/drive/MyDrive/tokenized_mistral_batches1/train/"   # input: tokenized batches
SAVE_DIR  = "/content/drive/MyDrive/mistral_qlora_finetuned1"            # output: adapters + tokenizer

# --------------------
# Device
# --------------------
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# --------------------
# Tokenizer
# --------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# --------------------
# QLoRA config
# --------------------
# 4-bit NF4 weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# --------------------
# Load model
# --------------------
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# The generation cache is not used while training.
model.config.use_cache = False

# ✅ keep this (fixes grad error)
model.enable_input_require_grads()

# --------------------
# LoRA config
# --------------------
# Adapters are attached to the four attention projection matrices only.
ATTENTION_PROJECTIONS = ["q_proj", "k_proj", "v_proj", "o_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=ATTENTION_PROJECTIONS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, lora_config)
model.train()

# --------------------
# Optimizer
# --------------------
# Only parameters that require gradients are handed to the optimizer, so the
# frozen base weights are not tracked by it.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-4)

# --------------------
# Load batches
# --------------------
files = sorted(f for f in os.listdir(TRAIN_DIR) if f.endswith(".pt"))
print("Total batches:", len(files))

# Fail fast: with no batches the loop below would do nothing and the notebook
# would go on to save untrained adapters.
if not files:
    raise FileNotFoundError(
        f"No .pt batches found in {TRAIN_DIR}; run the tokenization cell first"
    )

# Running sum of per-step losses, used for the average printed during training.
total_loss = 0.0

# --------------------
# Training loop
# --------------------
# Label value ignored by the language-modeling loss.
IGNORE_INDEX = -100


def build_causal_lm_batch(prompt_ids, prompt_mask, target_ids, pad_id, bos_id, eos_id):
    """Turn separately tokenized (article, summary) rows into causal-LM examples.

    A causal LM scores ``labels`` position by position against ``input_ids``,
    so both must describe the same token sequence. The saved batches hold the
    article in ``input_ids`` and the summary in ``labels``; this builds, per
    row, ``article tokens + summary tokens`` and labels that are IGNORE_INDEX
    over the article (and padding) and equal to the summary tokens elsewhere,
    so the loss is taken on the summary only.

    Args:
        prompt_ids: (rows, len) article token ids, padded.
        prompt_mask: (rows, len) attention mask for ``prompt_ids``.
        target_ids: (rows, len) summary token ids; padding may be ``pad_id``
            or IGNORE_INDEX.
        pad_id: padding token id.
        bos_id: BOS token id (or None); a leading BOS on the summary is dropped
            because the summary no longer starts the sequence.
        eos_id: EOS token id, appended to summaries that were not truncated.

    Returns:
        ``(input_ids, attention_mask, labels)`` as right-padded CPU tensors.
    """
    sequences, targets = [], []
    for ids_row, mask_row, tgt_row in zip(prompt_ids, prompt_mask, target_ids):
        prompt = ids_row[mask_row.bool()]

        # pad_id may equal eos_id; the summary was tokenized without an
        # appended EOS, so removing pad_id is assumed to remove padding only.
        is_real = (tgt_row != IGNORE_INDEX) & (tgt_row != pad_id)
        summary = tgt_row[is_real]
        if bos_id is not None and summary.numel() > 0 and summary[0].item() == bos_id:
            summary = summary[1:]

        # A row without any padding filled the whole length budget and was
        # probably truncated; do not teach the model to stop at the cut-off.
        if not bool(is_real.all()):
            summary = torch.cat([summary, summary.new_tensor([eos_id])])

        sequences.append(torch.cat([prompt, summary]))
        targets.append(torch.cat([torch.full_like(prompt, IGNORE_INDEX), summary]))

    rows = len(sequences)
    width = max(seq.numel() for seq in sequences)
    input_ids = torch.full((rows, width), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((rows, width), dtype=torch.long)
    labels = torch.full((rows, width), IGNORE_INDEX, dtype=torch.long)
    for row, (seq, tgt) in enumerate(zip(sequences, targets)):
        n = seq.numel()
        input_ids[row, :n] = seq
        attention_mask[row, :n] = 1
        labels[row, :n] = tgt
    return input_ids, attention_mask, labels


for step, fname in enumerate(files, 1):

    batch = torch.load(os.path.join(TRAIN_DIR, fname))

    # Sequences are now article + summary, i.e. up to about twice the stored
    # length. If this exhausts GPU memory, store fewer examples per batch file.
    input_ids, attention_mask, labels = build_causal_lm_batch(
        batch["input_ids"],
        batch["attention_mask"],
        batch["labels"],
        pad_id=tokenizer.pad_token_id,
        bos_id=tokenizer.bos_token_id,
        eos_id=tokenizer.eos_token_id,
    )
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
    )

    loss = outputs.loss
    loss.backward()

    optimizer.step()

    total_loss += loss.item()

    if step % 50 == 0:
        print(f"Step {step} | Avg Loss: {total_loss / step:.4f}")

    # Release per-step tensors before the next batch is loaded.
    del batch, input_ids, attention_mask, labels, outputs, loss
    torch.cuda.empty_cache()

# --------------------
# Save adapters
# --------------------
os.makedirs(SAVE_DIR, exist_ok=True)

# Model first, then tokenizer; both are written into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)

print("✅ QLoRA Mistral training completed successfully (hybrid fast mode)")


## Model evaluation

In [None]:
!pip install rouge-score


In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from rouge_score import rouge_scorer

# --------------------
# Paths
# --------------------
BASE_MODEL = "mistralai/Mistral-7B-v0.1"
ADAPTER_DIR = "/content/drive/MyDrive/mistral_qlora_finetuned1"           # trained adapters
TEST_DIR  = "/content/drive/MyDrive/tokenized_mistral_batches1/test/"     # tokenized test batches

# --------------------
# Device
# --------------------
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# --------------------
# Tokenizer
# --------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# In this cell the tokenizer is only used to decode ids and to look up the
# pad/EOS ids; the test prompts are read pre-tokenized from TEST_DIR.
tokenizer.padding_side = "right"

# --------------------
# QLoRA config
# --------------------
# Same quantization settings as the training cell: 4-bit NF4 weights with
# double quantization, float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# --------------------
# Load base model
# --------------------
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
)

# --------------------
# Load adapters
# --------------------
# Wrap the quantized base model with the adapters saved by the training cell,
# then switch to inference mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()

# --------------------
# ROUGE
# --------------------
ROUGE_VARIANTS = ["rouge1", "rouge2", "rougeL"]
scorer = rouge_scorer.RougeScorer(ROUGE_VARIANTS, use_stemmer=True)

# Running sums over all scored examples: F1 first, then precision.
r1, r2, rl = 0.0, 0.0, 0.0
r1p, r2p, rlp = 0.0, 0.0, 0.0
count = 0   # number of (prediction, reference) pairs scored

files = sorted(name for name in os.listdir(TEST_DIR) if name.endswith(".pt"))
print("Test batches:", len(files))

# --------------------
# Evaluation loop
# --------------------
def left_pad_prompts(input_ids, attention_mask, pad_id):
    """Re-pack padded prompts so that all padding sits on the left.

    A decoder-only model continues from the last position of each row. With
    right-padded prompts that position is a pad token, so generation would
    start after padding instead of after the article text.

    Args:
        input_ids: (rows, len) token ids, padded on either side.
        attention_mask: (rows, len) mask, 1 for real tokens and 0 for padding.
        pad_id: id used to fill the padding positions.

    Returns:
        ``(input_ids, attention_mask)`` trimmed to the longest real prompt in
        the batch, with every row's real tokens flush against the right edge.
    """
    keep = attention_mask.bool()
    lengths = keep.sum(dim=1).tolist()
    width = max(lengths, default=0)

    padded_ids = torch.full((len(lengths), width), pad_id, dtype=input_ids.dtype)
    padded_mask = torch.zeros((len(lengths), width), dtype=attention_mask.dtype)
    for row, n in enumerate(lengths):
        if n:
            padded_ids[row, width - n:] = input_ids[row][keep[row]]
            padded_mask[row, width - n:] = 1
    return padded_ids, padded_mask


for fname in files:

    batch = torch.load(os.path.join(TEST_DIR, fname))

    prompt_ids, prompt_mask = left_pad_prompts(
        batch["input_ids"], batch["attention_mask"], tokenizer.pad_token_id
    )
    prompt_ids = prompt_ids.to(device)
    prompt_mask = prompt_mask.to(device)

    # -100 marks ignored label positions and is not a valid token id; map it
    # back to the pad token so the references can be decoded.
    labels = batch["labels"].clone()
    labels[labels == -100] = tokenizer.pad_token_id

    with torch.no_grad():
        outputs = model.generate(
            input_ids=prompt_ids,
            attention_mask=prompt_mask,
            max_new_tokens=120,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation for decoder-only models. Keep
    # only the continuation, otherwise ROUGE would score the echoed article.
    gen_only = outputs[:, prompt_ids.shape[1]:]

    preds = tokenizer.batch_decode(gen_only, skip_special_tokens=True)
    refs  = tokenizer.batch_decode(labels, skip_special_tokens=True)

    for pred, ref in zip(preds, refs):
        # RougeScorer.score takes (target, prediction), in that order.
        scores = scorer.score(ref, pred)

        r1  += scores["rouge1"].fmeasure
        r2  += scores["rouge2"].fmeasure
        rl  += scores["rougeL"].fmeasure

        r1p += scores["rouge1"].precision
        r2p += scores["rouge2"].precision
        rlp += scores["rougeL"].precision

        count += 1

# --------------------
# Results
# --------------------
# Without any scored example the averages are undefined; report that clearly
# instead of failing with a ZeroDivisionError.
if count == 0:
    raise ValueError("No test examples were scored; check that TEST_DIR contains .pt batches")

print("\nROUGE RESULTS (Mistral QLoRA)")

# Each line is the accumulated score divided by the number of scored pairs.
for metric_name, total in (
    ("ROUGE-1 F1", r1),
    ("ROUGE-2 F1", r2),
    ("ROUGE-L F1", rl),
    ("ROUGE-1 Precision", r1p),
    ("ROUGE-2 Precision", r2p),
    ("ROUGE-L Precision", rlp),
):
    print(f"{metric_name}:", round(total / count, 4))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Data
# Scores are hard-coded here rather than computed in this cell.
# NOTE(review): presumably copied from earlier evaluation runs -- confirm they
# still match the current pipeline.
rouge_metrics = ["ROUGE-1", "ROUGE-2", "ROUGE-L"]
scores_by_run = {
    "Summary-only": [0.1137, 0.0271, 0.0749],
    "Article and Summary": [0.5254, 0.3050, 0.3951],
}

# One row per (run, metric) pair, in the long format seaborn's `hue` expects.
df = pd.DataFrame(
    [
        {"Metric": metric, "Score": score, "Model": run_name}
        for run_name, run_scores in scores_by_run.items()
        for metric, score in zip(rouge_metrics, run_scores)
    ]
)

# Plot
ax = sns.barplot(data=df, x="Metric", y="Score", hue="Model")
ax.set_title("ROUGE Comparison")
plt.show()
