<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/CustomTrainingOnTranscripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================================
# Mount Google Drive and define base path
# =========================================================
from google.colab import drive
import os

# Mount Google Drive at /content/drive unless something is already mounted
# there, so re-running this cell does not trigger a duplicate-mount warning
if not os.path.ismount("/content/drive"):
    drive.mount("/content/drive", force_remount=False)

# Root folder on Drive for the project's files; created if it does not exist
BASE_PATH = "/content/drive/MyDrive/ArabicVideoSummariser"
os.makedirs(BASE_PATH, exist_ok=True)
print(f"Base path set to: {BASE_PATH}")

In [None]:
# Uninstall the existing Hugging Face stack so that the pinned versions
# installed in the next cell start from a clean state.
# NOTE: diffusers and gradio are removed here but are not reinstalled later.
!pip uninstall -y transformers huggingface_hub tokenizers accelerate diffusers gradio

In [None]:
# Install pinned versions of the training / evaluation dependencies.
# NOTE(review): later cells also import `sentence_transformers` and read an
# .xlsx file with pandas, but neither sentence-transformers nor an Excel reader
# is installed here — presumably both come with the Colab image; confirm.
!pip install -q \
  "transformers==4.46.3" \
  "huggingface-hub==0.35.3" \
  "tokenizers==0.20.3" \
  "datasets==2.19.1" \
  "evaluate>=0.4.2,<0.5.0"  \
  "rouge-score==0.1.2" \
  "bert-score==0.3.13" \
  "matplotlib==3.8.4" \
  "pandas==2.2.2" \
  "accelerate>=0.30.0,<0.35.0"

In [None]:
# ============================================================
# Custom Fine-Tuning on Arabic Transcript–Summary Pairs
# (ROUGE-L + BERTScore + SemanticSim)
# ============================================================

import os, torch, evaluate, numpy as np, warnings
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer,
    EarlyStoppingCallback
)
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bert_score_fn
from rouge_score import rouge_scorer, scoring

# ---- Project locations on Google Drive ----
BASE_DIR = "/content/drive/MyDrive/ArabicVideoSummariser"
EXCEL_PATH = os.path.join(BASE_DIR, "transcripts.xlsx")

# ---- Keep the notebook output readable ----
# Hide Future/User warnings and lower transformers logging to errors only.
for warning_category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=warning_category)
import transformers
transformers.logging.set_verbosity_error()

# ---- Prefer the GPU when one is available ----
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# ---- Load dataset ----
# Rows missing either the transcript or its summary are dropped, and the index
# is renumbered from 0.
df = (
    pd.read_excel(EXCEL_PATH)
    .dropna(subset=['transcript', 'summary'])
    .reset_index(drop=True)
)

# ---- Split into train / val / test (80 / 10 / 10) ----
# Shuffle once with a fixed seed, then cut the shuffled frame by position.
# Positional slicing yields the same three partitions as
# np.split(shuffled, [80%, 90%]) without going through the deprecated
# DataFrame.swapaxes that np.split triggers on a DataFrame.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(0.8 * len(df))
val_end = int(0.9 * len(df))

train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})

# ---- Choose base model ----
# Alternative checkpoints kept for quick switching (uncomment exactly one):
#BASE_MODEL = "csebuetnlp/mT5_multilingual_XLSum"
#BASE_MODEL = "moussaKam/AraBART"

BASE_MODEL = "ahmeddbahaa/AraBART-finetuned-ar"
# Short name (the part after the "/") used to label output folders and reports
MODEL_NAME = BASE_MODEL.split("/")[-1]
# use_fast=False requests the slow (Python) tokenizer implementation
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)

# Token limits applied (with truncation) to transcripts and summaries
max_input_length = 512
max_target_length = 128

def preprocess(batch):
    """Tokenize a batch of transcript/summary pairs for seq2seq training.

    Args:
        batch: mapping with a "transcript" and a "summary" column.

    Returns:
        The tokenized transcripts, with the token ids of the tokenized
        summaries stored under "labels".
    """
    encoded = tokenizer(
        batch["transcript"],
        max_length=max_input_length,
        truncation=True,
    )
    target_encoding = tokenizer(
        batch["summary"],
        max_length=max_target_length,
        truncation=True,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded


# Tokenize every split and drop the original columns of the train split so
# only the tokenizer outputs remain.
tokenized_ds = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# ---- Metrics ----
class WhitespaceTokenizer:
    """Tokenizer for ROUGE that simply splits on whitespace.

    The default ``rouge_score`` tokenizer keeps only ``[a-z0-9]`` characters,
    which strips every Arabic word and makes ROUGE collapse to 0. Splitting on
    whitespace keeps Arabic tokens intact.
    """

    def tokenize(self, text):
        """Return the whitespace-separated tokens of *text*."""
        return text.split()


def load_safe_metrics():
    """Load ROUGE-L and BERTScore safely, even if HF evaluate fails.

    Both the Hugging Face ``evaluate`` ROUGE metric and the local
    ``rouge_score`` fallback are configured with a whitespace tokenizer so
    that Arabic text is actually scored.

    Returns:
        ``(rouge, bertscore)``: two objects exposing
        ``compute(predictions=..., references=...)``. The ROUGE result contains
        at least ``"rougeL"``; the BERTScore result contains ``"f1"``.
    """
    arabic_tokenizer = WhitespaceTokenizer()

    try:
        hf_rouge = evaluate.load("rouge")
    except Exception:
        print("Using local fallback for ROUGE-L")

        class RougeFallback:
            def compute(self, predictions, references, use_stemmer=True):
                # use_stemmer is kept for interface compatibility with callers.
                scorer = rouge_scorer.RougeScorer(
                    ["rougeL"],
                    use_stemmer=use_stemmer,
                    tokenizer=arabic_tokenizer,
                )
                aggregator = scoring.BootstrapAggregator()
                for p, r in zip(predictions, references):
                    aggregator.add_scores(scorer.score(r, p))
                result = aggregator.aggregate()
                return {"rougeL": result["rougeL"].mid.fmeasure}

        rouge = RougeFallback()
    else:
        class RougeWithWhitespaceTokens:
            """Thin wrapper that defaults the ROUGE tokenizer to whitespace splitting."""

            def compute(self, predictions, references, **kwargs):
                # Callers may still override the tokenizer explicitly.
                kwargs.setdefault("tokenizer", arabic_tokenizer.tokenize)
                return hf_rouge.compute(
                    predictions=predictions, references=references, **kwargs
                )

        rouge = RougeWithWhitespaceTokens()

    try:
        bertscore = evaluate.load("bertscore")
    except Exception:
        print("Using local fallback for BERTScore")

        class BertFallback:
            def compute(self, predictions, references, lang="ar"):
                P, R, F1 = bert_score_fn(predictions, references, lang=lang, verbose=False)
                return {"f1": F1.tolist()}

        bertscore = BertFallback()

    return rouge, bertscore

rouge, bertscore = load_safe_metrics()

# ---- Load LaBSE model for semantic similarity ----
sem_model = SentenceTransformer("sentence-transformers/LaBSE").to(device)


def compute_metrics(eval_pred):
    """Score generated summaries against their references.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with float entries ``"rougeL"``, ``"bertscore_f1"`` (mean F1) and
        ``"semantic_sim"`` (mean LaBSE cosine similarity of each
        prediction/reference pair).
    """
    generated_ids, label_ids = eval_pred
    if isinstance(generated_ids, tuple):
        generated_ids = generated_ids[0]

    def _decode(token_ids):
        # Ids that are negative or >= vocab_size are replaced by id 0 before
        # decoding; blank results become the placeholder "[EMPTY]".
        invalid = (token_ids < 0) | (token_ids >= tokenizer.vocab_size)
        texts = tokenizer.batch_decode(
            np.where(invalid, 0, token_ids), skip_special_tokens=True
        )
        return [text.strip() or "[EMPTY]" for text in texts]

    decoded_preds = _decode(generated_ids)
    decoded_labels = _decode(label_ids)

    # ---- ROUGE-L ----
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # ---- BERTScore ----
    bert_result = bertscore.compute(
        predictions=decoded_preds, references=decoded_labels, lang="ar"
    )
    f1_values = bert_result.get("f1", 0.0)
    if isinstance(f1_values, (list, np.ndarray)):
        bert_f1_mean = float(np.mean(f1_values))
    else:
        bert_f1_mean = float(f1_values)

    # ---- Semantic similarity (LaBSE) ----
    # The diagonal of the full similarity matrix pairs each prediction with
    # its own reference.
    pred_embeddings = sem_model.encode(
        decoded_preds, convert_to_tensor=True, show_progress_bar=False
    )
    ref_embeddings = sem_model.encode(
        decoded_labels, convert_to_tensor=True, show_progress_bar=False
    )
    paired_scores = util.cos_sim(pred_embeddings, ref_embeddings).diagonal()
    semantic_mean = float(np.mean(paired_scores.detach().cpu().numpy()))

    return {
        "rougeL": float(rouge_result.get("rougeL", 0.0)),
        "bertscore_f1": bert_f1_mean,
        "semantic_sim": semantic_mean,
    }

# ---- Load model ----
# Pretrained seq2seq weights for BASE_MODEL, moved to the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

# ---- Training setup ----
args = Seq2SeqTrainingArguments(
    # Checkpoints go under /content, i.e. not under the mounted Drive folder
    output_dir=f"/content/{MODEL_NAME}_finetuned_ROUGEL_SEMANTIC",
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    # Best checkpoint = highest "semantic_sim" as returned by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="semantic_sim",
    greater_is_better=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=4,
    weight_decay=0.01,
    # Generate summaries during evaluation, capped at max_target_length tokens
    predict_with_generate=True,
    generation_max_length=max_target_length,
    logging_strategy="epoch",
    report_to="none",
    disable_tqdm=True
)

# Collator for seq2seq batches, built from the tokenizer and the model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 2 evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# ---- Train ----
trainer.train()

# ---- Evaluate on test set ----
metrics = trainer.evaluate(tokenized_ds["test"])

# Keep only the three reported scores (in the order the trainer returned them);
# `wanted` is reused when the metrics summary is written to disk.
wanted = {
    name: value
    for name, value in metrics.items()
    if name in ("eval_rougeL", "eval_bertscore_f1", "eval_semantic_sim")
}

print("\n===== Final Test Metrics =====")
for name in ["eval_rougeL", "eval_bertscore_f1", "eval_semantic_sim"]:
    value = wanted.get(name)
    # Missing or non-numeric entries are skipped silently.
    if isinstance(value, (int, float)):
        print(f"{name}: {value:.4f}")


In [None]:
# ============================================================
# 📦 Save Fine-Tuned Model to Drive
# Target: /content/drive/MyDrive/Videosummarisation/models/<timestamped_model_name>
# ============================================================
import os
from datetime import datetime

# ---- Define target path ----
SAVE_ROOT = "/content/drive/MyDrive/Videosummarisation/models"
os.makedirs(SAVE_ROOT, exist_ok=True)

# Create timestamped folder so repeated runs never overwrite each other
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
save_dir = os.path.join(SAVE_ROOT, f"{MODEL_NAME}_finetuned_{timestamp}")
os.makedirs(save_dir, exist_ok=True)

print(f"💾 Saving fine-tuned model to: {save_dir}")

# ---- Save model, tokenizer, and generation config ----
# Saving stays best-effort (a failure is reported, not raised), but the
# outcome is tracked so the closing message cannot claim success after an error.
model_saved = False
try:
    model.save_pretrained(save_dir, safe_serialization=False)
    tokenizer.save_pretrained(save_dir)
    # A model may carry generation_config=None; only save a real config.
    generation_config = getattr(model, "generation_config", None)
    if generation_config is not None:
        generation_config.save_pretrained(save_dir)
except Exception as e:
    print(f" Save failed: {e}")
else:
    model_saved = True
    print("✅ Model, tokenizer, and generation config saved successfully.")

# ---- Optional: save metrics summary ----
# `wanted` holds the test metrics selected after evaluation; any problem
# (missing variable, non-numeric value, I/O error) is reported and skipped.
metrics_path = os.path.join(save_dir, "final_test_metrics.txt")
try:
    with open(metrics_path, "w", encoding="utf-8") as f:
        f.write("===== Final Test Metrics =====\n")
        for k, v in wanted.items():
            f.write(f"{k}: {v:.4f}\n")
    print(f"📄 Metrics saved to {metrics_path}")
except Exception as e:
    print(f" Metrics file not written: {e}")

# ---- Optional: Zip the folder (for backup or download) ----
# import shutil
# zip_path = f"{save_dir}.zip"
# shutil.make_archive(save_dir, "zip", save_dir)
# print(f" Zipped model saved as: {zip_path}")

if model_saved:
    print("\n Model Saved in  'Videosummarisation/models/'")
else:
    print("\n Model was NOT saved; see the error reported above.")


In [None]:
# ============================================================
# FULL TEST-SET EVALUATION (ROUGE-L + BERTScore + Semantic-Sim)
# ============================================================
import numpy as np, pandas as pd, matplotlib.pyplot as plt, textwrap, random, os, evaluate, warnings, torch
from sentence_transformers import SentenceTransformer, util

# Hide User/Future warnings for this evaluation cell
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# ---------- 0) Helpers ----------
def safe_batch_decode(tokenizer, sequences):
    """Decode token-id sequences after dropping ids the tokenizer cannot map.

    Args:
        tokenizer: object exposing ``vocab_size``, ``pad_token_id`` and
            ``batch_decode``.
        sequences: iterable of token-id sequences (lists or NumPy arrays).

    Returns:
        Whatever ``tokenizer.batch_decode(..., skip_special_tokens=True)``
        returns for the cleaned sequences.
    """
    vocab_limit = tokenizer.vocab_size
    sanitized = []
    for token_ids in sequences:
        if isinstance(token_ids, np.ndarray):
            token_ids = token_ids.tolist()
        # Keep only ids inside [0, vocab_size); negative or too-large ids are dropped.
        kept = [int(token) for token in token_ids if 0 <= token < vocab_limit]
        if not kept:
            # A sequence with nothing left is decoded as a single pad token.
            kept = [tokenizer.pad_token_id]
        sanitized.append(kept)
    return tokenizer.batch_decode(sanitized, skip_special_tokens=True)

class WhitespaceTokenizer:
    """Tokenizer for ROUGE that simply splits on whitespace.

    The default ``rouge_score`` tokenizer keeps only ``[a-z0-9]`` characters,
    which strips every Arabic word and makes ROUGE collapse to 0. Splitting on
    whitespace keeps Arabic tokens intact.
    """

    def tokenize(self, text):
        """Return the whitespace-separated tokens of *text*."""
        return text.split()


def load_metrics():
    """Load ROUGE and BERTScore, falling back to the local packages on failure.

    The offline flags for ``evaluate`` / ``datasets`` are cleared first so the
    metric scripts can be fetched. ROUGE is configured with a whitespace
    tokenizer on both paths so that Arabic text is actually scored.

    Returns:
        ``(rouge, bertscore)``: two objects exposing
        ``compute(predictions=..., references=...)``. The ROUGE result contains
        at least ``"rougeL"``; the BERTScore result contains ``"f1"``.
    """
    os.environ.pop("HF_EVALUATE_OFFLINE", None)
    os.environ.pop("HF_DATASETS_OFFLINE", None)

    arabic_tokenizer = WhitespaceTokenizer()

    try:
        hf_rouge = evaluate.load("rouge")
    except Exception:
        from rouge_score import rouge_scorer, scoring

        class RougeFallback:
            def compute(self, predictions, references, use_stemmer=True):
                # use_stemmer is kept for interface compatibility with callers.
                scorer = rouge_scorer.RougeScorer(
                    ["rougeL"],
                    use_stemmer=use_stemmer,
                    tokenizer=arabic_tokenizer,
                )
                agg = scoring.BootstrapAggregator()
                for p, r in zip(predictions, references):
                    agg.add_scores(scorer.score(r, p))
                res = agg.aggregate()
                return {"rougeL": res["rougeL"].mid.fmeasure}

        rouge = RougeFallback()
    else:
        class RougeWithWhitespaceTokens:
            """Thin wrapper that defaults the ROUGE tokenizer to whitespace splitting."""

            def compute(self, predictions, references, **kwargs):
                # Callers may still override the tokenizer explicitly.
                kwargs.setdefault("tokenizer", arabic_tokenizer.tokenize)
                return hf_rouge.compute(
                    predictions=predictions, references=references, **kwargs
                )

        rouge = RougeWithWhitespaceTokens()

    try:
        bertscore = evaluate.load("bertscore")
    except Exception:
        from bert_score import score

        class BertFallback:
            def compute(self, predictions, references, lang="ar"):
                P, R, F1 = score(predictions, references, lang=lang, verbose=False)
                return {"precision": P.tolist(), "recall": R.tolist(), "f1": F1.tolist()}

        bertscore = BertFallback()

    return rouge, bertscore

rouge, bertscore = load_metrics()

# ---------- 1) Generate predictions ----------
print("🚀 Generating summaries for the entire test set...")
pred_out = trainer.predict(tokenized_ds["test"], max_length=128)

# predictions may arrive as a tuple; the token ids are then its first element
pred_ids = pred_out.predictions
if isinstance(pred_ids, tuple):
    pred_ids = pred_ids[0]
pred_texts = safe_batch_decode(tokenizer, pred_ids)

# Reference summaries and source transcripts come from the untokenized test split
ref_texts, src_texts = (
    list(dataset["test"][column]) for column in ("summary", "transcript")
)

# ---------- 2) Compute per-sample metrics ----------
print("⚙️ Computing ROUGE-L, BERTScore, and Semantic-Sim (LaBSE)...")

# --- Initialize Semantic Model (LaBSE) ---
device = "cuda" if torch.cuda.is_available() else "cpu"
sem_model = SentenceTransformer("sentence-transformers/LaBSE").to(device)

# --- ROUGE & BERTScore over the whole test set ---
rouge_scores = rouge.compute(predictions=pred_texts, references=ref_texts)
bert_scores = bertscore.compute(predictions=pred_texts, references=ref_texts, lang="ar")


def _rouge_l_percent(prediction, reference):
    """ROUGE-L of a single prediction/reference pair, scaled to percent."""
    single = rouge.compute(predictions=[prediction], references=[reference])
    return single.get("rougeL", 0.0) * 100


def _semantic_sim_percent(prediction, reference):
    """Cosine similarity of the LaBSE embeddings of one pair, scaled to percent."""
    pred_embedding = sem_model.encode(prediction, convert_to_tensor=True)
    ref_embedding = sem_model.encode(reference, convert_to_tensor=True)
    return float(util.cos_sim(pred_embedding, ref_embedding).cpu().item()) * 100


rougeL_per_sample = []
semantic_sim_per_sample = []
for hypothesis, target in zip(pred_texts, ref_texts):
    rougeL_per_sample.append(_rouge_l_percent(hypothesis, target))
    semantic_sim_per_sample.append(_semantic_sim_percent(hypothesis, target))

# BERTScore already returns one F1 value per sample
bert_f1_per_sample = np.array(bert_scores["f1"]) * 100.0

# ---------- 3) Build a per-sample dataframe ----------
def clip(s, n=320):
    """Flatten *s* to one line and shorten it to at most *n* characters.

    Newlines become spaces. A falsy value (``None``, ``""``) yields ``""``.
    Text longer than *n* is cut to *n* characters and followed by "…".
    """
    flat = ("" if not s else s).replace("\n", " ")
    if len(flat) <= n:
        return flat
    return flat[:n] + "…"

# One row per test sample: the three scores (percent, 2 decimals) plus
# clipped versions of the prediction, reference and source transcript
df = pd.DataFrame(
    {
        "idx": np.arange(len(pred_texts)),
        "ROUGE_L_%": np.round(rougeL_per_sample, 2),
        "BERT_F1_%": np.round(bert_f1_per_sample, 2),
        "Semantic_Sim_%": np.round(semantic_sim_per_sample, 2),
        "Prediction": list(map(clip, pred_texts)),
        "Reference": list(map(clip, ref_texts)),
        "Transcript_snip": list(map(clip, src_texts)),
    }
)

OUT_DIR = "/content/eval_outputs_full"
os.makedirs(OUT_DIR, exist_ok=True)
out_csv = os.path.join(OUT_DIR, "test_full_metrics.csv")
# Written with the utf-8-sig encoding
df.to_csv(out_csv, index=False, encoding="utf-8-sig")
print(f"✅ Saved per-sample metrics → {out_csv}")

# ---------- 4) Summary statistics ----------
mean_rougeL, mean_bert, mean_sem = (
    float(df[column].mean())
    for column in ("ROUGE_L_%", "BERT_F1_%", "Semantic_Sim_%")
)

print(f"\n===== {MODEL_NAME} =====")
print(f"Mean ROUGE-L: {mean_rougeL:.2f}%")
print(f"Mean BERTScore F1: {mean_bert:.2f}%")
print(f"Mean Semantic-Sim: {mean_sem:.2f}%")

# ---------- 5) Plot distributions ----------
# One histogram per metric, side by side: (column, bar colour, title, x-label)
histogram_specs = [
    ("BERT_F1_%", "skyblue", "BERTScore F1 Distribution", "F1 (%)"),
    ("Semantic_Sim_%", "mediumseagreen", "Semantic Similarity (LaBSE) Distribution", "Cosine Similarity (%)"),
    ("ROUGE_L_%", "lightcoral", "ROUGE-L Distribution", "ROUGE-L (%)"),
]

plt.figure(figsize=(14, 4))
for panel, (column, bar_color, panel_title, x_label) in enumerate(histogram_specs, start=1):
    plt.subplot(1, 3, panel)
    plt.hist(df[column], bins=20, color=bar_color, edgecolor="black")
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel("Count")
    plt.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "metric_distributions.png"), dpi=150)
plt.show()

# ---------- 6) Show representative samples ----------
# Index labels of the rows with the highest / lowest LaBSE similarity score
best_i  = int(df["Semantic_Sim_%"].idxmax())
worst_i = int(df["Semantic_Sim_%"].idxmin())

def show_case(row, title):
    """Print one evaluated sample: banner, scores, then the three texts.

    Args:
        row: mapping with the keys "idx", "ROUGE_L_%", "BERT_F1_%",
            "Semantic_Sim_%", "Transcript_snip", "Reference" and "Prediction".
        title: heading printed between two 80-character rules.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)
    print(
        f"[idx={int(row['idx'])}] | ROUGE-L: {row['ROUGE_L_%']:.2f}% | "
        f"BERT F1: {row['BERT_F1_%']:.2f}% | Semantic-Sim: {row['Semantic_Sim_%']:.2f}%"
    )
    # Each text is wrapped to 100 columns under its own heading.
    sections = (
        ("Transcript (snippet)", "Transcript_snip"),
        ("Reference", "Reference"),
        ("Prediction", "Prediction"),
    )
    for heading, field in sections:
        print("\n" + heading + ":\n" + textwrap.fill(row[field], width=100))

print("\n===== Qualitative Test Samples =====")

# Best and worst rows by semantic similarity, then up to two random rows
for case_title, row_label in (
    ("BEST SAMPLE (Semantic Similarity)", best_i),
    ("WORST SAMPLE (Semantic Similarity)", worst_i),
):
    show_case(df.loc[row_label], case_title)

random_labels = random.sample(list(df.index), k=min(2, len(df)))
for row_label in random_labels:
    show_case(df.loc[row_label], "RANDOM SAMPLE")

print(f"\n🖼️ Figures & CSV saved under: {OUT_DIR}")
