<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/05_summarise_batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup

In [1]:
# Install every dependency in a single step so that the version constraints
# below are the only ones pip is asked to satisfy.
!pip install -q \
  "transformers==4.46.3" \
  "tokenizers==0.20.3" \
  "datasets==2.19.1" \
  "evaluate>=0.4.2,<0.5.0" \
  "rouge-score==0.1.2" \
  "bert-score==0.3.13" \
  "accelerate>=0.30.0,<0.35.0" \
  sentence-transformers \
  "sentencepiece>=0.1.99" "sacremoses"


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m100.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m34.4 MB/s[0m eta [36m0:00

In [2]:
# ---- Clean up warning ----
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

#  Load Model & Define File Paths

In [3]:
# =========================================================
# Google Drive: mount it if necessary and set the project root
# =========================================================
import os

from google.colab import drive

# Only call mount() when nothing is mounted at the Drive location yet,
# so re-running this cell does not trigger another mount.
if not os.path.ismount("/content/drive"):
    drive.mount("/content/drive", force_remount=False)

# Root folder for all project files; created if it does not exist.
BASE_PATH = "/content/drive/MyDrive/ArabicVideoSummariser"
os.makedirs(BASE_PATH, exist_ok=True)


Mounted at /content/drive


In [32]:
# =========================================================
# Select the video to process
# =========================================================
import json
import os

# Path of the shared parameter file under the project root.
params_path = os.path.join(BASE_PATH, "params.json")

# NOTE: reading the file name from params.json is currently disabled and the
# video is pinned by hand. To restore the lookup, json.load() the file at
# params_path, take its "video_file" entry, and assert that it is not empty.
video_filename = "Qorsaya.mp4"

# File name without its extension; used to build per-video paths.
video_name = os.path.splitext(video_filename)[0]

In [33]:
# =========================================================
# Per-video input / output locations (all below BASE_PATH)
# =========================================================

# Folder holding the validated input files for this video.
validated_path = os.path.join(BASE_PATH, f"Validated/Batch/R2/{video_name}")

# Folder for the generated summary files of this video.
summaries_path = os.path.join(BASE_PATH, f"summaries/Batch/R3/{video_name}")

# Reference summary text file for this video.
reference_file = os.path.join(BASE_PATH, f"summaries/{video_name}_Reference.txt")

# Excel workbook for this video's batch results.
excel_out = os.path.join(BASE_PATH, f"summaries/Batch/R3/{video_name}.xlsx")


In [34]:
# ---------------------------
# Load the fine-tuned summarisation checkpoint
# ---------------------------
import json
import os
import re

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Checkpoint directory under the project root.
MODEL_PATH = os.path.join(BASE_PATH,"models/AraBART-finetuned-ar_finetuned_20251110_1546")

# Tokenizer and sequence-to-sequence model are both read from that directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)


# Helper Functions

In [35]:
# ============================================================
# Arabic Summarization Evaluation (ROUGE + BERTScore + LaBSE)
# ============================================================

import os
import re

import evaluate
import torch
from sentence_transformers import SentenceTransformer, util

# --- Metric objects, created once at cell level ---
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Run the LaBSE encoder on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
labse = SentenceTransformer("sentence-transformers/LaBSE").to(device)

# ============================================
# Evaluation Function
# ============================================
def eval_ar_summary(summary, reference_path=None, *, verbose=False):
    """Score a generated Arabic summary against a reference summary.

    Both texts receive the same light normalisation and are then compared with
    ROUGE (character-level tokens), BERTScore (``xlm-roberta-base``) and the
    cosine similarity of their LaBSE embeddings.

    Args:
        summary: Generated summary text.
        reference_path: Path of the UTF-8 text file holding the reference
            summary. When ``None``, the notebook-level ``reference_file``
            is used instead.
        verbose: When true, print every score followed by the generated summary.

    Returns:
        dict mapping ``"ROUGE1"``, ``"ROUGE2"``, ``"ROUGEL"``, ``"Precision"``,
        ``"Recall"``, ``"F1"`` and ``"LaBSE"`` to float scores
        (Precision/Recall/F1 are the BERTScore values).

    Raises:
        OSError: If the reference file cannot be opened (for example
            ``FileNotFoundError`` when the path does not exist).
    """
    # Honour the caller-supplied path; fall back to the notebook global only
    # when no path was given, so existing one-argument calls keep working.
    ref_path = reference_file if reference_path is None else reference_path

    with open(ref_path, "r", encoding="utf-8") as f:
        reference_summary = f.read().strip()

    # --- Arabic normalization (light) ---
    def normalize_ar(text: str) -> str:
        """Strip diacritics, unify common letter variants, collapse whitespace."""
        text = re.sub(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]", "", text)  # remove diacritics
        text = re.sub(r"[إأآا]", "ا", text)  # alef variants -> bare alef
        text = text.replace("ى", "ي").replace("ة", "ه").replace("ؤ", "و").replace("ئ", "ي")
        text = re.sub(r"\s+", " ", text).strip()
        return text

    ref_norm  = normalize_ar(reference_summary)
    pred_norm = normalize_ar(summary)

    # --- ROUGE (character-level tokenizer for Arabic) ---
    rouge_result = rouge.compute(
        predictions=[pred_norm],
        references=[ref_norm],
        tokenizer=lambda x: list(x)
    )

    # --- BERTScore ---
    bertscore_result = bertscore.compute(
        predictions=[pred_norm],
        references=[ref_norm],
        lang="ar",
        model_type="xlm-roberta-base"
    )

    # --- LaBSE similarity: each text is embedded as a single unit ---
    ref_emb  = labse.encode([ref_norm],  convert_to_tensor=True, normalize_embeddings=True)
    pred_emb = labse.encode([pred_norm], convert_to_tensor=True, normalize_embeddings=True)
    labse_score = float(util.cos_sim(pred_emb, ref_emb).item())

    metrics = {
        "ROUGE1": float(rouge_result["rouge1"]),
        "ROUGE2": float(rouge_result["rouge2"]),
        "ROUGEL": float(rouge_result["rougeL"]),
        # A single prediction/reference pair is scored, hence index 0.
        "Precision": float(bertscore_result["precision"][0]),
        "Recall": float(bertscore_result["recall"][0]),
        "F1": float(bertscore_result["f1"][0]),
        "LaBSE": labse_score,
    }

    if verbose:
        print("=== ROUGE Scores ===")
        print(f"rouge1    : {metrics['ROUGE1']:.4f}")
        print(f"rouge2    : {metrics['ROUGE2']:.4f}")
        print(f"rougeL    : {metrics['ROUGEL']:.4f}")
        print("\n=== BERTScore ===")
        print(f"Precision : {metrics['Precision']:.4f}")
        print(f"Recall    : {metrics['Recall']:.4f}")
        print(f"F1        : {metrics['F1']:.4f}")
        print("\n=== LaBSE Semantic Similarity ===")
        print(f"Sentence-level cosine similarity: {metrics['LaBSE']:.4f}")
        print("\n--- Generated Summary ---")
        print(summary)

    return metrics

# ============================================================
# Summarization Function
# ============================================================
def summarize_file(
    input_path: str,
    output_path: str,
    input_max_len: int = 1024,
    max_new_tokens: int = 400,
    min_new_tokens: int = 80,
    num_beams: int = 4,
    no_repeat_ngram_size: int = 3,
    length_penalty: float = 1.0,
    repetition_penalty: float = 1.2,
) -> str:
    """Summarise one text file with the loaded model and save the result.

    The file is read as UTF-8, tokenised (truncated to ``input_max_len``
    tokens), summarised with beam search and written to ``output_path``.
    Scoring is not done here; pass the returned text to ``eval_ar_summary``
    when metrics are needed.

    Args:
        input_path: Text file to summarise.
        output_path: Destination of the summary; missing parent folders are created.
        input_max_len: Maximum number of input tokens; longer inputs are truncated.
        max_new_tokens: Forwarded to ``model.generate``.
        min_new_tokens: Forwarded to ``model.generate``.
        num_beams: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        length_penalty: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.

    Returns:
        The generated summary text (also written to ``output_path``).

    Raises:
        AssertionError: If ``input_path`` does not exist.
        ValueError: If the input file contains only whitespace.
    """
    # Safety checks (kept as an assert so callers see the same exception type)
    assert os.path.exists(input_path), f"Input file not found: {input_path}"

    # Read and validate the input before touching the model, so bad inputs
    # fail fast without moving any weights.
    with open(input_path, "r", encoding="utf-8") as f:
        full_text = f.read().strip()

    if not full_text:
        raise ValueError(f"Input file is empty: {input_path}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()

    # Tokenize (truncate if needed)
    inputs = tokenizer(
        full_text,
        return_tensors="pt",
        truncation=True,
        max_length=input_max_len
    ).to(device)

    # Generate summary
    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            min_new_tokens=min_new_tokens,
            num_beams=num_beams,
            no_repeat_ngram_size=no_repeat_ngram_size,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            early_stopping=True
        )

    summary = tokenizer.decode(out_ids[0], skip_special_tokens=True).strip()

    # Save. A bare file name has an empty dirname, which os.makedirs rejects,
    # so the folder is only created when there is one.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(summary)

    print(f"\nSummary saved to: {output_path}")

    # The summary used to be scored here with the result thrown away, which
    # doubled the metric cost per file; it is returned to the caller instead.
    return summary


# Batch Summarisation

In [36]:
# ============================================================
# Batch Summarization + Save to Excel
# ============================================================
import os, re
import pandas as pd
from tqdm import tqdm

results = []

# Expected file name: <video>_Validated_SIM<sim>_ALFA<alfa>, optionally followed
# by more text such as an extension, e.g. Video_Validated_SIM0.35_ALFA0.35...
# The numeric groups also accept dots, so a trailing "." from an extension can
# end up in the capture; it is stripped before conversion below.
pattern = re.compile(r"(?P<video>.+?)_Validated_SIM(?P<SIM>[\d.]+)_ALFA(?P<ALFA>[\d.]+)")

# --- Loop through all validated files ---
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the row order in the workbook) reproducible.
for fname in tqdm(sorted(os.listdir(validated_path))):
    m = pattern.match(fname)
    if not m:
        continue

    video = m.group("video")
    # Clean numeric fields to handle trailing dots
    sim_str = m.group("SIM").rstrip(".")
    alfa_str = m.group("ALFA").rstrip(".")

    try:
        sim = float(sim_str)
        alfa = float(alfa_str)
    except ValueError:
        print(f"Skipping file due to bad SIM/ALFA values: {fname}")
        continue

    input_path = os.path.join(validated_path, fname)
    output_path = os.path.join(summaries_path, f"{video}_SIM{sim}_ALFA{alfa}_summary.txt")

    # Generate summary
    summarize_file(
        input_path=input_path,
        output_path=output_path,
        input_max_len=1024,
        max_new_tokens=400,
        min_new_tokens=80,
        num_beams=4,
    )

    # Evaluate the summary exactly as it was stored on disk
    with open(output_path, "r", encoding="utf-8") as f:
        summary_text = f.read().strip()

    metrics = eval_ar_summary(summary_text, reference_path=reference_file, verbose=False)

    metrics.update({
        "VideoName": video,
        "ALFA": alfa,
        "SIM_Threshold": sim,
        "Generated_Summary": summary_text,
    })
    results.append(metrics)

if not results:
    print(f"No files matching the expected name pattern were found in: {validated_path}")

# --- Save all results to Excel ---
# Create the workbook's folder explicitly; otherwise it only exists as a side
# effect of at least one summary having been written.
os.makedirs(os.path.dirname(excel_out) or ".", exist_ok=True)
df = pd.DataFrame(results)
df.to_excel(excel_out, index=False)
print(f"\nBatch summarization complete. Results saved to:\n{excel_out}")


  0%|          | 0/12 [00:00<?, ?it/s]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.2_ALFA0.0_summary.txt


  8%|▊         | 1/12 [00:04<00:46,  4.21s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.2_ALFA0.2_summary.txt


 17%|█▋        | 2/12 [00:05<00:26,  2.63s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.2_ALFA0.25_summary.txt


 25%|██▌       | 3/12 [00:07<00:19,  2.15s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.2_ALFA0.35_summary.txt


 33%|███▎      | 4/12 [00:08<00:15,  1.93s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.25_ALFA0.0_summary.txt


 42%|████▏     | 5/12 [00:10<00:12,  1.81s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.25_ALFA0.2_summary.txt


 50%|█████     | 6/12 [00:12<00:10,  1.77s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.25_ALFA0.25_summary.txt


 58%|█████▊    | 7/12 [00:13<00:08,  1.72s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.25_ALFA0.35_summary.txt


 67%|██████▋   | 8/12 [00:15<00:06,  1.65s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.35_ALFA0.0_summary.txt


 75%|███████▌  | 9/12 [00:16<00:04,  1.62s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.35_ALFA0.2_summary.txt


 83%|████████▎ | 10/12 [00:18<00:03,  1.61s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.35_ALFA0.25_summary.txt


 92%|█████████▏| 11/12 [00:20<00:01,  1.74s/it]


Summary saved to: /content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya/Qorsaya_SIM0.35_ALFA0.35_summary.txt


100%|██████████| 12/12 [00:22<00:00,  1.84s/it]


Batch summarization complete. Results saved to:
/content/drive/MyDrive/ArabicVideoSummariser/summaries/Batch/R3/Qorsaya.xlsx



