<a href="https://colab.research.google.com/github/Raniamea/arabic-video-summarisation/blob/main/notebooks/05_summarise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup

In [None]:
# Install the packages used by this notebook (quiet mode). Most entries are
# pinned to an exact version or a bounded range; sentence-transformers and
# sacremoses are installed without a version constraint.
!pip install -q \
  "transformers==4.46.3" \
  "tokenizers==0.20.3" \
  "datasets==2.19.1" \
  "evaluate>=0.4.2,<0.5.0"  \
  "rouge-score==0.1.2" \
  "bert-score==0.3.13" \
  "accelerate>=0.30.0,<0.35.0" \
   sentence-transformers \
  "sentencepiece>=0.1.99" "sacremoses"


#  Load Model & Define File Paths

In [None]:
# Attach Google Drive to the Colab runtime; the project paths used in the
# following cells live under this mount point.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
import os, json, re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Root folder of the project on the mounted Google Drive.
base_path = "/content/drive/MyDrive/ArabicVideoSummariser"
# Folder containing the fine-tuned checkpoint (tokenizer + model weights).
MODEL_PATH = os.path.join(base_path, "models/AraBART-finetuned-ar_finetuned_20251018_2017")

# Load the tokenizer and the seq2seq model from the same checkpoint folder.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
# A separator is needed here: plain concatenation printed "Loaded/content/...".
print(f"Loaded {MODEL_PATH}")

In [None]:
# Video to process; every file name below is derived from its base name.
video_filename = "NileTaxi.mp4"
video_name = os.path.splitext(video_filename)[0]  # drop the extension

# Project sub-folders under base_path.
validated_path, transcripts_path, summaries_path = (
    os.path.join(base_path, folder)
    for folder in ("Validated", "transcripts", "summaries")
)

# Arabic transcript of the video.
transcript_file = os.path.join(transcripts_path, f"{video_name}_ar.txt")

# Files located in the "Validated" folder.
validated_file = os.path.join(validated_path, f"{video_name}_result.txt")
validation_CosineScore_results = os.path.join(
    validated_path, f"{video_name}_CosineScore_result.txt"
)
validation_SupportScore_results = os.path.join(
    validated_path, f"{video_name}_SupportScore_result.txt"
)

# Reference summary that eval_ar_summary() reads for scoring.
refrance_file = os.path.join(summaries_path, f"{video_name}_Refrance.txt")

# Destinations for the generated summaries.
wholesummary_file = os.path.join(summaries_path, f"{video_name}_WholeSummary.txt")
sceneCosine_summary_file = os.path.join(
    summaries_path, f"{video_name}_BasedOnCosineSimilarity_Summary.txt"
)
sceneSupport_summary_file = os.path.join(
    summaries_path, f"{video_name}_BasedOnSupportScore_Summary.txt"
)


# Helper Functions

In [None]:
import os, torch
import evaluate

# --- Metric objects shared by eval_ar_summary() ---
rouge, bertscore = (
    evaluate.load(metric_id) for metric_id in ("rouge", "bertscore")
)

# ============================================
# Evaluation: ROUGE & BERTScore
# ============================================
def eval_ar_summary(summary):
    """Score a generated Arabic summary against the reference summary.

    The reference text is read from ``refrance_file``. Both texts go through
    the same light normalisation, are scored with the module-level ``rouge``
    and ``bertscore`` metric objects, and the scores followed by the raw
    ``summary`` are printed.

    Args:
        summary: Generated summary text to evaluate.

    Returns:
        dict: ``{"rouge": <mapping returned by rouge.compute>,
        "bertscore": {"precision": ..., "recall": ..., "f1": ...}}``.
        These are the same values that are printed, returned so that
        several summaries can be compared programmatically.

    Raises:
        OSError: If ``refrance_file`` cannot be opened.
    """
    with open(refrance_file, "r", encoding="utf-8") as f:
        reference_summary = f.read().strip()

    # --- Arabic normalization (light) ---
    def normalize_ar(text: str) -> str:
        """Strip diacritics, unify letter variants and collapse whitespace."""
        text = re.sub(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]", "", text)  # remove diacritics
        text = re.sub(r"[إأآا]", "ا", text)  # unify alef forms
        text = text.replace("ى", "ي").replace("ة", "ه").replace("ؤ", "و").replace("ئ", "ي")
        text = re.sub(r"\s+", " ", text).strip()
        return text

    ref_norm = normalize_ar(reference_summary)
    pred_norm = normalize_ar(summary)

    # --- Compute ROUGE (character-level tokenizer for Arabic) ---
    # NOTE(review): list(x) also yields the single spaces left between words
    # as tokens — confirm that counting them is intended.
    rouge_result = rouge.compute(
        predictions=[pred_norm],
        references=[ref_norm],
        tokenizer=lambda x: list(x)  # char-level tokenization avoids Arabic zero-score issue
    )

    print("=== ROUGE Scores ===")
    for k, v in rouge_result.items():
        print(f"{k:10s}: {v:.4f}")

    # --- Compute BERTScore ---
    bertscore_result = bertscore.compute(
        predictions=[pred_norm],
        references=[ref_norm],
        lang="ar",                     # keep language flag
        model_type="xlm-roberta-base"  # Arabic-friendly model
    )

    # One prediction/reference pair was passed, so take the first entry of
    # each per-pair score list.
    bert_scores = {
        "precision": bertscore_result["precision"][0],
        "recall": bertscore_result["recall"][0],
        "f1": bertscore_result["f1"][0],
    }

    print("\n=== BERTScore ===")
    print(f"Precision : {bert_scores['precision']:.4f}")
    print(f"Recall    : {bert_scores['recall']:.4f}")
    print(f"F1        : {bert_scores['f1']:.4f}")

    print(summary)

    return {"rouge": dict(rouge_result), "bertscore": bert_scores}


# ============================================================
# Helper: Summarize a text file and save output
# ============================================================
def summarize_file(
    input_path: str,
    output_path: str,
    input_max_len: int = 1024,
    max_new_tokens: int = 400,
    min_new_tokens: int = 80,
    num_beams: int = 4,
    no_repeat_ngram_size: int = 3,
    length_penalty: float = 1.0,
) -> None:
    """Summarise the text in ``input_path`` and write the result to ``output_path``.

    The file is read as UTF-8, tokenised with the module-level ``tokenizer``
    (truncated to ``input_max_len`` tokens), and summarised with the
    module-level ``model`` using beam search. The decoded summary is saved,
    the output path is printed, and ``eval_ar_summary`` is called on the
    summary.

    Args:
        input_path: Path of the text file to summarise.
        output_path: Path of the file the summary is written to. Its parent
            directory is created when missing.
        input_max_len: Maximum number of input tokens; longer input is
            truncated by the tokenizer.
        max_new_tokens: Forwarded to ``model.generate``.
        min_new_tokens: Forwarded to ``model.generate``.
        num_beams: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        length_penalty: Forwarded to ``model.generate``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If the input file contains only whitespace.
    """
    # Safety check: raise explicitly rather than `assert`, which is removed
    # when Python runs with -O and would let the failure surface later.
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()

    # Read input
    with open(input_path, "r", encoding="utf-8") as f:
        full_text = f.read().strip()

    if not full_text:
        raise ValueError(f"Input file is empty: {input_path}")

    # Encode (will truncate to input_max_len by request)
    inputs = tokenizer(
        full_text,
        return_tensors="pt",
        truncation=True,
        max_length=input_max_len
    ).to(device)

    # Generate (no gradients needed for inference)
    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            min_new_tokens=min_new_tokens,
            num_beams=num_beams,
            no_repeat_ngram_size=no_repeat_ngram_size,
            length_penalty=length_penalty,
            early_stopping=True
        )

    summary = tokenizer.decode(out_ids[0], skip_special_tokens=True).strip()

    # Save. dirname() is "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(summary)

    print(f"Summary saved to: {output_path}")
    eval_ar_summary(summary)

# Video Summarisation

In [None]:
# ------------------------------------------------------------
# Run 1: summarise the full transcript
# ------------------------------------------------------------
summarize_file(
    transcript_file,
    wholesummary_file,  # where to save the summary
    num_beams=4,
    min_new_tokens=80,
    max_new_tokens=400,
    input_max_len=1024,
)

In [None]:
# ------------------------------------------------------------
# Run 2: summarise the transcript validated by cosine similarity
# ------------------------------------------------------------
summarize_file(
    validation_CosineScore_results,
    sceneCosine_summary_file,
    num_beams=4,
    min_new_tokens=80,
    max_new_tokens=400,
    input_max_len=1024,
)


In [None]:
# ------------------------------------------------------------
# Run 3: summarise the transcript validated by the fused score
# (cosine + lexical)
# ------------------------------------------------------------
summarize_file(
    validation_SupportScore_results,
    sceneSupport_summary_file,
    num_beams=4,
    min_new_tokens=80,
    max_new_tokens=400,
    input_max_len=1024,
)