<a href="https://colab.research.google.com/github/Samortchy/personal-AI-projects/blob/main/Video-To-Text%20Summrization/Video_Summrization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# # Enhanced cleaning for abstractive models (preserves context)
# def clean_text(text):
#     text = re.sub(r'\[.*?\]|\(.*?\)', '', text)  # Remove brackets/parentheses
#     text = re.sub(r'\s+', ' ', text).strip()     # Fix whitespace
#     return text

# df_english['article'] = df_english['article'].apply(clean_text)
# df_english['highlights'] = df_english['highlights'].apply(clean_text)

# df_english.head()

In [None]:
!pip install nltk==3.8.1

# Download tokenizer data
nltk.download('punkt', download_dir='/root/nltk_data')
nltk.data.path.append('/root/nltk_data')

# Download stopwords
# nltk.download('stopwords')

In [None]:
# Force a clean reinstall (ignoring the pip cache) of numpy and camel-tools at their latest versions.
# NOTE(review): versions are not pinned, so results may differ between runs — consider pinning.
!pip install --upgrade --force-reinstall --no-cache-dir numpy camel-tools

In [None]:
# Install camel-tools (imported below for Arabic word tokenization).
# NOTE(review): the previous cell already installs camel-tools — this cell looks redundant.
!pip install camel-tools

In [None]:
# Install Stanza (used below to build the Arabic tokenize/pos/lemma pipeline).
!pip install stanza

In [None]:
# Install sentence-transformers (provides the SentenceTransformer embedding model used in the Models section).
!pip install sentence-transformers

In [None]:
# Force-reinstall the latest sentence-transformers and transformers.
# NOTE(review): the torch cell that follows reinstalls both packages again — this cell looks redundant.
!pip install --upgrade --force-reinstall sentence-transformers transformers

In [None]:
# Replace whatever torch build is installed with the wheels from the PyTorch cu118 index.
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 # Use cu118 for Colab

# Reinstall transformers and sentence-transformers to ensure they link correctly with the new torch installation
!pip install --upgrade --force-reinstall transformers
!pip install --upgrade --force-reinstall sentence-transformers

In [None]:
# Import the necessary libraries
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from nltk.corpus import stopwords, wordnet
# Explicitly import PreTrainedModel from modeling_utils
from transformers.modeling_utils import PreTrainedModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from collections import Counter
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.morphology.analyzer import Analyzer
import stanza
from nltk.stem import WordNetLemmatizer


In [None]:
# Download ALL NLTK data
import nltk

NLTK_DATA_DIR = '/root/nltk_data'

try:
    # nltk.download() reports most failures by returning False instead of
    # raising, so the return value must be checked before claiming success.
    if nltk.download('all', download_dir=NLTK_DATA_DIR):
        print("NLTK 'all' data downloaded successfully.")
    else:
        print("Error downloading NLTK 'all' data: nltk.download returned False")
except Exception as e:
    print(f"Error downloading NLTK 'all' data: {e}")

# Ensure the path is included (only once, so re-running the cell adds no duplicates)
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

In [None]:
# Mount Google Drive at /content/drive; the English dataset archive is read from a Drive path below.
# This import only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# **Data Collection**

In [None]:
# @title arabic dataset

# Location of the dataset repository; each split is a parquet file below it.
ARABIC_DATASET_ROOT = "hf://datasets/BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset/"

splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df_arabic = pd.read_parquet(ARABIC_DATASET_ROOT + splits["train"])
df_arabic_test = pd.read_parquet(ARABIC_DATASET_ROOT + splits["test"])
df_arabic_val = pd.read_parquet(ARABIC_DATASET_ROOT + splits["validation"])

# Merge the three splits into one frame. ignore_index=True renumbers the rows
# so the combined frame has unique labels instead of each split's own index repeated.
# NOTE(review): df_arabic now contains the test and validation rows, so
# df_arabic_test / df_arabic_val are no longer held-out with respect to it.
df_arabic = pd.concat([df_arabic, df_arabic_test, df_arabic_val], axis=0, ignore_index=True)

In [None]:
# @title English dataset
!unzip '/content/drive/Shareddrives/data science/englishDataset.zip'

In [None]:
df_english = pd.read_csv('cnn_dailymail/train.csv', nrows=20000)

# **Data Exploration**

## **Helper functions**

In [None]:
# @title A function to find all the unique chars in the two datasets
def find_unique_chars(df, feature_name):
    """Print the frequency of every special character found in one column.

    A "special" character is anything that is not a word character (``\\w``),
    not whitespace, and not in the Arabic letter range U+0621-U+064A.

    Args:
        df: DataFrame holding the column to inspect.
        feature_name: Name of the column; its values are cast to ``str``.

    Returns:
        None. Up to the 300 most frequent characters are printed, one per
        line, as ``'<char>': <count>``.
    """
    special_char_re = re.compile(r"[^\w\s\u0621-\u064A]")

    # The whole column is scanned as one space-separated string.
    corpus = " ".join(df[feature_name].astype(str).values)
    frequency = Counter(special_char_re.findall(corpus))

    for symbol, occurrences in frequency.most_common(300):
        print(f"'{symbol}': {occurrences}")


In [None]:
# @title A function to detect the number of emojis

def detect_emojis_count(df, feature_name):
    """Count the rows of one column that contain at least one emoji.

    The previous pattern used the single range U+24C2-U+1F251, which also
    spans CJK ideographs, Hangul and Arabic presentation forms, so ordinary
    text in those scripts was counted as "emoji". The ranges below are limited
    to emoji/pictograph blocks.

    Args:
        df: DataFrame holding the column to inspect.
        feature_name: Name of the column; each value is cast to ``str``.

    Returns:
        Number of rows whose text matches the emoji pattern.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F000-\U0001F251"  # tiles/cards, enclosed characters, regional-indicator flags
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "\u2600-\u26FF"          # miscellaneous symbols
        "\u2700-\u27BF"          # dingbats
        "\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55"  # emoji arrows, squares, star, circle
        "\u24C2"                 # circled M
        "]+",
        flags=re.UNICODE,
    )

    return df[feature_name].apply(lambda x: bool(emoji_pattern.search(str(x)))).sum()

In [None]:
 # @title chars that are repeated or spammed i.e(!!!!)

# def garbage_chars(df, feature_name="text"):
#     repeated_garbage_pattern = re.compile(r"([^\w\s\u0621-\u064A])\1{2,}")

#     garbage_mask = df[feature_name].apply(lambda x: bool(repeated_garbage_pattern.search(str(x))))

#     count = garbage_mask.sum()
#     print("Number of rows with repeated garbage characters:", count)

#     for text in df.loc[garbage_mask, feature_name]:
#         matches = repeated_garbage_pattern.findall(str(text))
#         print(f"Text: {text}")
#         print(f"Repeated garbage chars found: {matches}")
#         print("----")

## **Arabic Dataset**

In [None]:
 df_arabic.head()

In [None]:
print(df_arabic.shape)
print(df_arabic_test.shape)
print(df_arabic_val.shape)

In [None]:
# prompt: Drop every column except 'text' and 'summary' in the Arabic dataset

# .copy() makes each result an independent frame, so the in-place column
# assignments done by the cleaning helpers later do not raise
# SettingWithCopyWarning or write into a slice of the original data.
df_arabic = df_arabic[['text', 'summary']].copy()
df_arabic_test = df_arabic_test[['text', 'summary']].copy()
df_arabic_val = df_arabic_val[['text', 'summary']].copy()

In [None]:
df_arabic.describe()

In [None]:
df_arabic.info()

In [None]:
df_arabic.isnull().sum()

In [None]:
emoji_count_text = detect_emojis_count(df_arabic,"text")
emoji_count_summary = detect_emojis_count(df_arabic,"summary")

print(f"Number of rows with emojis in 'text': {emoji_count_text}")
print(f"Number of rows with emojis in 'summary': {emoji_count_summary}")

In [None]:
find_unique_chars(df_arabic,"text")

In [None]:
find_unique_chars(df_arabic,"summary")

## **English Dataset**

In [None]:
 df_english.head()

In [None]:
df_english.shape

In [None]:
df_english.duplicated().sum()


In [None]:
df_english.describe()

In [None]:
df_english.info()

In [None]:
df_english.isnull().sum()

In [None]:
emoji_count_article = detect_emojis_count(df_english,"article")
emoji_count_highlights = detect_emojis_count(df_english,"highlights")

print(f"Number of rows with emojis in 'article': {emoji_count_article}")
print(f"Number of rows with emojis in 'highlights': {emoji_count_highlights}")

In [None]:
find_unique_chars(df_english,"article")

In [None]:
find_unique_chars(df_english,"highlights")

# **Preprocessing**

## **Helper functions**

In [None]:
# @title removes emojis

# Compiled once at definition time; remove_emojis is called once per row.
# The ranges are limited to emoji/pictograph blocks. The former single range
# U+24C2-U+1F251 also covered CJK ideographs, Hangul and Arabic presentation
# forms, so it deleted ordinary text in those scripts.
_EMOJI_REMOVAL_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles/cards, enclosed characters, regional-indicator flags
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55"  # emoji arrows, squares, star, circle
    "\u24C2"                 # circled M
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with every emoji character removed.

    Args:
        text: Input string.

    Returns:
        The string with all matches of the emoji pattern deleted; all other
        characters (including Arabic and CJK text) are left untouched.
    """
    return _EMOJI_REMOVAL_PATTERN.sub('', text)

In [None]:
# @title removes all special chars that occur at most 500 times

def remove_rare_unique_chars(df, feature_name, max_count=500):
    """Delete rarely used special characters from one text column.

    A special character is anything that is not a word character (``\\w``),
    not whitespace and not in the Arabic letter range U+0621-U+064A.
    Characters whose total count across the column is at most ``max_count``
    are removed from every row.

    Args:
        df: DataFrame to clean. The column is updated in place on this object.
        feature_name: Name of the column to clean; values are cast to ``str``.
        max_count: Highest occurrence count that is still considered rare.

    Returns:
        The same DataFrame object, for call chaining.
    """
    column_as_text = df[feature_name].astype(str)
    char_counts = Counter(
        re.findall(r"[^\w\s\u0621-\u064A]", " ".join(column_as_text.values))
    )
    rare_chars = [char for char, count in char_counts.items() if count <= max_count]

    if rare_chars:
        # One translation table removes every rare character in a single pass
        # over the column, instead of one full pass per character.
        deletion_table = str.maketrans('', '', ''.join(rare_chars))
        df[feature_name] = column_as_text.apply(lambda value: value.translate(deletion_table))
    return df


In [None]:
# @title handles foreign names in arabic dataset

# All three cases are handled in one left-to-right pass so that a placeholder
# inserted for a URL is never re-scanned and re-tagged as a name.
_FOREIGN_TOKEN_PATTERN = re.compile(
    r'(?P<placeholder><(?:URL|NAME)>)'      # placeholders already present: keep as-is
    r'|(?P<url>https?://\S+|www\.\S+)'      # URLs
    r'|\b[a-zA-Z][a-zA-Z0-9_]{1,}\b'        # Latin-script tokens of 2+ characters
)


def _foreign_token_replacement(match):
    """Choose the replacement text for one ``_FOREIGN_TOKEN_PATTERN`` match."""
    if match.group('placeholder'):
        return match.group('placeholder')
    if match.group('url'):
        return '<URL>'
    return '<NAME>'


def normalize_foreign_tokens(text):
    """Replace URLs with ``<URL>`` and Latin-script tokens with ``<NAME>``.

    The earlier two-step version first inserted ``<URL>`` and then matched the
    letters ``URL`` inside it as a Latin token, producing ``<<NAME>>``.

    Args:
        text: Input string.

    Returns:
        The string with URLs and Latin tokens (two or more characters,
        starting with a letter) replaced; existing ``<URL>``/``<NAME>``
        placeholders are preserved, so the function is idempotent.
    """
    return _FOREIGN_TOKEN_PATTERN.sub(_foreign_token_replacement, text)


In [None]:
# @title normalization in  arabic dataset
def normalize_arabic(text):
    """Apply orthographic normalization to Arabic text.

    Steps, in order: unify alef variants to bare alef, alef maqsura to yeh,
    hamza-on-waw / hamza-on-yeh to bare hamza, strip short-vowel diacritics
    and tatweel, map teh marbuta to heh, then collapse whitespace.

    The teh-marbuta step previously ran in the opposite direction (every heh
    became teh marbuta), which corrupted any word containing heh, such as
    "هذا".

    Args:
        text: Input string.

    Returns:
        The normalized string with leading/trailing whitespace removed.
    """
    text = re.sub(r"[إأآا]", "ا", text)
    text = re.sub(r"ى", "ي", text)
    text = re.sub(r"[ؤئ]", "ء", text)
    # U+064B-U+0652: tanween (fath/damm/kasr), fatha, damma, kasra, shadda, sukun
    text = re.sub(r"[\u064B-\u0652]", "", text)
    # U+0640: tatweel (kashida) elongation character
    text = re.sub(r"\u0640", "", text)
    # Teh marbuta (U+0629) -> heh (U+0647)
    text = re.sub(r"\u0629", "\u0647", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
# @title arabic dataset lemmatizer

def download_arabic_model():
    """Fetch Stanza's Arabic (``'ar'``) resources via ``stanza.download``."""
    language_code = 'ar'
    stanza.download(language_code)

# Initialize pipeline once and reuse it
# Runs the tokenize, pos and lemma processors on CPU (use_gpu=False).
# NOTE(review): this presumably needs the Arabic resources to be available (see download_arabic_model) — confirm on a fresh runtime.
nlp_arabic = stanza.Pipeline('ar', processors='tokenize,pos,lemma', use_gpu=False)

def lemmatize_arabic_text(text: str) -> str:
    """Lemmatize Arabic text with the shared ``nlp_arabic`` Stanza pipeline.

    Args:
        text: Raw Arabic text.

    Returns:
        The lemmas of all words, joined by single spaces. Empty or
        whitespace-only input returns ``''`` without running the pipeline.
    """
    if not text or not text.strip():
        return ''

    doc = nlp_arabic(text)
    # Fall back to the surface form when a word carries no lemma, so the join
    # below never receives None.
    lemmas = [
        word.lemma if word.lemma is not None else word.text
        for sentence in doc.sentences
        for word in sentence.words
    ]
    return ' '.join(lemmas)

# Example usage:
#download_arabic_model()  # Run once, comment after #the model is downloaded don't re-download except if it is missing
# print(lemmatize_arabic_text("الطلاب يذهبون إلى المدرسة"))


In [None]:
def nltk_pos_tagger(nltk_tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; any
    other tag falls back to ``wordnet.NOUN``.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

In [None]:
 # @title arabic dataset stemming
# # @title arabic dataset stemmer

# import stanza
# from camel_tools.tokenizers.word import simple_word_tokenize

# def download_arabic_model():
#     stanza.download('ar')

# # Initialize pipeline once and reuse it
# nlp_arabic = stanza.Pipeline('ar', processors='tokenize,pos,lemma', use_gpu=False)

# def stem_arabic_text(text: str) -> str:
#     """
#     Stems Arabic text using Stanza.
#     """
#     doc = nlp_arabic(text)
#     stems = [word.lemma for sentence in doc.sentences for word in sentence.words]
#     return ' '.join(stems)

# # Example usage:
# #download_arabic_model() # Run this line once to download the model
# print(stem_arabic_text("الطلاب يذهبون إلى المدرسة"))


In [None]:
 # @title english dataset lemmatizer

def lemmatize_english_text(tokens):
    """Lemmatize English tokens using POS-aware WordNet lemmatization.

    Args:
        tokens: List of word tokens.

    Returns:
        List of lemmas, one per input token, in the same order. An empty
        input returns an empty list.
    """
    if not tokens:
        return []

    # pos_tag is not part of the notebook's import cell, so it is imported
    # here; without it this function raised NameError on every call.
    from nltk import pos_tag

    pos_tags = pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, nltk_pos_tagger(pos)) for word, pos in pos_tags]

In [None]:
 # @title english dataset stemming

# # @title english dataset stemmer

# from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

# def stem_english_text(tokens, stemmer_type='porter'):
#     """
#     Stems English text using different NLTK stemmers.

#     Args:
#         tokens: List of tokens (words).
#         stemmer_type: Type of stemmer ('porter', 'lancaster', 'snowball').
#     Returns:
#         List of stemmed tokens.
#     """
#     if stemmer_type == 'porter':
#         stemmer = PorterStemmer()
#     elif stemmer_type == 'lancaster':
#         stemmer = LancasterStemmer()
#     elif stemmer_type == 'snowball':
#         stemmer = SnowballStemmer("english")
#     else:
#         raise ValueError("Invalid stemmer_type. Choose from 'porter', 'lancaster', 'snowball'.")

#     return [stemmer.stem(token) for token in tokens]

# # Example usage:
# # text = "running flies quickly ran"
# # tokens = text.split()
# # stemmed_tokens = stem_english_text(tokens, stemmer_type='porter')
# # print(stemmed_tokens)

In [None]:
# @title custom stop word removal
# Custom stopwords (less aggressive for abstractive)
# Start from NLTK's English stopword list but keep the negation/restriction
# words, then add a few corpus-specific terms.
custom_stopwords = set(stopwords.words('english')) - {'not', 'no', 'nor', 'only'}
# NOTE(review): 'daily mail' contains a space; remove_stopwords compares whole
# tokens against this set, so this entry presumably never matches a single
# word token — confirm whether 'daily' and 'mail' were intended instead.
custom_stopwords.update(['cnn', 'daily mail', 'published', 'said'])

def remove_stopwords(tokens):
    """Return the tokens that are not members of ``custom_stopwords``, in order."""
    kept = []
    for candidate in tokens:
        if candidate in custom_stopwords:
            continue
        kept.append(candidate)
    return kept

## **Pipelines**

In [None]:
# @title Preprocessing Pipeline for Traditional ML Algorithms and RNN Models

def clean_for_classic(df, feature_name, is_arabic):
    """Clean and tokenize one text column for classic ML / RNN models.

    Per row: rare special characters and emojis are removed; Arabic text has
    foreign tokens replaced, is normalized and word-tokenized; English text is
    lower-cased, tokenized and lemmatized. Stopwords are then filtered out.

    The input frame is copied first, so the caller's DataFrame keeps its raw
    text; use the returned frame.

    Args:
        df (pd.DataFrame): Input DataFrame.
        feature_name (str): Column to clean.
        is_arabic (bool): Whether the column holds Arabic text.

    Returns:
        pd.DataFrame: A new DataFrame in which ``feature_name`` holds one
        list of tokens per row.
    """
    # Copy before cleaning: remove_rare_unique_chars and the assignment below
    # both write into the frame they are given.
    df = remove_rare_unique_chars(df.copy(), feature_name)
    cleaned_tokens = []

    for text in df[feature_name].astype(str):
        text = remove_emojis(text)
        if is_arabic:
            text = normalize_foreign_tokens(text)
            text = normalize_arabic(text)
            tokens = simple_word_tokenize(text)
        else:
            tokens = word_tokenize(text.lower())
            tokens = lemmatize_english_text(tokens)

        # NOTE(review): remove_stopwords filters against custom_stopwords,
        # which is built from the English list, so it is presumably close to a
        # no-op for Arabic tokens — confirm whether Arabic stopwords were intended.
        tokens = remove_stopwords(tokens)
        cleaned_tokens.append(tokens)

    df[feature_name] = cleaned_tokens
    return df

# Keep the raw Arabic frame intact: the embedding-based models further down
# sentence-tokenize df_arabic['text'] as plain strings, which fails if the
# column has been replaced by token lists. The tokenized version gets its own name.
df_arabic_classic = clean_for_classic(df_arabic.copy(), "text", True)
df_arabic_classic = clean_for_classic(df_arabic_classic, "summary", True)

In [None]:
# @title Preprocessing Pipeline for Deep Learning Algorithms and Transformer Models

def clean_for_transformers_deep(df, feature_name, is_arabic):
    """
    Light-weight text cleaning of one column for deep learning / transformer models.

    Every value is cast to ``str`` and stripped of emojis. Arabic text then has
    foreign tokens replaced and is normalized; non-Arabic text is lower-cased.

    Args:
        df (pd.DataFrame): Input DataFrame; the column is overwritten on this object.
        feature_name (str): Column name to clean.
        is_arabic (bool): Whether the text is in Arabic.

    Returns:
        pd.DataFrame: The same DataFrame, with the cleaned column under the same name.
    """
    def _clean_one(raw_text):
        without_emojis = remove_emojis(raw_text)
        if not is_arabic:
            return without_emojis.lower()
        return normalize_arabic(normalize_foreign_tokens(without_emojis))

    df[feature_name] = [_clean_one(value) for value in df[feature_name].astype(str)]
    return df


In [None]:
def prepare_for_deep_model(df, text_col, summary_col, max_len_input=100, max_len_summary=30, vocab_size=5000):
    """Turn text/summary columns into padded integer sequences.

    A single Keras ``Tokenizer`` (limited to ``vocab_size`` words, with
    ``"<OOV>"`` as the out-of-vocabulary token) is fitted on the texts and
    summaries together. Both columns are then encoded and post-padded /
    post-truncated to their respective maximum lengths.

    Args:
        df: DataFrame holding both columns.
        text_col: Name of the input text column.
        summary_col: Name of the summary column.
        max_len_input: Sequence length for the encoded texts.
        max_len_summary: Sequence length for the encoded summaries.
        vocab_size: ``num_words`` passed to the tokenizer.

    Returns:
        Tuple ``(X, y, tokenizer)``: padded text sequences, padded summary
        sequences and the fitted tokenizer.
    """
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.preprocessing.text import Tokenizer

    source_texts = df[text_col].astype(str).values
    target_texts = df[summary_col].astype(str).values

    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(np.concatenate((source_texts, target_texts)))

    def _encode(documents, max_len):
        # Integer-encode, then pad/truncate at the end of each sequence.
        sequences = tokenizer.texts_to_sequences(documents)
        return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    return _encode(source_texts, max_len_input), _encode(target_texts, max_len_summary), tokenizer

# **Models**

In [None]:
# Shared sentence encoder used by the evaluation functions below to embed
# article sentences and reference summaries.
sentence_embedding_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

# Make sure your dataframes are properly loaded
print(f"English dataset shape: {df_english.shape}")
print(f"Arabic dataset shape: {df_arabic.shape}")

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
def _clean_text_for_embedding(text, is_arabic):
    """Clean one string with the same steps clean_for_transformers_deep applies per row."""
    text = remove_emojis(str(text))
    if is_arabic:
        return normalize_arabic(normalize_foreign_tokens(text))
    return text.lower()


def evaluate_language_dataset(df, is_arabic=False, language_name="English",
                              max_articles=500, similarity_threshold=0.5):
    """Train and report sentence-level summary classifiers for one language.

    Each article is split into sentences; a sentence is labelled 1 when the
    cosine similarity between its embedding and the embedding of the
    reference summary is at least ``similarity_threshold``, else 0. Several
    classifiers are fitted on the sentence embeddings and a classification
    report is printed for the train and the test split.

    Args:
        df (pd.DataFrame): Dataset with 'text'/'summary' columns (Arabic) or
            'article'/'highlights' columns (English). It is not modified.
        is_arabic (bool): Selects the column names and the cleaning steps.
        language_name (str): Label used in the printed report header.
        max_articles (int): Number of rows (after dropping NaNs) to use.
        similarity_threshold (float): Minimum cosine similarity for label 1.

    Returns:
        dict: ``{model_name: {'model': fitted_estimator, 'use_pca': bool}}``.

    Raises:
        ValueError: If no sentence could be extracted from the selected rows.
    """
    # Prepare dataset
    text_col = 'text' if is_arabic else 'article'
    summary_col = 'summary' if is_arabic else 'highlights'

    # .copy() so the helper column below is added to an independent frame
    # rather than to a slice of the caller's data.
    df = df.dropna().head(max_articles).copy()
    df['sentences'] = df[text_col].astype(str).apply(sent_tokenize)
    df[summary_col] = df[summary_col].astype(str)

    data = []
    for _, row in df.iterrows():
        sentences = [_clean_text_for_embedding(s, is_arabic) for s in row['sentences']]
        if not sentences:
            # Nothing to embed or compare for an article without sentences.
            continue
        highlight = _clean_text_for_embedding(row[summary_col], is_arabic)

        sent_embeddings = sentence_embedding_model.encode(sentences)
        hl_embedding = sentence_embedding_model.encode([highlight])[0]

        similarities = cosine_similarity(sent_embeddings, [hl_embedding]).flatten()
        labels = (similarities >= similarity_threshold).astype(int)

        for sent, embedding, label in zip(sentences, sent_embeddings, labels):
            data.append({'sentence': sent, 'embedding': embedding, 'label': label})

    if not data:
        raise ValueError(f"No sentences found in column '{text_col}' to build a training set.")

    # Prepare train/test
    df_sentences = pd.DataFrame(data)
    X = np.stack(df_sentences['embedding'].values)
    y = df_sentences['label'].values

    # Stratified splitting needs at least two samples of every class; fall
    # back to a plain split otherwise instead of failing inside train_test_split.
    class_counts = np.bincount(y, minlength=2)
    stratify_labels = y if class_counts.min() >= 2 else None

    # Split row indices once so every feature view shares the same partition.
    train_idx, test_idx = train_test_split(
        np.arange(len(y)), test_size=0.2, random_state=42, stratify=stratify_labels
    )
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Define models with separate processing paths
    models = {
        'Logistic Regression': {
            'model': LogisticRegression(
                penalty='l2', C=0.1, max_iter=1000,
                class_weight='balanced', solver='liblinear'
            ),
            'use_pca': False
        },
        'Random Forest': {
            'model': RandomForestClassifier(
                n_estimators=100, max_depth=10,
                min_samples_leaf=5, max_features='sqrt',
                class_weight='balanced', random_state=42,
                n_jobs=-1  # Enable parallel processing
            ),
            'use_pca': False
        },
        'SVM': {
            'model': SVC(
                kernel='linear', C=0.1,
                class_weight='balanced', probability=True
            ),
            'use_pca': False
        },
        'XGBoost': {
            'model': XGBClassifier(
                eval_metric='logloss',
                max_depth=4, reg_alpha=0.1, reg_lambda=1,
                subsample=0.8, colsample_bytree=0.8,
                learning_rate=0.05, n_estimators=100,
                random_state=42
            ),
            'use_pca': False
        },
        'MLP': {
            'model': MLPClassifier(
                hidden_layer_sizes=(128, 64), alpha=0.001,
                max_iter=300, early_stopping=True,
                learning_rate_init=0.001, random_state=42
            ),
            'use_pca': False
        }
    }

    # Dimensionality reduction is only computed when a model asks for it, and
    # is fitted on the training rows alone so test data does not leak into it.
    X_train_reduced = X_test_reduced = None
    if any(config['use_pca'] for config in models.values()):
        n_components = min(50, X_train.shape[0], X_train.shape[1])
        pca = PCA(n_components=n_components)
        X_train_reduced = pca.fit_transform(X_train)
        X_test_reduced = pca.transform(X_test)

    print(f"\n{'='*60}")
    print(f"🌍 {language_name.upper()} DATASET EVALUATION")
    print(f"{'='*60}")

    for name, config in models.items():
        model = config['model']
        use_pca = config['use_pca']

        print(f"\n🔹 Model: {name}")

        # Select appropriate data version
        if use_pca:
            train_X = X_train_reduced
            test_X = X_test_reduced
            print("(Using PCA-reduced features)")
        else:
            train_X = X_train
            test_X = X_test
            print("(Using full-dimensional embeddings)")

        model.fit(train_X, y_train)

        print("\nTRAIN SET PERFORMANCE:")
        train_pred = model.predict(train_X)
        print(classification_report(y_train, train_pred))

        print("\nTEST SET PERFORMANCE:")
        test_pred = model.predict(test_X)
        print(classification_report(y_test, test_pred))

    print("\n" + "="*60 + "\n")
    return models  # Return trained models for further use

In [None]:
def evaluate_bilingual_models(df_english, df_arabic):
    """Run evaluate_language_dataset for English, then Arabic.

    Returns:
        Tuple ``(models_english, models_arabic)`` holding the value returned
        by evaluate_language_dataset for each language.
    """
    runs = (
        ("English", df_english, False),
        ("Arabic", df_arabic, True),
    )
    trained = {}
    for language_name, frame, arabic_flag in runs:
        print(f"\n🔸 Evaluating {language_name} Models...")
        trained[language_name] = evaluate_language_dataset(
            frame, is_arabic=arabic_flag, language_name=language_name
        )
    return trained["English"], trained["Arabic"]


In [None]:
 # @title Machine Learning Models

# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.neural_network import MLPClassifier
# from xgboost import XGBClassifier
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# import nltk


# # -------------------------------
# # 🔹 Step 1: Load SentenceTransformer (Multilingual for Arabic/English)
# # -------------------------------
# sentence_embedding_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

# # -------------------------------
# # 🔹 Step 2: Prepare Sentence-Level Dataset
# # -------------------------------
# df_english = df_english.dropna().head(500)  # sample for speed

# df_english['sentences'] = df_english['article'].astype(str).apply(sent_tokenize)
# df_english['highlights'] = df_english['highlights'].astype(str)

# data = []

# for _, row in df_english.iterrows():
#     sentences = row['sentences']
#     highlight = row['highlights']

#     sentence_embeddings = model.encode(sentences)
#     highlight_embedding = model.encode([highlight])[0]

#     similarities = cosine_similarity(sentence_embeddings, [highlight_embedding]).flatten()
#     labels = (similarities >= 0.5).astype(int)

#     for i, sent in enumerate(sentences):
#         data.append({'sentence': sent, 'embedding': sentence_embeddings[i], 'label': labels[i]})

# df_sentences = pd.DataFrame(data)
# X = np.stack(df_sentences['embedding'].values)
# y = df_sentences['label'].values

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # -------------------------------
# # 🔹 Step 3: Define Models
# # -------------------------------
# models = {
#     'Logistic Regression': LogisticRegression(
#         penalty='l2',
#         C=0.1,
#         max_iter=1000,
#         class_weight='balanced',
#         solver='liblinear'
#     ),

#     'Random Forest': RandomForestClassifier(
#         n_estimators=100,
#         max_depth=10,
#         min_samples_leaf=5,
#         max_features='sqrt',
#         class_weight='balanced',
#         random_state=42
#     ),

#     'SVM': SVC(
#         kernel='linear',
#         C=0.1,
#         class_weight='balanced',
#         probability=True  # optional if you need predict_proba
#     ),

#     'XGBoost': XGBClassifier(
#         use_label_encoder=False,
#         eval_metric='logloss',
#         max_depth=4,
#         reg_alpha=0.1,
#         reg_lambda=1,
#         subsample=0.8,
#         colsample_bytree=0.8,
#         learning_rate=0.05,
#         n_estimators=100,
#         random_state=42
#     ),

#     'MLP': MLPClassifier(
#         hidden_layer_sizes=(128, 64),
#         alpha=0.001,
#         max_iter=300,
#         early_stopping=True,
#         learning_rate_init=0.001,
#         random_state=42
#     )
# }

# # -------------------------------
# # 🔹 Step 4: Train & Evaluate
# # -------------------------------
# for name, model in models.items():
#     print(f"\n🔹 Model: {name}")
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print(classification_report(y_test, y_pred))


In [None]:
# for name, model in models.items():
#     print(f"\n🔹 Model: {name}")
#     model.fit(X_train, y_train)

#     # Predict on train set
#     y_train_pred = model.predict(X_train)
#     print("Train Classification Report:")
#     print(classification_report(y_train, y_train_pred))

#     # Predict on test set
#     y_test_pred = model.predict(X_test)
#     print("Test Classification Report:")
#     print(classification_report(y_test, y_test_pred))

# **Evaluation Metrics**

### ROUGE: Overlap of words/phrases between your generated summary and a human-written reference summary.

In [None]:
pip install numpy nltk rouge-score

### **BLEU**: N-gram precision of your generated summary against human reference summaries — a rough proxy for how closely its wording matches what a human wrote.


In [None]:
import random
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_rouge(generated_summary, reference_summary, n=1):
    """Score a generated summary against a reference with one ROUGE variant.

    Args:
        generated_summary: Candidate summary text.
        reference_summary: Reference summary text.
        n: Suffix of the ROUGE metric name; the metric used is ``f'rouge{n}'``.

    Returns:
        The scorer's result entry for that metric.
    """
    metric = f'rouge{n}'
    scorer = rouge_scorer.RougeScorer([metric], use_stemmer=True)
    # The scorer takes the reference first and the candidate second.
    return scorer.score(reference_summary, generated_summary)[metric]

def calculate_bleu(generated_summary, reference_summary):
    """Sentence-level BLEU of a generated summary against a single reference.

    Both texts are split on whitespace; smoothing method 4 of NLTK's
    ``SmoothingFunction`` is passed to ``sentence_bleu``.
    """
    reference_tokens = reference_summary.split()
    candidate_tokens = generated_summary.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

In [None]:
def evaluate_summaries_with_rouge_bleu(df_english, models_english, n_samples=10,
                                       model_name='MLP', seed=None):
    """Score extractive summaries from a trained classifier with ROUGE and BLEU.

    Random articles are drawn from ``df_english``. For each, the sentences the
    classifier labels 1 are joined into a summary, which is compared with the
    reference in the 'highlights' column. Per-article details and the averages
    over the successfully scored articles are printed.

    Args:
        df_english (pd.DataFrame): Frame with 'article' and 'highlights' columns.
        models_english (dict): Maps model names either to fitted estimators or
            to config dicts with the estimator under ``'model'`` (the format
            returned by evaluate_language_dataset).
        n_samples (int): Number of articles to draw (capped at the frame length).
        model_name (str): Key of the classifier to use.
        seed (int | None): Optional seed for a reproducible article sample.

    Returns:
        None.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    cumulative_scores = {
        'rouge1': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
        'rouge2': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
        'rougeL': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
    }
    cumulative_bleu = 0.0
    n_scored = 0  # articles that actually contributed to the sums

    # evaluate_language_dataset wraps each estimator in a config dict; calling
    # .predict on that dict raised AttributeError for every article.
    model_entry = models_english[model_name]
    classifier = model_entry['model'] if isinstance(model_entry, dict) else model_entry

    # random.sample raises when asked for more items than exist, so cap the size.
    rng = random.Random(seed) if seed is not None else random
    random_indices = rng.sample(range(len(df_english)), min(n_samples, len(df_english)))

    for idx, article_idx in enumerate(random_indices, start=1):
        try:
            article_row = df_english.iloc[article_idx]
            article_text = str(article_row['article'])
            reference_summary = str(article_row['highlights'])

            article_sentences = sent_tokenize(article_text)
            if not article_sentences:
                print(f"Skipping article {idx}: no sentences found")
                continue
            sentence_embeddings = sentence_embedding_model.encode(article_sentences)

            predicted_labels = classifier.predict(sentence_embeddings)

            generated_summary_sentences = [
                article_sentences[i] for i, label in enumerate(predicted_labels) if label == 1
            ]
            generated_summary = " ".join(generated_summary_sentences)

            # Compute every metric before accumulating, so a failure part-way
            # through never leaves a partially counted article in the sums.
            scores = scorer.score(reference_summary, generated_summary)
            bleu_score = calculate_bleu(generated_summary, reference_summary)

            for key in cumulative_scores:
                cumulative_scores[key]['precision'] += scores[key].precision
                cumulative_scores[key]['recall'] += scores[key].recall
                cumulative_scores[key]['f1'] += scores[key].fmeasure
            cumulative_bleu += bleu_score
            n_scored += 1

            # Print output
            print(f"\n--- Article {idx} ---")
            print("\nOriginal Article:")
            print(article_text[:500] + '...' if len(article_text) > 500 else article_text)

            print("\nReference Summary:")
            print(reference_summary)

            print("\nGenerated Summary:")
            print(generated_summary if generated_summary else "[No summary sentences predicted]")

            print("\nROUGE Scores:")
            for key, val in scores.items():
                print(f"{key.upper()} - Precision: {val.precision:.4f}, Recall: {val.recall:.4f}, F1: {val.fmeasure:.4f}")

            print(f"BLEU Score: {bleu_score:.4f}")
            print("\n" + "="*60)

        except Exception as e:
            print(f"Error processing article {idx}: {e}")
            continue

    if n_scored == 0:
        print("\nNo articles could be scored; averages are unavailable.")
        return

    # Average scores over the articles that were actually scored
    print(f"\n--- AVERAGE SCORES OVER {n_scored} ARTICLES ---")
    for key in cumulative_scores:
        avg_precision = cumulative_scores[key]['precision'] / n_scored
        avg_recall = cumulative_scores[key]['recall'] / n_scored
        avg_f1 = cumulative_scores[key]['f1'] / n_scored
        print(f"{key.upper()} - Avg Precision: {avg_precision:.4f}, Avg Recall: {avg_recall:.4f}, Avg F1: {avg_f1:.4f}")

    print(f"Average BLEU Score: {cumulative_bleu / n_scored:.4f}")

In [None]:
# Evaluate both languages
# Train the per-language sentence classifiers first; the summary evaluation
# below needs the fitted models.
models_english, models_arabic = evaluate_bilingual_models(df_english, df_arabic)

evaluate_summaries_with_rouge_bleu(df_english, models_english)

# The scorer reads the 'article' and 'highlights' columns, so expose the Arabic
# columns under those names (rename returns a new frame; df_arabic is unchanged).
# NOTE(review): the ROUGE scorer is built with use_stemmer=True and its default
# tokenizer; confirm that it produces meaningful scores for Arabic text.
evaluate_summaries_with_rouge_bleu(
    df_arabic.rename(columns={'text': 'article', 'summary': 'highlights'}),
    models_arabic,
)

### *METEOR* (Metric for Evaluation of Translation with Explicit ORdering): Like BLEU but more flexible: accounts for synonyms, stemming, and word order.

In [None]:
import nltk
nltk.download('wordnet')  # Downloads WordNet corpus
nltk.download('omw-1.4')  # Optional: Open Multilingual WordNet (for non-English)

In [None]:
from nltk.translate.meteor_score import meteor_score

# Minimal METEOR example: meteor_score is given a list of tokenized
# references and one tokenized candidate.
reference = "the cat sat on the mat".split()
candidate = "a cat is sitting on a mat".split()
score = meteor_score([reference], candidate)
print(f"METEOR Score: {score:.4f}")

### Test Evaluation Metrics

In [None]:
# Initialize the ROUGE scorer
import random
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Storage for cumulative scores
cumulative_scores = {
    'rouge1': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
    'rouge2': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
    'rougeL': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
}

# No notebook-level `models` variable exists (it is local to
# evaluate_language_dataset), so reuse the trained English models when an
# earlier cell produced them and train them here otherwise.
if 'models_english' not in globals():
    models_english = evaluate_language_dataset(df_english, is_arabic=False, language_name="English")
# Each entry is a config dict; the fitted estimator sits under 'model'.
mlp_english = models_english['MLP']['model']

# Pick up to 10 random articles (random.sample raises if asked for more rows than exist)
random_indices = random.sample(range(len(df_english)), min(10, len(df_english)))
n_scored = 0  # articles that actually contributed to the sums

for idx, article_idx in enumerate(random_indices, start=1):
    article_row = df_english.iloc[article_idx]
    article_text = str(article_row['article'])
    reference_highlights = str(article_row['highlights'])

    # Tokenize article into sentences
    article_sentences = sent_tokenize(article_text)

    try:
        if not article_sentences:
            print(f"Skipping article {idx}: no sentences found")
            continue

        # Generate sentence embeddings
        sentence_embeddings = sentence_embedding_model.encode(article_sentences)

        # Predict labels (0: Not Summary, 1: Summary)
        predicted_labels = mlp_english.predict(sentence_embeddings)

        # Extract predicted summary sentences
        generated_summary_sentences = [
            article_sentences[i] for i, label in enumerate(predicted_labels) if label == 1
        ]
        generated_summary = " ".join(generated_summary_sentences)

        # Compute ROUGE scores
        scores = scorer.score(reference_highlights, generated_summary)

        # Accumulate scores for average computation
        for rouge_key in cumulative_scores:
            cumulative_scores[rouge_key]['precision'] += scores[rouge_key].precision
            cumulative_scores[rouge_key]['recall'] += scores[rouge_key].recall
            cumulative_scores[rouge_key]['f1'] += scores[rouge_key].fmeasure
        n_scored += 1

        # Output for this article
        print(f"\n--- Article {idx} ---")
        print("\nOriginal Article:")
        print(article_text[:500] + '...' if len(article_text) > 500 else article_text)

        print("\nReference Highlights (Human-Written):")
        print(reference_highlights)

        print("\nGenerated Summary (Model Prediction):")
        print(generated_summary if generated_summary else "[No summary sentences predicted]")

        print("\nROUGE Scores:")
        for key, val in scores.items():
            print(f"{key.upper()} - Precision: {val.precision:.4f}, Recall: {val.recall:.4f}, F1: {val.fmeasure:.4f}")

        print("\n" + "="*60)

    except Exception as e:
        print(f"Error processing article {idx}: {e}")
        continue

# --- Average Scores Across All Articles ---
# Averages are taken over the articles that were scored, not a fixed 10, so
# skipped or failed articles do not drag the averages toward zero.
if n_scored:
    print(f"\n--- AVERAGE ROUGE SCORES OVER {n_scored} ARTICLES ---")
    for key in cumulative_scores:
        avg_precision = cumulative_scores[key]['precision'] / n_scored
        avg_recall = cumulative_scores[key]['recall'] / n_scored
        avg_f1 = cumulative_scores[key]['f1'] / n_scored
        print(f"{key.upper()} - Avg Precision: {avg_precision:.4f}, Avg Recall: {avg_recall:.4f}, Avg F1: {avg_f1:.4f}")
else:
    print("\nNo articles could be scored; averages are unavailable.")


In [None]:
# Evaluate the MLP extractive summarizer on a random sample of Arabic articles with ROUGE.
from rouge_score import rouge_scorer
import random
import re

# Maximum number of articles to sample for this evaluation.
NUM_ARABIC_EVAL_ARTICLES = 10


class ArabicWordTokenizer:
    """Unicode-aware tokenizer for ROUGE scoring of Arabic text.

    rouge_score's default tokenizer keeps only the characters [a-z0-9], so
    Arabic words are thrown away before any n-gram overlap is computed and the
    resulting scores are meaningless. This tokenizer strips tatweel and
    short-vowel diacritics, lowercases any Latin text, and returns every run of
    Unicode word characters as one token.
    """

    # Tatweel (U+0640), harakat/tanween (U+064B-U+0652) and superscript alef (U+0670).
    _MARKS = re.compile(r'[\u0640\u064B-\u0652\u0670]')
    _WORD = re.compile(r'\w+')

    def tokenize(self, text):
        """Return the list of word tokens found in ``text``."""
        return self._WORD.findall(self._MARKS.sub('', text.lower()))


# Passing a tokenizer replaces the default one, so the English-only Porter
# stemmer (use_stemmer) is intentionally not requested here.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'], tokenizer=ArabicWordTokenizer()
)

# Running sums of each metric; divided by the number of scored articles below.
cumulative_scores = {
    'rouge1': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
    'rouge2': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
    'rougeL': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
}

# Only articles that were actually scored contribute to the averages.
scored_articles = 0

# Cap the sample size at the dataset length so random.sample cannot raise
# ValueError on a small dataset. Sampling is not seeded in this cell, so the
# selection differs between runs unless `random` was seeded earlier.
sample_size = min(NUM_ARABIC_EVAL_ARTICLES, len(df_arabic))
random_indices = random.sample(range(len(df_arabic)), sample_size)

for idx, article_idx in enumerate(random_indices, start=1):
    article_row = df_arabic.iloc[article_idx]
    article_text = str(article_row['article'])
    reference_highlights = str(article_row['highlights'])

    # Best-effort evaluation: a failure on one article is reported and skipped.
    try:
        # Tokenize Arabic article into sentences (adjust tokenizer if needed)
        article_sentences = sent_tokenize(article_text)  # Replace if needed with arabic-specific tokenizer

        # Nothing to embed or classify; skip explicitly instead of relying on
        # the classifier to raise on an empty input.
        if not article_sentences:
            print(f"Skipping article {idx}: no sentences found")
            continue

        # Generate sentence embeddings
        sentence_embeddings = sentence_embedding_model.encode(article_sentences)

        # Predict labels (0: Not Summary, 1: Summary)
        predicted_labels = models['MLP'].predict(sentence_embeddings)

        # Extract predicted summary sentences
        generated_summary_sentences = [
            article_sentences[i] for i, label in enumerate(predicted_labels) if label == 1
        ]
        generated_summary = " ".join(generated_summary_sentences)

        # Compute ROUGE scores (target first, prediction second)
        scores = scorer.score(reference_highlights, generated_summary)

        # Accumulate scores
        for rouge_key in cumulative_scores:
            cumulative_scores[rouge_key]['precision'] += scores[rouge_key].precision
            cumulative_scores[rouge_key]['recall'] += scores[rouge_key].recall
            cumulative_scores[rouge_key]['f1'] += scores[rouge_key].fmeasure
        scored_articles += 1

        # Output for this article
        print(f"\n--- Article {idx} ---")
        print("\nOriginal Arabic Article:")
        print(article_text[:500] + '...' if len(article_text) > 500 else article_text)

        print("\nReference Highlights (Human-Written):")
        print(reference_highlights)

        print("\nGenerated Summary (Model Prediction):")
        print(generated_summary if generated_summary else "[No summary sentences predicted]")

        print("\nROUGE Scores:")
        for key, val in scores.items():
            print(f"{key.upper()} - Precision: {val.precision:.4f}, Recall: {val.recall:.4f}, F1: {val.fmeasure:.4f}")

        print("\n" + "="*60)

    except Exception as e:
        print(f"Error processing article {idx}: {e}")
        continue

# --- Average Scores Across All Arabic Articles ---
# Divide by the number of articles that were scored, not by the sample size,
# so skipped or failed articles do not drag the averages toward zero.
print(f"\n--- AVERAGE ROUGE SCORES OVER {scored_articles} ARABIC ARTICLES ---")
if scored_articles == 0:
    print("No articles were scored successfully; averages are undefined.")
else:
    for key in cumulative_scores:
        avg_precision = cumulative_scores[key]['precision'] / scored_articles
        avg_recall = cumulative_scores[key]['recall'] / scored_articles
        avg_f1 = cumulative_scores[key]['f1'] / scored_articles
        print(f"{key.upper()} - Avg Precision: {avg_precision:.4f}, Avg Recall: {avg_recall:.4f}, Avg F1: {avg_f1:.4f}")


In [None]:
# Demo: extractive summary of a hand-written Arabic article.
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Short sample text to summarize.
arabic_article = """
ذهبت اليوم إلى مركز المجتمع المحلي للمساعدة في حملة تنظيف نهاية الأسبوع. تم تنظيم الحملة من قبل مجموعة من طلاب المرحلة الثانوية الذين أرادوا جعل الحديقة أنظف وأكثر أمانًا للأطفال.
بدأنا في وقت مبكر من الصباح حوالي الساعة 8 صباحًا، وقد أحضر الجميع أدواتهم الخاصة مثل المكانس والقفازات وأكياس القمامة. كان الفريق متحمسًا، وكان هناك موسيقى تُعزف في الخلفية أثناء عملنا.
بحلول الظهر، كنا قد ملأنا أكثر من 20 كيس قمامة كبير. وجد أحد الطلاب دراجة قديمة مدفونة تحت بعض الشجيرات، وساعدنا في إخراجها ووضعها جانبًا لإعادة التدوير.
بعد التنظيف، جلسنا تحت الشجرة الكبيرة في وسط الحديقة وتشاركنا الوجبات الخفيفة والماء. بدا الجميع متعبين لكنهم فخورون بما أنجزناه في بضع ساعات فقط.
شكرنا قائد المجموعة وذكر أنهم يخططون لتنظيم فعاليات مماثلة كل شهر.
"""

# Step 1: split the article into sentences with NLTK's sent_tokenize (default settings).
# NOTE(review): a dedicated Arabic sentence segmenter may split more reliably — confirm before swapping.
sentences = sent_tokenize(arabic_article)

# Step 2: embed each sentence with the shared sentence-embedding model.
sentence_embeddings = sentence_embedding_model.encode(sentences)

# Step 3: classify every sentence with the RandomForest entry of `models`;
# label 1 marks a sentence selected for the summary.
predicted_labels = models['RandomForest'].predict(sentence_embeddings)

# Step 4: keep the sentences labelled 1, in their original order, and join them.
summary_sentences = [
    pair[0] for pair in zip(sentences, predicted_labels) if pair[1] == 1
]
generated_summary = " ".join(summary_sentences)

# Step 5: show the summary, or a placeholder when no sentence was selected.
print("🔹 ملخص المقال:")
print(generated_summary or "[لم يتم التنبؤ بأي جملة ملخصة]")

In [None]:
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Demo: extractive summary of a hand-written English article.
english_article = """
The event was organized by a group of high school students who wanted to make the park area cleaner and safer for kids.

We started early in the morning around 8 a.m., and everyone brought their own tools like brooms, gloves, and trash bags. The team was enthusiastic, and there was even music playing in the background while we worked.

By noon, we had already filled more than 20 large garbage bags. One of the students found an old bicycle buried under some bushes, and we helped pull it out and set it aside to be recycled.

After the cleanup, we sat under the big tree in the center of the park and shared snacks and water. Everyone looked tired but proud of the difference we made in just a few hours.

The leader of the group thanked us all and mentioned that they plan to organize similar events every month.
"""

# Sentence segmentation with NLTK.
sentences = sent_tokenize(english_article)

# One embedding per sentence from the shared sentence-embedding model.
sentence_embeddings = sentence_embedding_model.encode(sentences)

# RandomForest classifier from `models`; label 1 marks a summary sentence.
predicted_labels = models['RandomForest'].predict(sentence_embeddings)

# Keep the selected sentences in article order and join them into one string.
summary_sentences = [
    pair[0] for pair in zip(sentences, predicted_labels) if pair[1] == 1
]
summary = " ".join(summary_sentences)

# Print the summary, or a placeholder when nothing was selected.
print("🔹 English Summary:")
print(summary or "[No summary sentences predicted]")
