In [None]:
!pip install datasets
!pip install transformers



In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig
from sklearn.model_selection import train_test_split
import numpy as np
from dataclasses import dataclass
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from transformers import AutoConfig, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn


In [None]:
# Load the raw summarization dataset from the Colab working directory.
# The bare `df.shape` expression below is the cell's displayed value: (rows, columns).
df = pd.read_csv("/content/summarizdataset.csv")
df.shape

(8378, 4)

In [None]:
# Discard the pre-existing 'Processed Text' column; it is rebuilt from 'text' below.
# Non-inplace and errors='ignore' so re-running this cell does not raise KeyError.
df = df.drop(columns=['Processed Text'], errors='ignore')

In [None]:
import re

# Cleaning function
def clean_arabic_text(text):
    """Normalize a piece of Arabic text.

    The value is first converted with ``str()``, then processed in this order:

    1. strip tashkeel (Arabic diacritics),
    2. strip ASCII letters and digits,
    3. strip every character that is neither a word character nor whitespace,
    4. unify letter variants (alef forms, alef maqsura, hamza carriers,
       ta marbuta, Persian gaf),
    5. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Any value; non-strings are stringified first.

    Returns:
        The cleaned string (possibly empty).
    """
    text = str(text)
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)  # Remove tashkeel
    text = re.sub(r'[a-zA-Z0-9]', '', text)                  # Remove Latin chars & digits
    # Remove punctuation/symbols. After this and the previous step nothing from the
    # old "remove non arabic" character class can remain, so that pass was dropped.
    text = re.sub(r'[^\w\s]', '', text)

    # Letter normalization (none of these substitutions touch whitespace).
    text = re.sub(r"[إأآا]", "ا", text)
    text = re.sub(r"ى", "ي", text)
    text = re.sub(r"[ؤئ]", "ء", text)
    text = re.sub(r"ة", "ه", text)
    text = re.sub(r"گ", "ك", text)

    # Normalize whitespace last so the result never has doubled or edge spaces.
    return re.sub(r'\s+', ' ', text).strip()

# Apply cleaning: the cleaned article text goes into a new 'Processed Text' column,
# while 'summarizer' is overwritten with its cleaned version.
# clean_arabic_text str()-converts its input, so every value written here is a str.
df['Processed Text'] = df['text'].apply(clean_arabic_text)
df['summarizer'] = df['summarizer'].apply(clean_arabic_text)

In [None]:
# Keep only rows whose raw article text is a real string.
df = df[df['text'].apply(lambda x: isinstance(x, str))]
# 'summarizer' and 'Processed Text' have already been through clean_arabic_text,
# which str()-converts everything, so an isinstance check alone cannot reject
# anything. Missing or unusable values show up as empty strings instead, so
# require non-empty cleaned text on both sides of each training pair.
df = df[df['Processed Text'].apply(lambda x: isinstance(x, str) and len(x) > 0)]
df = df[df['summarizer'].apply(lambda x: isinstance(x, str) and len(x) > 0)]


# Drop old 'text' column and reorder (non-inplace: `df` is a filtered selection here)
df = df.drop(columns=['text'])
df = df[['Processed Text', 'type', 'summarizer']]

# Save cleaned data
df.to_csv('preprocessed_cleaned.csv', index=False)

In [None]:
df.shape

(8378, 3)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['Processed Text'],
    df['summarizer'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Recombine the training inputs and targets side by side into one two-column frame.
df_sampled = pd.concat([X_train, Y_train], axis=1)
print(df_sampled.shape)

(6702, 2)


In [None]:
# Sub-sample the training pairs (seeded). The size is capped at the number of
# available rows so the cell also works on smaller datasets instead of raising.
df_sampled = df_sampled.sample(n=min(2000, len(df_sampled)), random_state=42)
print(df_sampled.shape)

(2000, 2)


In [None]:
# Persist the sampled training pairs; index=False writes only the two data columns.
df_sampled.to_csv('/content/summarize_train_final.csv', index=False)

In [None]:
# Recombine the held-out inputs and targets side by side into one two-column frame.
d_sampled = pd.concat([X_test, Y_test], axis=1)
print(d_sampled.shape)

(1676, 2)


In [None]:
# Sub-sample the held-out pairs (seeded). The size is capped at the number of
# available rows so the cell also works on smaller datasets instead of raising.
d_sampled = d_sampled.sample(n=min(400, len(d_sampled)), random_state=42)
print(d_sampled.shape)

(400, 2)


In [None]:
# Persist the sampled held-out pairs. With index=False the file holds only the
# 'Processed Text' and 'summarizer' columns (no id/index column is written).
d_sampled.to_csv('/content/summarize_test_final.csv',index = False)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from dataclasses import dataclass

@dataclass
class LabeledExample:
    """One labeled summarization example: a paragraph and its summary.

    NOTE(review): used as the element type in LabeledDataset's annotation, but
    LabeledDataset.__getitem__ tuple-unpacks its items — confirm which form
    callers actually pass.
    """
    # Source text to be summarized.
    paragraph: str
    # Target summary for `paragraph`.
    summary: str

class LabeledDataset(Dataset):
    """Torch dataset that tokenizes (paragraph, summary) pairs for seq2seq training.

    Each item is a dict with fixed-length 1-D tensors:

    * ``input_ids`` / ``attention_mask`` — the paragraph, truncated/padded to
      ``MAX_INPUT_LENGTH`` tokens;
    * ``labels`` — the summary, truncated/padded to ``MAX_SUMMARY_LENGTH``
      tokens, with every padding position set to ``IGNORE_INDEX`` so the
      model's loss is not computed on padding.

    Items of ``data`` may be ``(paragraph, summary)`` tuples or objects with
    ``paragraph`` and ``summary`` attributes (e.g. ``LabeledExample``).
    """

    # Token budgets for the encoder input and the target summary.
    MAX_INPUT_LENGTH = 512
    MAX_SUMMARY_LENGTH = 110
    # Label value that Hugging Face seq2seq models skip when computing the loss.
    IGNORE_INDEX = -100

    def __init__(self, data: "list[LabeledExample | tuple[str, str]]"):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained('moussaKam/AraBART')

    def __len__(self) -> int:
        return len(self.data)

    @staticmethod
    def _as_text(value) -> str:
        """Return ``value`` stripped if it is a string, else an empty string (e.g. NaN)."""
        return value.strip() if isinstance(value, str) else ""

    def _encode(self, text: str, max_length: int):
        """Tokenize ``text`` to exactly ``max_length`` tokens (truncate + pad)."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        example = self.data[idx]
        # Accept both attribute-style examples and plain 2-tuples.
        if hasattr(example, 'paragraph') and hasattr(example, 'summary'):
            paragraph, summary = example.paragraph, example.summary
        else:
            paragraph, summary = example

        paragraph = self._as_text(paragraph)
        summary = self._as_text(summary)

        inputs = self._encode(paragraph, self.MAX_INPUT_LENGTH)
        targets = self._encode(summary, self.MAX_SUMMARY_LENGTH)

        # Mask padding in the labels; otherwise most of the loss comes from
        # trivially predicting pad tokens after the end of the summary.
        labels = targets['input_ids'].flatten().clone()
        labels[targets['attention_mask'].flatten() == 0] = self.IGNORE_INDEX

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels
        }


In [None]:
# Load and preprocess the labeled dataset
def load_labeled_dataset(file_path, nrows=None):
    """Read a CSV of examples and return them as ``(paragraph, summary)`` tuples.

    Args:
        file_path: CSV path; must contain 'Processed Text' and 'summarizer' columns.
        nrows: Optional cap on the number of rows read (passed to ``pd.read_csv``).

    Returns:
        A list with one ``(paragraph, summary)`` tuple per CSV row, in file order.
    """
    frame = pd.read_csv(file_path, nrows=nrows)
    return [
        (record['Processed Text'], record['summarizer'])
        for _, record in frame.iterrows()
    ]

In [None]:
# Load the sampled training pairs written earlier and wrap them in the tokenizing
# dataset. Constructing LabeledDataset loads the AraBART tokenizer.
labeled_dataset = load_labeled_dataset('/content/summarize_train_final.csv')
dataset = LabeledDataset(labeled_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [None]:
def initialize_model(pretrained_model_name, device):
    """Load a pretrained seq2seq model and move it to ``device``.

    Args:
        pretrained_model_name: Hub id or local path accepted by ``from_pretrained``.
        device: Target device (``torch.device`` or device string).

    Returns:
        The loaded model, already moved to ``device``.
    """
    model_config = AutoConfig.from_pretrained(pretrained_model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name, config=model_config)
    model.to(device)

    return model

# Detect the device instead of hard-coding 'cuda', which fails on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = initialize_model('moussaKam/AraBART', device)


pytorch_model.bin:   0%|          | 0.00/557M [00:00<?, ?B/s]

In [None]:
def create_data_loader(dataset, batch_size, shuffle=True, num_workers=4, pin_memory=True):
    """Wrap ``dataset`` in a ``DataLoader``.

    The defaults reproduce the original training configuration (shuffled,
    4 workers, pinned memory). The extra keyword parameters keep this
    signature call-compatible with the validation-loader helper of the same
    name defined later in the notebook.

    Args:
        dataset: Any map-style dataset.
        batch_size: Number of examples per batch.
        shuffle: Reshuffle the data every epoch.
        num_workers: Number of loader worker processes (0 = load in the main process).
        pin_memory: Whether the loader should pin host memory.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

def initialize_training_components(model, learning_rate):
    """Build the optimizer and loss object used for fine-tuning.

    Args:
        model: Module whose parameters will be optimized.
        learning_rate: Learning rate for AdamW.

    Returns:
        ``(optimizer, criterion)`` — an ``AdamW`` optimizer over
        ``model.parameters()`` and a ``CrossEntropyLoss`` instance.
    """
    return optim.AdamW(model.parameters(), lr=learning_rate), nn.CrossEntropyLoss()


# Training hyper-parameters and the objects built from them.
batch_size = 8
learning_rate = 5e-5
data_loader = create_data_loader(dataset, batch_size)
# NOTE(review): train_one_epoch uses the loss returned by the model (outputs.loss);
# `criterion` is created here but not passed to it.
optimizer, criterion = initialize_training_components(model, learning_rate)




In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
def prepare_validation_dataset(file_path, tokenizer_name='moussaKam/AraBART', tokenizer=None):
    """Build a tokenizing dataset over a validation CSV.

    The CSV must contain 'Processed Text' and 'summarizer' columns. An
    'example_id' column is used when present; otherwise the 0-based row
    position serves as the id, because the test CSV written by this notebook
    (``to_csv(..., index=False)`` on a two-column frame) has no id column.

    Args:
        file_path: Path to the validation CSV.
        tokenizer_name: Checkpoint to load the tokenizer from when ``tokenizer``
            is not given.
        tokenizer: Optional ready-made tokenizer (callable like a Hugging Face
            tokenizer); avoids a second load when one is already in memory.

    Returns:
        A map-style dataset whose items are dicts with keys ``example_id``,
        ``paragraph``, ``summarizer``, ``input_ids`` and ``attention_mask``
        (the tensors are 1-D, padded/truncated to 512 tokens). Its ``data``
        attribute is the list of ``(example_id, paragraph, summarizer)`` tuples.
    """

    df = pd.read_csv(file_path)
    if 'example_id' in df.columns:
        example_ids = df['example_id'].tolist()
    else:
        example_ids = list(range(len(df)))
    # Empty CSV fields are read back as NaN; turn them into empty strings so the
    # tokenizer always receives text.
    paragraphs = df['Processed Text'].fillna('').astype(str).tolist()
    summaries = df['summarizer'].fillna('').astype(str).tolist()
    validation_data = list(zip(example_ids, paragraphs, summaries))

    class ValidationDataset(Dataset):
        """Tokenizes one paragraph per item and carries its id and true summary along."""

        def __init__(self, data, tokenizer):
            self.data = data
            self.tokenizer = tokenizer

        def __len__(self):
            return len(self.data)

        def __getitem__(self, index):
            example_id, paragraph, summarizer = self.data[index]
            inputs = self.tokenizer(paragraph,
                                    truncation=True,
                                    padding='max_length',
                                    max_length=512,
                                    return_tensors='pt')
            return {
                'example_id': example_id,
                'paragraph': paragraph,
                'summarizer': summarizer,  # true summary, used later for scoring
                # Drop only the leading batch dimension added by return_tensors='pt'.
                'input_ids': inputs['input_ids'].squeeze(0),
                'attention_mask': inputs['attention_mask'].squeeze(0)
            }

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    dataset_instance = ValidationDataset(validation_data, tokenizer)

    return dataset_instance


# Build the validation dataset from the held-out CSV written earlier in the notebook.
file_path = '/content/summarize_test_final.csv'
validation_dataset = prepare_validation_dataset(file_path)


In [None]:
def create_data_loader(dataset, batch_size, shuffle=True, num_workers=0, pin_memory=False):
    """Wrap ``dataset`` in a ``DataLoader``.

    This definition replaces the earlier helper of the same name once the cell
    runs. ``shuffle`` therefore has a default so the earlier two-argument call
    ``create_data_loader(dataset, batch_size)`` keeps working (shuffled, as
    before) if that cell is re-run afterwards.

    Args:
        dataset: Any map-style dataset.
        batch_size: Number of examples per batch.
        shuffle: Reshuffle the data every epoch.
        num_workers: Number of loader worker processes (0 = load in the main process).
        pin_memory: Whether the loader should pin host memory.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

# Validation batches are served in file order (shuffle=False).
batch_size = 8
validation_data_loader = create_data_loader(validation_dataset, batch_size, shuffle=False)


In [None]:
def train_one_epoch(model, data_loader, optimizer, device, epoch=None):
    """Run one optimization pass over ``data_loader`` and report the mean loss.

    Args:
        model: Seq2seq model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with ``.loss``.
        data_loader: Iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors).
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.
        epoch: Optional 0-based epoch index used only for the progress message.
            When omitted, a module-level ``epoch`` variable is used if one
            exists (this function historically read the training loop's
            variable as a global).

    Returns:
        The mean batch loss as a float, or ``nan`` when the loader yields no batches.
    """
    if epoch is None:
        # Backward compatibility with callers that rely on the loop variable.
        epoch = globals().get('epoch')

    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # The model computes the loss itself when `labels` are supplied.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    prefix = f"Epoch {epoch + 1}, " if epoch is not None else ""
    if num_batches == 0:
        print(f"{prefix}no batches to train on")
        return float('nan')

    avg_loss = total_loss / num_batches
    print(f"{prefix}Loss: {avg_loss:.4f}")
    return avg_loss

def evaluate_model(model, validation_loader, device):
    """Switch ``model`` to evaluation mode.

    NOTE(review): placeholder — ``validation_loader`` and ``device`` are accepted
    but unused, and no metric is computed or returned.
    """
    model.eval()

def save_model(model, directory):
    """Save ``model`` to ``directory`` by calling its ``save_pretrained`` method.

    Only the model is saved here; no tokenizer is written by this function.
    """
    model.save_pretrained(directory)


# Fine-tune for a fixed number of epochs, then save the final weights.
num_epochs = 10
for epoch in range(num_epochs):
    # `epoch` is deliberately a module-level name: train_one_epoch reads it for
    # its "Epoch N, Loss: ..." progress line.
    train_one_epoch(model, data_loader, optimizer, device)
    # evaluate_model currently only switches the model to eval mode.
    evaluate_model(model, validation_data_loader, device)

save_model(model, 'summrizer_model')




Epoch 1, Loss: 0.8278
Epoch 2, Loss: 0.1541
Epoch 3, Loss: 0.1139
Epoch 4, Loss: 0.0909
Epoch 5, Loss: 0.0740
Epoch 6, Loss: 0.0608
Epoch 7, Loss: 0.0540
Epoch 8, Loss: 0.0418
Epoch 9, Loss: 0.0371
Epoch 10, Loss: 0.0298




In [None]:
# Reload the fine-tuned weights from the directory written by save_model.
model_path = './summrizer_model'
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
# save_model stored only the model, so the tokenizer comes from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained('moussaKam/AraBART')
# Write model and tokenizer into the same directory so it can be loaded on its own.
model.save_pretrained('/content/summrizer_model')
tokenizer.save_pretrained('/content/summrizer_model')

('/content/summrizer_model/tokenizer_config.json',
 '/content/summrizer_model/special_tokens_map.json',
 '/content/summrizer_model/sentencepiece.bpe.model',
 '/content/summrizer_model/added_tokens.json',
 '/content/summrizer_model/tokenizer.json')

In [None]:
!zip -r summrizer_model.zip summrizer_model
from google.colab import files
files.download('summrizer_model.zip')

  adding: summrizer_model/ (stored 0%)
  adding: summrizer_model/generation_config.json (deflated 43%)
  adding: summrizer_model/special_tokens_map.json (deflated 52%)
  adding: summrizer_model/tokenizer_config.json (deflated 76%)
  adding: summrizer_model/model.safetensors (deflated 8%)
  adding: summrizer_model/config.json (deflated 60%)
  adding: summrizer_model/tokenizer.json (deflated 77%)
  adding: summrizer_model/sentencepiece.bpe.model (deflated 57%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import csv
import torch

# Function to generate predictions
def generate_predictions(model, data_loader, tokenizer, device, ratio=0.47):
    """Generate one summary per example in ``data_loader``.

    Args:
        model: Seq2seq model exposing ``to``, ``eval`` and ``generate``.
        data_loader: Iterable of batches with keys 'example_id', 'input_ids',
            'attention_mask' and 'paragraph'.
        tokenizer: Tokenizer exposing ``tokenize`` and ``decode``.
        device: Device the model and input tensors are moved to.
        ratio: Fraction of the paragraph's token count used as the lower
            candidate for ``max_length`` (previously hard-coded to 0.47).

    Returns:
        ``(predictions, token_counts)`` — a list of
        ``{'example_id': ..., 'summary': ...}`` dicts and a parallel list with
        the tokenizer token count of each source paragraph.
    """
    model.to(device)
    model.eval()
    predictions = []
    original_token_counts = []

    for batch in data_loader:
        # Ids stay on the host: they are only reported, never fed to the model,
        # and may be collated as a plain list when they are not numeric.
        example_ids = batch['example_id']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        paragraphs = batch['paragraph']

        for example_id, inp_id, att_mask, paragraph in zip(example_ids, input_ids, attention_mask, paragraphs):
            # Counted in tokenizer tokens, not whitespace-separated words.
            paragraph_token_count = len(tokenizer.tokenize(paragraph))
            original_token_counts.append(paragraph_token_count)

            # generate() expects a leading batch dimension.
            if inp_id.dim() == 1:
                inp_id = inp_id.unsqueeze(0)
            if att_mask.dim() == 1:
                att_mask = att_mask.unsqueeze(0)

            # NOTE(review): inp_id.size(1) is the *padded* input length, so with
            # inputs padded to 512 tokens this cap is at least 513 and the
            # ratio term rarely decides — confirm that is intended.
            target_length = max(int(ratio * paragraph_token_count), inp_id.size(1) + 1)

            outputs = model.generate(input_ids=inp_id, attention_mask=att_mask, max_length=target_length)
            summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Tensor / numpy scalar ids become plain Python values; others pass through.
            if hasattr(example_id, 'item'):
                example_id = example_id.item()
            predictions.append({'example_id': example_id, 'summary': summary})

    return predictions, original_token_counts

def save_predictions_to_csv(predictions, file_path):
    """Write prediction dicts to a UTF-8 CSV with an ``example_id,summary`` header.

    Args:
        predictions: Iterable of dicts with 'example_id' and 'summary' keys.
        file_path: Destination path; the file is created or overwritten.
    """
    columns = ['example_id', 'summary']
    with open(file_path, 'w', encoding='utf-8', newline='') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(predictions)

# Re-detect the device, generate a summary for every validation example, and
# persist the (example_id, summary) pairs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
predictions, word_counts = generate_predictions(model, validation_data_loader, tokenizer, device)
save_predictions_to_csv(predictions, 'predictions.csv')

In [None]:
def summarize_paragraph(paragraph, model, tokenizer, device, ratio=0.47):
    """Summarize a single paragraph with a length cap tied to its token count.

    Args:
        paragraph: Text to summarize.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode_plus``, ``tokenize`` and ``decode``.
        device: Device the encoded tensors are moved to.
        ratio: ``max_length`` passed to ``generate`` is
            ``int(ratio * number_of_tokens)``, but never less than 1.

    Returns:
        The decoded summary string (special tokens skipped).
    """
    encoded = tokenizer.encode_plus(
        paragraph,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=512
    )
    generation_inputs = {
        name: encoded[name].to(device) for name in ('input_ids', 'attention_mask')
    }

    # Length budget: a share of the paragraph's token count, floored at one token.
    length_cap = int(ratio * len(tokenizer.tokenize(paragraph)))
    if length_cap < 1:
        length_cap = 1

    generated = model.generate(max_length=length_cap, **generation_inputs)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


summary_ratio = 0.50
input_paragraph = "التقنيات الذكية تعزز بشكل كبير التفاعل بين البشر والآلات، وتغير بوضوح نهج الحياة اليومية وطريقة التفكير. فالذكاء الاصطناعي والتعلم الآلي يقدمان حلاً مبتكرًا للتحديات التقنية والاقتصادية، مما يؤدي إلى تطوير العديد من الصناعات وتحسين الخدمات. وبفضل التطور المستمر في هذا المجال، نرى ثورة رقمية تحدث تغييرات عميقة في كافة جوانب الحياة، من العمل إلى الترفيه والتواصل الاجتماعي."
generated_summary = summarize_paragraph(input_paragraph, model, tokenizer, device, summary_ratio)
print("Generated Summary:", generated_summary)


Generated Summary: فالذكاء الاصطناعي والتعلم الآلي يقدمان حلا مبتكرا للتحديات التقنية والاقتصادية مما يؤدي إلى تطوير العديد من الصناعات وتحسين الخدمات وبفضل التطور المستمر في هذا المجال نرى ثورة


In [None]:
summary_ratio = 0.50
input_paragraph = "تغير المناخ هو من أبرز التحديات التي تواجه البشرية اليوم. ويُقصد به التغير طويل الأمد في درجات الحرارة وأنماط الطقس على كوكب الأرض، ويرتبط إلى حد كبير بالنشاط البشري، خاصة الانبعاثات الناتجة عن حرق الوقود الأحفوري مثل الفحم والنفط. أدت هذه الانبعاثات إلى تراكم غازات الاحتباس الحراري في الغلاف الجوي، مما تسبب في ارتفاع درجة حرارة الأرض. من أبرز آثار تغير المناخ: ذوبان الجليد القطبي، وارتفاع مستوى سطح البحر، وتغير نمط الأمطار، وزيادة الظواهر الجوية المتطرفة كالفيضانات والجفاف. وللتصدي لهذا التحدي، تتعاون الدول عبر اتفاقيات دولية مثل اتفاقية باريس، وتتبنى سياسات للحد من الانبعاثات، وتحفيز استخدام الطاقة المتجددة، وتعزيز الوعي البيئي بين الأفراد والمجتمعات"
generated_summary = summarize_paragraph(input_paragraph, model, tokenizer, device, summary_ratio)
print("Generated Summary:", generated_summary)


Generated Summary: تغير المناخ هو من أبرز التحديات التي تواجه البشرية اليوم. ويقصد به التغير طويل الأمد في درجات الحرارة وأنماط الطقس على كوكب الأرض، ويرتبط إلى حد كبير بالنشاط البشري، خاصة الانبعاثات الناتجة عن حرق الوقود الأحفوري مثل الفحم والنفط. من أبرز آثار تغير المناخ: ذوبان الجليد القطبي، وارتفاع مستوى سطح


In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu

def calculate_rouge(predictions, true_summaries):
    """Average ROUGE-1/2/L F-scores over paired predictions and references.

    Args:
        predictions: Sequence of dicts with a 'summary' key (generated text).
        true_summaries: Sequence of reference summaries, aligned by position.

    Returns:
        Dict mapping 'rouge-1', 'rouge-2' and 'rouge-l' to the mean F-score.
        A pair the scorer rejects with ``ValueError`` counts as 0.0 for every
        metric; with no pairs at all, every mean is 0.0.
    """
    rouge = Rouge()
    metrics = ('rouge-1', 'rouge-2', 'rouge-l')
    scores = {metric: [] for metric in metrics}

    for prediction, true_summary in zip(predictions, true_summaries):
        try:
            result = rouge.get_scores(prediction['summary'], true_summary)[0]
        except ValueError:
            # The rouge package is expected to raise ValueError for an empty
            # hypothesis/reference (TODO confirm against the installed version).
            # Score such a pair as "no overlap" instead of aborting the run.
            for metric in metrics:
                scores[metric].append(0.0)
            continue
        for metric in metrics:
            scores[metric].append(result[metric]['f'])

    # Guard the mean against an empty input.
    avg_scores = {
        key: (sum(value) / len(value) if value else 0.0)
        for key, value in scores.items()
    }
    return avg_scores

def calculate_bleu(predictions, true_summaries):
    """Average smoothed sentence-level BLEU over paired predictions and references.

    Texts are split on whitespace. Smoothing (NLTK ``SmoothingFunction().method1``)
    is applied because, without it, any sentence lacking a higher-order n-gram
    match scores effectively zero — the situation NLTK warns about.

    Args:
        predictions: Sequence of dicts with a 'summary' key (generated text).
        true_summaries: Sequence of reference summaries, aligned by position.

    Returns:
        The mean BLEU score, or 0.0 when there are no pairs.
    """
    # Local import: only this function needs the smoothing helper.
    from nltk.translate.bleu_score import SmoothingFunction

    smoothing = SmoothingFunction().method1
    bleu_scores = [
        sentence_bleu(
            [true_summary.split()],
            prediction['summary'].split(),
            smoothing_function=smoothing,
        )
        for prediction, true_summary in zip(predictions, true_summaries)
    ]

    if not bleu_scores:
        return 0.0
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
    return avg_bleu_score

# Read the reference summaries straight from the dataset's stored
# (example_id, paragraph, summarizer) tuples. Indexing the dataset itself would
# tokenize every paragraph again just to fetch a string.
true_summaries = [summary for _example_id, _paragraph, summary in validation_dataset.data]

In [None]:
# Calculate ROUGE scores (mean F-score per metric over all prediction/reference pairs)
rouge_scores = calculate_rouge(predictions, true_summaries)

# Calculate BLEU score (mean sentence-level BLEU over the same pairs)
bleu_score = calculate_bleu(predictions, true_summaries)

# Print all scores, rounded to two decimals
print(f"ROUGE-1 Score: {rouge_scores['rouge-1']:.2f}")
print(f"ROUGE-2 Score: {rouge_scores['rouge-2']:.2f}")
print(f"ROUGE-L Score: {rouge_scores['rouge-l']:.2f}")
print(f"BLEU Score: {bleu_score:.2f}")



ROUGE-1 Score: 0.84
ROUGE-2 Score: 0.81
ROUGE-L Score: 0.79
BLEU Score: 0.75


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
