In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import re
import logging
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Reproducibility: weight initialisation, DataLoader shuffling and the
# teacher-forcing coin flips all draw from these generators, so seed them once
# up front (torch.manual_seed also seeds the CUDA generators).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device and hyperparameters
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f'Using device: {DEVICE}')

BATCH_SIZE = 64        # sequence pairs per optimisation step
MAX_TEXT_LEN = 128     # padded/truncated length of source token sequences
MAX_SUMMARY_LEN = 32   # padded/truncated length of target token sequences
EPOCHS = 10            # full passes over the training split
LEARNING_RATE = 1e-4   # Adam step size
EMBEDDING_DIM = 256    # width of the token embeddings
HIDDEN_DIM = 512       # LSTM hidden-state width

INFO:__main__:Using device: cuda


In [13]:
def normalize_arabic(text):
    """Return a normalised copy of an Arabic string.

    Missing values and non-string inputs yield an empty string. Otherwise the
    substitutions below are applied in order: letter variants are unified,
    the listed diacritics are stripped, every character outside the
    U+0600-U+06FF block (other than whitespace) is dropped, digits are
    removed, and whitespace runs are collapsed to a single space before the
    result is stripped.
    """
    if pd.isna(text) or not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'ؤ', 'و'),
        (r'ئ', 'ي'),
        (r'ة', 'ه'),
        (r'[ًٌٍَُِّْ]', ''),  # Remove diacritics
        (r'[^\u0600-\u06FF\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def load_and_preprocess_data():
    """Read the summarisation spreadsheet and return a cleaned DataFrame.

    The first data row is discarded, the two columns are named 'summary' and
    'text', rows with missing or blank values are removed, both columns are
    passed through normalize_arabic, and rows that end up empty after
    normalisation are dropped.

    Returns:
        pandas.DataFrame with string columns 'summary' and 'text'.
    """
    logger.info("Loading and preprocessing dataset...")

    column_names = ['summary', 'text']
    processing_order = ('text', 'summary')

    frame = pd.read_excel("Text summarization dataset.xlsx")
    frame = frame.iloc[1:].reset_index(drop=True)
    frame.columns = column_names
    frame = frame.dropna(subset=column_names)

    # Cast to str, then discard rows whose value is blank or the literal 'nan'.
    for name in processing_order:
        frame[name] = frame[name].astype(str)
    for name in processing_order:
        frame = frame[~frame[name].isin(['nan', ''])]

    # Normalise, then discard rows that normalisation reduced to nothing.
    for name in processing_order:
        frame[name] = frame[name].apply(normalize_arabic)
    for name in processing_order:
        frame = frame[frame[name].str.len() > 0]

    logger.info(f"Final dataset shape: {frame.shape}")
    return frame

In [14]:
def prepare_sequences(texts, summaries):
    """Fit a shared tokenizer and turn texts/summaries into padded id arrays.

    Args:
        texts: iterable of source strings.
        summaries: iterable of target strings, aligned with ``texts``.

    Returns:
        (tokenizer, text_padded, summary_padded) where ``text_padded`` has
        MAX_TEXT_LEN columns and ``summary_padded`` has MAX_SUMMARY_LEN
        columns. Every summary row starts with ``<sos>`` and contains
        ``<eos>``; index 0 is the padding value.
    """
    # Combine all texts and summaries to build a single shared vocabulary
    all_texts = list(texts) + list(summaries)

    # Create tokenizer with no filtering and no OOV token
    tokenizer = Tokenizer(filters='', oov_token=None)
    tokenizer.fit_on_texts(all_texts)

    # Add special tokens after the fitted vocabulary, keeping the reverse
    # mapping (index_word) in sync so ids can be decoded back to tokens.
    for token in ('<sos>', '<eos>'):
        token_idx = len(tokenizer.word_index) + 1
        tokenizer.word_index[token] = token_idx
        tokenizer.index_word[token_idx] = token
    sos_idx = tokenizer.word_index['<sos>']
    eos_idx = tokenizer.word_index['<eos>']

    # Convert texts to sequences
    text_sequences = tokenizer.texts_to_sequences(texts)
    summary_sequences = tokenizer.texts_to_sequences(summaries)

    # pad_sequences truncates from the FRONT by default (truncating='pre'),
    # which would cut <sos> off every over-long summary. Trim the body first so
    # that <sos> + body + <eos> always fits in MAX_SUMMARY_LEN.
    body_len = max(MAX_SUMMARY_LEN - 2, 0)
    summary_sequences = [[sos_idx] + seq[:body_len] + [eos_idx]
                         for seq in summary_sequences]

    # Pad sequences. Source truncation is left at the pad_sequences default so
    # it stays identical to the call used at inference time.
    text_padded = pad_sequences(text_sequences, maxlen=MAX_TEXT_LEN, padding='post')
    summary_padded = pad_sequences(summary_sequences, maxlen=MAX_SUMMARY_LEN, padding='post')

    return tokenizer, text_padded, summary_padded

In [15]:
class SummarizationDataset(Dataset):
    """Aligned (text, summary) id sequences stored as int64 tensors."""

    def __init__(self, texts, summaries):
        # Convert both inputs up front so indexing later is a plain tensor lookup.
        self.texts, self.summaries = (
            torch.tensor(data, dtype=torch.long) for data in (texts, summaries)
        )

    def __len__(self):
        """Number of examples, taken from the text tensor."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the (text, summary) pair stored at ``idx``."""
        source, target = self.texts[idx], self.summaries[idx]
        return source, target

class Encoder(nn.Module):
    """Embedding layer followed by a single batch-first LSTM."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Index 0 is the padding id, so its embedding is held at zero.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )

    def forward(self, x):
        """Embed the token ids in ``x`` and run them through the LSTM.

        Returns:
            (outputs, (hidden, cell)) exactly as produced by the LSTM.
        """
        outputs, final_state = self.lstm(self.embedding(x))
        return outputs, final_state

class Decoder(nn.Module):
    """Embedding, batch-first LSTM and a linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Index 0 is the padding id, so its embedding is held at zero.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden, cell):
        """Advance the LSTM from state (hidden, cell) over the token ids in ``x``.

        Returns:
            (prediction, hidden, cell): vocabulary scores for each input
            position and the updated LSTM state.
        """
        lstm_out, (next_hidden, next_cell) = self.lstm(self.embedding(x), (hidden, cell))
        return self.fc(lstm_out), next_hidden, next_cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` one position at a time, conditioned on ``src``.

        Args:
            src: source token ids, batch on the first axis.
            trg: target token ids; column 0 is fed as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the decoder's own argmax prediction.

        Returns:
            Tensor of shape (batch, trg_len, vocab) holding the decoder scores
            for positions 1..trg_len-1; position 0 is left as zeros.
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.fc.out_features

        # Allocate on the inputs' device rather than a module-level global so
        # the model works wherever its inputs live.
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=src.device)

        _, (hidden, cell) = self.encoder(src)
        decoder_input = trg[:, 0].unsqueeze(1)
        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, t] = step_scores.squeeze(1)
            if torch.rand(1).item() < teacher_forcing_ratio:
                decoder_input = trg[:, t].unsqueeze(1)
            else:
                # Only compute the argmax when it is actually fed back.
                decoder_input = step_scores.argmax(2)
        return outputs

In [16]:
from torch.cuda.amp import autocast, GradScaler

def train_model(model, train_loader, optimizer, criterion, scaler, epoch):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: callable as ``model(src, trg)`` returning scores of shape
            (batch, trg_len, vocab).
        train_loader: iterable of ``(src, trg)`` tensor batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (N, vocab) scores and (N,) target ids.
        scaler: gradient scaler used for the backward pass and optimizer step.
        epoch: current epoch index; accepted for interface compatibility and
            not used inside the function.

    Returns:
        Average loss over the batches seen, or 0.0 if the loader was empty.
    """
    model.train()

    # Run on whatever device the model already lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Mixed precision is only enabled on CUDA; elsewhere the context is a no-op.
    use_amp = device.type == 'cuda'
    amp_device_type = 'cuda' if use_amp else 'cpu'

    total_loss = 0.0
    num_batches = 0
    for src, trg in train_loader:
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        with torch.autocast(device_type=amp_device_type, enabled=use_amp):
            scores = model(src, trg)

            # Position 0 holds no prediction, so drop it from both the scores
            # and the targets before flattening for the loss.
            flat_scores = scores[:, 1:].reshape(-1, scores.shape[-1])
            flat_targets = trg[:, 1:].reshape(-1)

            loss = criterion(flat_scores, flat_targets)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        num_batches += 1

    # Count batches ourselves so an empty loader does not divide by zero.
    return total_loss / num_batches if num_batches else 0.0

In [18]:
# Loading the dataset: reads the spreadsheet, normalises both columns and
# returns a DataFrame with 'summary' and 'text' columns.
df = load_and_preprocess_data()

INFO:__main__:Loading and preprocessing dataset...
INFO:__main__:Final dataset shape: (29189, 2)


In [19]:
# Prepare sequences
tokenizer, text_padded, summary_padded = prepare_sequences(df['text'], df['summary'])

# +1 because token ids start at 1; id 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

# Persist the fitted tokenizer. The inference cells reload it from
# 'tokenizer.pkl', so without this step a fresh Restart & Run All has no file
# to load and no guarantee that the vocabulary matches the trained weights.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [20]:
# Hold out 10% of the pairs; the fixed random_state keeps the split repeatable
train_text, val_text, train_summary, val_summary = train_test_split(
    text_padded,
    summary_padded,
    random_state=42,
    test_size=0.1,
)

# Wrap the training split so it can be iterated in shuffled mini-batches
train_dataset = SummarizationDataset(train_text, train_summary)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [23]:
# Build the encoder/decoder pair and move the assembled model to DEVICE
model = Seq2Seq(
    Encoder(vocab_size, EMBEDDING_DIM, HIDDEN_DIM),
    Decoder(vocab_size, EMBEDDING_DIM, HIDDEN_DIM),
).to(DEVICE)
encoder, decoder = model.encoder, model.decoder

# Loss skips target positions equal to 0 (the padding id); Adam updates all weights
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Gradient scaler consumed by the mixed-precision training step
scaler = GradScaler()

  scaler = GradScaler()


In [24]:

# Train for EPOCHS epochs, writing the model weights to disk after each one
for epoch in range(EPOCHS):
    train_loss = train_model(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        scaler=scaler,
        epoch=epoch,
    )
    logger.info(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}')

    # Checkpoint file name is keyed by the zero-based epoch index
    torch.save(obj=model.state_dict(), f=f"arabic_summarizer_epoch_{epoch}.pth")

  with autocast():


KeyboardInterrupt: 

In [52]:
# Load tokenizer
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# 'tokenizer.pkl' must come from a trusted source (ideally this notebook's own
# training run).
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Set VOCAB_SIZE to match tokenizer: one slot per entry in word_index plus one
# extra for id 0. It sizes the embedding and output layers of the model that
# is rebuilt before loading checkpoint weights.
VOCAB_SIZE = len(tokenizer.word_index) + 1

def load_model_from_checkpoint(epoch=9):
    """Rebuild the Seq2Seq model and load the weights saved for ``epoch``.

    The checkpoint is looked up as ``arabic_summarizer_epoch_<epoch>.pt`` and,
    if that file is absent, as ``arabic_summarizer_epoch_<epoch>.pth``.

    Args:
        epoch: zero-based epoch index embedded in the checkpoint file name.

    Returns:
        The model on DEVICE, switched to evaluation mode.

    Raises:
        FileNotFoundError: if neither checkpoint file exists.
    """
    # Local import: pathlib is not among the notebook-level imports.
    from pathlib import Path

    encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM).to(DEVICE)
    decoder = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM).to(DEVICE)
    model = Seq2Seq(encoder, decoder).to(DEVICE)

    # Accept both extensions so a checkpoint is found whichever one it was
    # written with.
    stem = f'arabic_summarizer_epoch_{epoch}'
    candidates = [Path(f'{stem}.pt'), Path(f'{stem}.pth')]
    checkpoint_path = next((path for path in candidates if path.exists()), None)
    if checkpoint_path is None:
        tried = ', '.join(str(path) for path in candidates)
        raise FileNotFoundError(f'No checkpoint found for epoch {epoch}; tried: {tried}')

    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state_dict needs, and avoids executing arbitrary pickled
    # code from the checkpoint file.
    state_dict = torch.load(checkpoint_path, map_location=DEVICE, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Epoch indices are zero-based, so with EPOCHS = 10 epoch 9 is the last checkpoint.
model = load_model_from_checkpoint(epoch=9)

  model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))


In [54]:
def generate_summary(model, tokenizer, text, max_length=32):
    """Greedily decode a summary for ``text``.

    Args:
        model: object exposing ``encoder`` and ``decoder`` sub-modules.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        text: raw input string; it is normalised with normalize_arabic first.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted words joined by single spaces. An empty string is
        returned when the input contains no known tokens after normalisation.
    """
    model.eval()
    text = normalize_arabic(text)
    text_seq = tokenizer.texts_to_sequences([text])[0]
    if not text_seq:
        # Nothing to condition on; decoding from an all-padding source would
        # only produce noise.
        return ''

    text_padded = pad_sequences([text_seq], maxlen=MAX_TEXT_LEN, padding='post')
    text_tensor = torch.tensor(text_padded, dtype=torch.long).to(DEVICE)

    # Fallback ids are kept for tokenizers that lack the special tokens.
    sos_idx = tokenizer.word_index.get('<sos>', 2)
    eos_idx = tokenizer.word_index.get('<eos>', 3)
    decoder_input = torch.tensor([[sos_idx]], dtype=torch.long).to(DEVICE)

    predicted_ids = []
    # Keep the whole decode loop (not just the encoder call) inside no_grad so
    # no autograd graph is built during inference.
    with torch.no_grad():
        _, (hidden, cell) = model.encoder(text_tensor)
        for _ in range(max_length):
            output, hidden, cell = model.decoder(decoder_input, hidden, cell)
            predicted = output.argmax(2)
            pred_idx = predicted.item()
            if pred_idx == eos_idx:
                break
            predicted_ids.append(pred_idx)
            decoder_input = predicted

    idx2word = {v: k for k, v in tokenizer.word_index.items()}
    # Drop ids with no vocabulary entry (e.g. padding id 0) and the <sos>
    # control token so the joined text has no empty slots or markers.
    words = (idx2word.get(idx) for idx in predicted_ids)
    return ' '.join(word for word in words if word and word != '<sos>')

In [55]:
# Smoke test: summarise a short Arabic passage about healthy nutrition with the
# loaded model and print the decoded result.
test_text = """

تعتبر التغذية السليمة أساس الصحة الجيدة. يجب أن يحتوي النظام الغذائي اليومي على مجموعة متنوعة من الأطعمة المغذية. الخضروات والفواكه الطازجة توفر الفيتامينات والمعادن الضرورية للجسم. البروتينات الموجودة في اللحوم والأسماك والبقوليات تساعد في بناء العضلات وإصلاح الأنسجة.

من المهم تناول وجبات منتظمة وتجنب الوجبات السريعة الغنية بالدهون والسكريات. شرب الماء بكميات كافية يساعد في الحفاظ على رطوبة الجسم وتحسين عملية الهضم. يجب أيضاً التقليل من المشروبات الغازية والعصائر المحلاة.

تناول وجبة الإفطار يعتبر من أهم العادات الصحية. فهي تمد الجسم بالطاقة اللازمة لبدء اليوم بنشاط. من المهم أيضاً تناول وجبات خفيفة صحية بين الوجبات الرئيسية للحفاظ على مستوى الطاقة في الجسم.
"""
summary = generate_summary(model, tokenizer, test_text)
print(f"Generated Summary: {summary}")

Generated Summary: تناول كميه وفيره من الماء تناول تناول من تناول تناول من تناول تناول من تناول
