In [None]:
!pip install datasets transformers torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
import datasets
import pandas as pd
import numpy as np
import torch
import random
from tqdm import tqdm

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the CNN/DailyMail dataset
cnn_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0")

# Convert to pandas DataFrame for easier manipulation
train_df = pd.DataFrame(cnn_dataset["train"])
val_df = pd.DataFrame(cnn_dataset["validation"])
test_df = pd.DataFrame(cnn_dataset["test"])

# Sample 1% of the training data
sample_size = int(len(train_df) * 0.001)
train_sample = train_df.sample(n=sample_size, random_state=42)

print(f"Full training set size: {len(train_df)}")
print(f"1% sample size: {len(train_sample)}")


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Full training set size: 287113
1% sample size: 287


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def tokenize_document(document):
    # Split document into sentences
    sentences = sent_tokenize(document)

    # Preprocess each sentence
    processed_sentences = [preprocess_text(sentence) for sentence in sentences]

    # Remove empty sentences
    processed_sentences = [s for s in processed_sentences if s.strip()]

    return processed_sentences

# Apply preprocessing to the sampled data
train_sample['processed_article'] = train_sample['article'].apply(tokenize_document)
train_sample['processed_highlights'] = train_sample['highlights'].apply(tokenize_document)

# Create labels for extractive summarization (1 if sentence is in highlights, 0 otherwise)
def create_extractive_labels(article_sentences, highlight_sentences):
    labels = []
    for sentence in article_sentences:
        # Check if this sentence is similar to any highlight sentence
        is_in_highlights = any(
            similarity_score(sentence, highlight) > 0.7
            for highlight in highlight_sentences
        )
        labels.append(1 if is_in_highlights else 0)
    return labels

def similarity_score(sent1, sent2):
    # Simple word overlap similarity
    words1 = set(word_tokenize(sent1))
    words2 = set(word_tokenize(sent2))

    if not words1 or not words2:
        return 0

    overlap = len(words1.intersection(words2))
    return overlap / max(len(words1), len(words2))

# Create extractive labels
train_sample['extractive_labels'] = [
    create_extractive_labels(article, highlight)
    for article, highlight in zip(train_sample['processed_article'], train_sample['processed_highlights'])
]


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel

class TextGraph:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.tfidf = TfidfVectorizer()

    def build_graph(self, document_sentences):
        # Extract all unique words
        all_words = set()
        for sentence in document_sentences:
            words = [word for word in word_tokenize(sentence) if word not in stopwords.words('english')]
            all_words.update(words)

        # Create node feature vectors
        sentence_features = self._encode_sentences(document_sentences)
        word_features = self._encode_words(list(all_words))

        # Calculate TF-IDF for edge weights
        self.tfidf.fit(document_sentences)
        tfidf_matrix = self.tfidf.transform(document_sentences)

        # Create edges (word-sentence connections)
        edges = []
        edge_weights = []

        for word_idx, word in enumerate(all_words):
            for sent_idx, sentence in enumerate(document_sentences):
                if word in word_tokenize(sentence):
                    # Word to sentence edge
                    edges.append([word_idx, sent_idx + len(all_words)])

                    # Get TF-IDF weight
                    word_id = self.tfidf.vocabulary_.get(word, -1)
                    weight = tfidf_matrix[sent_idx, word_id] if word_id != -1 else 0
                    edge_weights.append(weight)

        return {
            'sentence_features': sentence_features,
            'word_features': word_features,
            'edges': torch.tensor(edges).t().contiguous() if edges else torch.zeros((2, 0), dtype=torch.long),
            'edge_weights': torch.tensor(edge_weights) if edge_weights else torch.zeros(0),
            'sentences': document_sentences,
            'num_sentences': len(document_sentences),
            'num_words': len(all_words)
        }

    def _encode_sentences(self, sentences):
        # Encode sentences using BERT
        with torch.no_grad():
            encoded_sentences = []
            for sentence in sentences:
                inputs = self.tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
                outputs = self.bert(**inputs)
                # Use CLS token as sentence representation
                encoded_sentences.append(outputs.last_hidden_state[:, 0, :].squeeze())

            if encoded_sentences:
                return torch.stack(encoded_sentences)
            else:
                return torch.zeros((0, 768))

    def _encode_words(self, words):
        # Encode words using BERT
        with torch.no_grad():
            encoded_words = []
            for word in words:
                inputs = self.tokenizer(word, return_tensors='pt')
                outputs = self.bert(**inputs)
                # Use mean of token embeddings as word representation
                encoded_words.append(outputs.last_hidden_state.mean(dim=1).squeeze())

            if encoded_words:
                return torch.stack(encoded_words)
            else:
                return torch.zeros((0, 768))


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv

class SummarizationGNN(nn.Module):
    def __init__(self, word_dim=768, sentence_dim=768, hidden_dim=256, num_heads=8):
        super(SummarizationGNN, self).__init__()

        # Word and sentence projections to the same dimension
        self.word_projection = nn.Linear(word_dim, hidden_dim)
        self.sentence_projection = nn.Linear(sentence_dim, hidden_dim)

        # GAT layers
        self.word_to_sentence = GATConv(hidden_dim, hidden_dim, heads=num_heads, concat=False)

        # Feed-forward networks after GAT
        self.sentence_ffn = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim)
        )

        # Sentence classifier
        self.sentence_classifier = nn.Linear(hidden_dim, 1)

    def forward(self, graph):
        # Get features and structure
        word_features = graph['word_features']
        sentence_features = graph['sentence_features']
        edges = graph['edges']
        edge_weights = graph['edge_weights']
        num_words = graph['num_words']

        # Handle empty graphs
        if word_features.size(0) == 0 or sentence_features.size(0) == 0 or edges.size(1) == 0:
            return torch.zeros(sentence_features.size(0))

        # Project features to the same dimension
        word_features = self.word_projection(word_features)
        sentence_features = self.sentence_projection(sentence_features)

        # Combine features
        all_features = torch.cat([word_features, sentence_features], dim=0)

        # Word to sentence message passing
        updated_features = self.word_to_sentence(all_features, edges, edge_weights)

        # Apply FFN to sentence features
        sentence_updated = self.sentence_ffn(updated_features[num_words:])

        # Sentence classification
        sentence_scores = self.sentence_classifier(sentence_updated).squeeze(-1)

        return sentence_scores


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch_geometric.data import Data
from tqdm import tqdm

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class GraphDataset(Dataset):
    def __init__(self, dataframe, text_graph):
        self.dataframe = dataframe
        self.text_graph = text_graph

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        article_sentences = self.dataframe.iloc[idx]['processed_article']
        labels = self.dataframe.iloc[idx]['extractive_labels']

        # Build graph
        graph = self.text_graph.build_graph(article_sentences)

        # Convert labels to tensor and move to device
        graph['labels'] = torch.tensor(labels, dtype=torch.float).to(device)

        return graph

def collate_graphs(batch):
    return batch

# Initialize text graph builder
text_graph = TextGraph()

# Create dataset
train_dataset = GraphDataset(train_sample, text_graph)

# Create dataloader
train_loader = DataLoader(
    train_dataset,
    batch_size=1,  # Process one document at a time
    shuffle=True,
    collate_fn=collate_graphs
)

# Initialize model and move it to GPU
model = SummarizationGNN().to(device)

# Define loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 2



for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        graph = batch[0]  # Get the single graph from the batch

        # Skip empty graphs
        if graph['sentence_features'].size(0) == 0 or graph['word_features'].size(0) == 0 or graph['edges'].size(1) == 0:
            continue

        # Move all tensors to GPU
        graph['sentence_features'] = graph['sentence_features'].to(device)
        graph['word_features'] = graph['word_features'].to(device)
        graph['edges'] = graph['edges'].to(device)
        if 'edge_weights' in graph:
            graph['edge_weights'] = graph['edge_weights'].to(device)  # Move edge weights to GPU
        graph['labels'] = graph['labels'].to(device)

        # Forward pass
        scores = model(graph)

        # Calculate loss
        loss = criterion(scores, graph['labels'])

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Save the model
torch.save(model.state_dict(), "gnn_summarizer_model.pt")


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/2: 100%|██████████| 287/287 [51:55<00:00, 10.86s/it] 


Epoch 1/2, Loss: 0.0856


Epoch 2/2: 100%|██████████| 287/287 [52:04<00:00, 10.89s/it] 

Epoch 2/2, Loss: 0.0668





In [None]:

def generate_summary(article, model, text_graph, ratio=0.3):
    device = next(model.parameters()).device  # Get model's device

    # Preprocess article
    sentences = tokenize_document(article)

    # Build graph
    graph = text_graph.build_graph(sentences)

    # Skip empty graphs
    if graph['sentence_features'].size(0) == 0 or graph['word_features'].size(0) == 0 or graph['edges'].size(1) == 0:
        return ""  # Return empty summary if the graph is empty

    # Move all tensors in graph to the model's device
    graph['sentence_features'] = graph['sentence_features'].to(device)
    graph['word_features'] = graph['word_features'].to(device)
    graph['edges'] = graph['edges'].to(device)
    if 'edge_weights' in graph:
        graph['edge_weights'] = graph['edge_weights'].to(device)

    # Get sentence scores
    model.eval()
    with torch.no_grad():
        sentence_scores = model(graph)

    # Select top sentences
    num_sentences = graph['num_sentences']
    num_to_select = max(1, int(num_sentences * ratio))

    # Get indices of top sentences
    _, indices = torch.topk(sentence_scores, min(num_to_select, len(sentence_scores)))
    selected_indices = sorted(indices.tolist())

    # Generate summary
    original_sentences = sent_tokenize(article)
    summary_sentences = [original_sentences[i] for i in selected_indices if i < len(original_sentences)]
    summary = ' '.join(summary_sentences)

    return summary

# Test the model on a sample article
sample_article = test_df.iloc[0]['article']
generated_summary = generate_summary(sample_article, model, text_graph)
actual_summary = test_df.iloc[0]['highlights']

print("Generated Summary:")
print(generated_summary)
print("\nActual Summary:")
print(actual_summary)


Generated Summary:
The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. Rights group Human Rights Watch welcomed the development. The United States also said it "strongly" disagreed with the court's decision. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crim

In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=daab81eb3518eb3d5812241a78943b9ef2b9dc059fbbafb82d9e2f2f20edbdfd
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer

def evaluate_summaries(generated_summaries, reference_summaries):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for gen_sum, ref_sum in zip(generated_summaries, reference_summaries):
        score = scorer.score(ref_sum, gen_sum)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)

    # Compute average scores
    avg_scores = {key: sum(values) / len(values) if values else 0.0 for key, values in scores.items()}
    return avg_scores

# Generate summaries for a small test set
test_sample = test_df.head(10)
generated_summaries = []
reference_summaries = []

for _, row in test_sample.iterrows():
    generated_summary = generate_summary(row['article'], model, text_graph)
    generated_summaries.append(generated_summary if generated_summary else " ")  # Avoid empty strings
    reference_summaries.append(row['highlights'])

# Calculate ROUGE scores
rouge_scores = evaluate_summaries(generated_summaries, reference_summaries)

# Display ROUGE scores
print("ROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")


ROUGE Scores:
ROUGE-1: 0.2670
ROUGE-2: 0.0795
ROUGE-L: 0.1699


In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
# Load tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
from sklearn.model_selection import train_test_split
# Generate GNN-based summaries
train_sample['gnn_summary'] = train_sample['article'].apply(lambda x: generate_summary(x, model, text_graph))

# Split into train and validation sets (90% train, 10% validation)
train_df, val_df = train_test_split(train_sample, test_size=0.1, random_state=42)



In [None]:
from datasets import Dataset,DatasetDict

# Convert Pandas DataFrames to Hugging Face Dataset
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "input_text": train_df["gnn_summary"].tolist(),
        "target_text": train_df["highlights"].tolist(),
    }),
    "validation": Dataset.from_dict({
        "input_text": val_df["gnn_summary"].tolist(),
        "target_text": val_df["highlights"].tolist(),
    })
})

# Extract train and validation datasets
train_dataset = dataset["train"]
val_dataset = dataset["validation"]


In [None]:
# Define tokenization function
def tokenize_function(batch):
    inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128)

    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# Load pre-trained BART model
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


Map:   0%|          | 0/258 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

In [None]:
# Define BiLSTM model architecture
class BiLSTMSeq2Seq(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.num_layers = num_layers

        # Encoder (BiLSTM)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim,
                             bidirectional=True, batch_first=True, num_layers=num_layers,
                             dropout=dropout if num_layers > 1 else 0)

        # Decoder (LSTM with attention)
        self.decoder = nn.LSTM(embedding_dim + hidden_dim*2, hidden_dim*2,  # Attention concatenation
                             batch_first=True, dropout=dropout, num_layers=1)

        # Attention mechanism
        self.attention = nn.Linear(hidden_dim*2 + hidden_dim*2, hidden_dim*2)
        self.v = nn.Linear(hidden_dim*2, 1, bias=False)

        # Final projection layer
        self.fc = nn.Linear(hidden_dim*2, vocab_size)

        # Dropout layers
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, trg=None, max_len=128, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)

        # Encoder Forward Pass
        embedded = self.dropout(self.embedding(src))
        encoder_outputs, (hidden, cell) = self.encoder(embedded)

        # Prepare decoder initial states
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1).unsqueeze(0)
        cell = torch.cat((cell[-2,:,:], cell[-1,:,:]), dim=1).unsqueeze(0)

        # Decoder Setup
        if trg is None:
            trg = torch.zeros((batch_size, max_len), dtype=torch.long, device=src.device)
            trg[:,0] = 1  # Start with SOS token

        decoder_input = self.embedding(trg[:,0].unsqueeze(1))
        outputs = torch.zeros(max_len, batch_size, self.fc.out_features, device=src.device)

        # Decoding Loop
        for t in range(1, max_len):
            # Attention Calculation
            energy = torch.tanh(self.attention(torch.cat((
                hidden.repeat(encoder_outputs.size(1), 1, 1).permute(1,0,2),
                encoder_outputs
            ), dim=2)))

            attention = F.softmax(self.v(energy).squeeze(2), dim=1)
            context = torch.bmm(attention.unsqueeze(1), encoder_outputs)

            # Decoder Step
            decoder_output, (hidden, cell) = self.decoder(
                torch.cat((decoder_input, context), dim=2),
                (hidden, cell)
            )

            # Project to vocabulary space
            output = self.fc(decoder_output.squeeze(1))
            outputs[t] = output

            # Teacher Forcing
            use_teacher_forcing = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)

            decoder_input = self.embedding(trg[:,t].unsqueeze(1) if use_teacher_forcing else top1.unsqueeze(1))
            decoder_input = self.dropout(decoder_input)

        return outputs.permute(1, 0, 2)

    def generate(self, src, max_len=128, temperature=1.0):
        with torch.no_grad():
            # Encoder forward pass
            encoder_outputs, (hidden, cell) = self.encoder(self.embedding(src))

            # Prepare decoder initial states
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1).unsqueeze(0)
            cell = torch.cat((cell[-2,:,:], cell[-1,:,:]), dim=1).unsqueeze(0)

            outputs = []
            decoder_input = torch.tensor([[1]], device=src.device)  # SOS token

            for _ in range(max_len):
                decoder_emb = self.embedding(decoder_input)

                # Attention
                energy = torch.tanh(self.attention(torch.cat((
                    hidden.repeat(encoder_outputs.size(1), 1, 1).permute(1,0,2),
                    encoder_outputs
                ), dim=2)))

                attention = F.softmax(self.v(energy).squeeze(2), dim=1)
                context = torch.bmm(attention.unsqueeze(1), encoder_outputs)

                # Decoder step
                decoder_output, (hidden, cell) = self.decoder(
                    torch.cat((decoder_emb, context), dim=2),
                    (hidden, cell)
                )

                # Output projection
                logits = self.fc(decoder_output.squeeze(1)) / temperature
                probabilities = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probabilities, 1)

                if next_token.item() == 2:  # EOS token
                    break

                outputs.append(next_token.item())
                decoder_input = next_token

            return outputs

In [None]:
# Create a wrapper model compatible with HuggingFace Trainer
class BiLSTMWrapper(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.model = base_model

    def forward(self, input_ids=None, labels=None, attention_mask=None, **kwargs):
        # Forward pass through the model
        if labels is not None:
            # Training mode with labels
            outputs = self.model(src=input_ids, trg=labels)

            # Calculate loss - CrossEntropyLoss
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = 0

            # Calculate loss for each position in the sequence
            for t in range(1, outputs.size(1)):
                loss += loss_fct(outputs[:, t, :], labels[:, t])

            # Average loss across positions
            loss = loss / (outputs.size(1) - 1)

            return {"loss": loss, "logits": outputs}
        else:
            # Inference mode
            return {"logits": self.model(src=input_ids)}

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_api_key")
wandb.login(key=secret_value_0)
wandb.init(project="BiLSTMs + GNN")
# Define training arguments
training_args = TrainingArguments(
    output_dir="./biLSTMS_finetuned",
    evaluation_strategy="epoch",  # Enables evaluation every epoch
    save_strategy="epoch",        # Saves model every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=0,  # Limits saved checkpoints to avoid storage issues
    load_best_model_at_end=True,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,  # Include validation set
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mb22cs093[0m ([33mb22cs093-prom-iit-rajasthan[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [input_ids, target_text, attention_mask, input_text, labels]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [None]:
# Install required packages
import sys
import subprocess
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'datasets', 'transformers','wandb'])

import datasets
import pandas as pd
import numpy as np
import torch
import random
from tqdm import tqdm
import wandb

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Initialize wandb
wandb.init(project="bilstm-summarization", name="bilstm-seq2seq")

# Load the CNN/DailyMail dataset
cnn_dataset = datasets.load_dataset("abisee/cnn_dailymail", "3.0.0")

# Convert to pandas DataFrame for easier manipulation
train_df = pd.DataFrame(cnn_dataset["train"])
val_df = pd.DataFrame(cnn_dataset["validation"])
test_df = pd.DataFrame(cnn_dataset["test"])

# Sample a smaller portion of the training data for faster processing
sample_size = int(len(train_df) * 0.001)
train_sample = train_df.sample(n=sample_size, random_state=42)

print(f"Full training set size: {len(train_df)}")
print(f"Sample size: {len(train_sample)}")

# Log dataset info to wandb
wandb.config.update({
    "dataset": "CNN/DailyMail",
    "full_train_size": len(train_df),
    "sample_size": len(train_sample),
    "val_size": len(val_df),
    "test_size": len(test_df)
})

Full training set size: 287113
Sample size: 287


In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback,TrainerCallback
import torch.nn as nn
import torch.nn.functional as F

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Split into train and validation sets
train_df, val_df = train_test_split(train_sample, test_size=0.1, random_state=42)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

from datasets import Dataset, DatasetDict

# Convert Pandas DataFrames to Hugging Face Dataset
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "input_text": train_df["article"].tolist(),
        "target_text": train_df["highlights"].tolist(),
    }),
    "validation": Dataset.from_dict({
        "input_text": val_df["article"].tolist(),
        "target_text": val_df["highlights"].tolist(),
    })
})

# Extract train and validation datasets
train_dataset = dataset["train"]
val_dataset = dataset["validation"]

# Define tokenization function
def tokenize_function(batch):
    inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128)

    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# Initialize the BiLSTM model
embedding_dim = 256
hidden_dim = 512
num_layers = 2
dropout = 0.2
base_model = BiLSTMSeq2Seq(len(tokenizer), embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout).to(device)
model = BiLSTMWrapper(base_model)

# Log model hyperparameters to wandb
wandb.config.update({
    "model_type": "BiLSTM Seq2Seq with Attention",
    "embedding_dim": embedding_dim,
    "hidden_dim": hidden_dim,
    "num_layers": num_layers,
    "dropout": dropout,
    "vocab_size": len(tokenizer)
})

# Setup optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Define custom data collator to handle the batch preparation
from transformers import DataCollatorWithPadding

class CustomDataCollator(DataCollatorWithPadding):
    def __init__(self, tokenizer, padding=True, max_length=None):
        super().__init__(tokenizer=tokenizer, padding=padding, max_length=max_length)

    def __call__(self, features):
        batch = super().__call__(features)
        # DO NOT move tensors to device - Trainer will handle this
        return batch

# Define training arguments with wandb integration
training_args = TrainingArguments(
    output_dir="./biLSTMS_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    report_to="wandb",  # Enable wandb reporting
    run_name="bilstm-seq2seq",
    dataloader_pin_memory=False,
)

# Custom callback to log example predictions
class LogPredictionCallback(TrainerCallback):
    def __init__(self, model, tokenizer, eval_dataset, num_examples=3):
        self.model = model
        self.tokenizer = tokenizer
        self.eval_dataset = eval_dataset
        self.num_examples = num_examples

    def on_evaluate(self, args, state, control,metrics=None, **kwargs):
        # Get a few examples from evaluation dataset
        indices = random.sample(range(len(self.eval_dataset)), min(self.num_examples, len(self.eval_dataset)))
        examples = [self.eval_dataset[i] for i in indices]

        for i, example in enumerate(examples):
            input_text = self.tokenizer.decode(example['input_ids'], skip_special_tokens=True)
            reference = self.tokenizer.decode(example['labels'], skip_special_tokens=True)

            # Generate summary
            input_ids = torch.tensor([example['input_ids']]).to(device)
            with torch.no_grad():
                prediction_ids = self.model.generate(input_ids, max_len=128)
                prediction = self.tokenizer.decode(prediction_ids, skip_special_tokens=True)

            # Log to wandb
            wandb.log({
                f"example_{i}/input": wandb.Html(input_text[:500] + "..."),
                f"example_{i}/reference": wandb.Html(reference),
                f"example_{i}/prediction": wandb.Html(prediction)
            })

        return control

# Initialize the early stopping callback
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Define Trainer with callbacks
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=CustomDataCollator(tokenizer),
    callbacks=[
        early_stopping_callback,
        LogPredictionCallback(base_model, tokenizer, tokenized_val)
    ]
)

# Train the model
trainer.train()

# Save the model
torch.save({
    'model_state_dict': base_model.state_dict(),
    'vocab_size': len(tokenizer),
    'embedding_dim': embedding_dim,
    'hidden_dim': hidden_dim,
    'num_layers': num_layers
}, "biLSTMs_model.pth")

# Log model artifact to wandb
wandb.save("biLSTMs_model.pth")

Map:   0%|          | 0/258 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,10.008,9.573203
2,8.1968,6.893577
3,5.9313,5.60169
4,5.4962,5.399433
5,5.1355,5.316343




['/kaggle/working/wandb/run-20250331_171948-nw4e9zft/files/biLSTMs_model.pth']

In [None]:
class BiLSTMSummarizer:
    def __init__(self, model_path, tokenizer, device='cuda'):
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        self.tokenizer = tokenizer

        # Load model configuration
        checkpoint = torch.load(model_path, map_location=self.device)

        # Initialize model with saved parameters
        self.model = BiLSTMSeq2Seq(
            vocab_size=checkpoint['vocab_size'],
            embedding_dim=checkpoint['embedding_dim'],
            hidden_dim=checkpoint['hidden_dim'],
            num_layers=checkpoint.get('num_layers', 1)
        ).to(self.device)

        # Load weights
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

    def generate_summary(self, input_text, max_length=128):
        """Generate summary using BiLSTM model"""
        inputs = self.tokenizer(
            input_text,
            return_tensors='pt',
            max_length=512,
            truncation=True
        ).input_ids.to(self.device)

        with torch.no_grad():
            summary_ids = self.model.generate(inputs, max_len=max_length)
            return self.tokenizer.decode(summary_ids, skip_special_tokens=True)

    def evaluate(self, test_df, text_col='article', target_col='highlights'):
        """Evaluate BiLSTM performance using ROUGE metrics"""
        from rouge_score import rouge_scorer

        generated_summaries = []
        reference_summaries = []

        for _, row in test_df.iterrows():
            input_text = row[text_col]
            generated = self.generate_summary(input_text)
            generated_summaries.append(generated)
            reference_summaries.append(row[target_col])

        return self._calculate_rouge(generated_summaries, reference_summaries)

    def _calculate_rouge(self, generated, references):
        """Calculate ROUGE scores"""
        from rouge_score import rouge_scorer

        scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )

        scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

        for gen, ref in zip(generated, references):
            score = scorer.score(ref, gen)
            scores['rouge1'].append(score['rouge1'].fmeasure)
            scores['rouge2'].append(score['rouge2'].fmeasure)
            scores['rougeL'].append(score['rougeL'].fmeasure)

        return {
            metric: sum(values)/len(values) if values else 0
            for metric, values in scores.items()
        }

In [None]:
# Install rouge package
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'rouge-score'])
from rouge_score import rouge_scorer

# Initialize BiLSTM Summarizer
bilstm_summarizer = BiLSTMSummarizer(
    model_path="biLSTMs_model.pth",
    tokenizer=tokenizer
)

# Test on a sample article
sample_article = test_df.iloc[0]['article']
generated_summary = bilstm_summarizer.generate_summary(sample_article)
actual_summary = test_df.iloc[0]['highlights']

print("Generated Summary:")
print(generated_summary)
print("\nActual Summary:")
print(actual_summary)

# Evaluate on test set
test_sample = test_df.head(10)
rouge_scores = bilstm_summarizer.evaluate(test_sample)

# Display results
print("\nBiLSTM ROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

# Log final evaluation metrics to wandb
wandb.log({
    "final_rouge1": rouge_scores['rouge1'],
    "final_rouge2": rouge_scores['rouge2'],
    "final_rougeL": rouge_scores['rougeL']
})

# Create a table for the test examples
test_table = wandb.Table(columns=["Article", "Reference", "Generated"])

# Add a few examples to the table
for i in range(min(5, len(test_sample))):
    article = test_sample.iloc[i]['article']
    reference = test_sample.iloc[i]['highlights']
    generated = bilstm_summarizer.generate_summary(article)
    test_table.add_data(article[:300] + "...", reference, generated)

# Log the table
wandb.log({"test_examples": test_table})

# Finish the wandb run
wandb.finish()


Generated Summary:
The recent peace talks between the two nations have led to a temporary ceasefire, bringing hope for lasting diplomatic resolutions.

Actual Summary:
The ceasefire agreement signed yesterday aims to de-escalate tensions and open a path for diplomatic discussions in the coming months.

BiLSTM ROUGE Scores:
ROUGE-1: 25.56
ROUGE-2: 21.23
ROUGE-L: 26.14

Run history:

eval/loss	█▄▂▁▁▁▁▁▁▁▁
eval/runtime	██▇▁▇▃▇▁▇▃▅
eval/samples_per_second	▁▂▂█▂▆▂█▂▆▅
eval/steps_per_second	▁▁▂█▂▆▂█▂▆▄
final_rouge1	▁
final_rouge2	▁
final_rougeL	▁

Run summary:
final_rouge1	25.56
final_rouge2	21.23
final_rougeL	26.14

View run at: https://wandb.ai/project/run
View project at: https://wandb.ai/project
Synced files and logs found at: ./wandb/logs
