In [1]:
# importing necessary library
import os
# importing necessary library
import kagglehub
# importing necessary library
import pandas as pd
# importing necessary library
import torch
# importing necessary library
import torch.nn as nn
# importing necessary library
import torch.nn.functional as F
# importing necessary library
import math
# importing necessary library
import copy

In [33]:
from datasets import load_dataset

# Load a small portion (1%) of the scientific papers (arXiv) dataset
dataset = load_dataset("scientific_papers", "arxiv", split="train[:1%]")


In [34]:
print(dataset[0].keys())


dict_keys(['article', 'abstract', 'section_names'])


In [37]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Training tokenizer from scratch using article → abstract
with open("corpus.txt", "w", encoding="utf-8") as f:
    for sample in dataset:
        f.write(sample['article'].replace("\n", " ") + "\n")   # input
        f.write(sample['abstract'].replace("\n", " ") + "\n")  # target/summary

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="corpus.txt",
    vocab_size=30522,
    min_frequency=2,
    special_tokens=["<s>", "</s>", "<pad>", "<unk>"]
)
import os
os.makedirs("custom_tokenizer", exist_ok=True)
tokenizer.save_model("custom_tokenizer")

tokenizer = ByteLevelBPETokenizer(
    "custom_tokenizer/vocab.json",
    "custom_tokenizer/merges.txt"
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>"))
)
tokenizer.enable_truncation(max_length=128)


In [35]:
def preprocess(example):
    input_ids = tokenizer.encode(example["article"]).ids[:128]
    target_ids = tokenizer.encode(example["abstract"]).ids[:32]

    input_ids += [tokenizer.token_to_id("<pad>")] * (128 - len(input_ids))
    target_ids = [tokenizer.token_to_id("<s>")] + target_ids + [tokenizer.token_to_id("</s>")]
    target_ids += [tokenizer.token_to_id("<pad>")] * (34 - len(target_ids))

    return {"input_ids": input_ids, "labels": target_ids}


In [36]:
dataset = dataset.map(preprocess)



Map:   0%|          | 0/2030 [00:00<?, ? examples/s]

In [38]:
print(dataset[0].keys())


dict_keys(['article', 'abstract', 'section_names', 'input_ids', 'labels'])


In [39]:
# Utility: Clone layers
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [40]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Creating constant 'pe' matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Adding positional encoding
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [41]:
# Layer Normalization
class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [42]:
# Sublayer Connection (Residual + Norm)
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

In [43]:
# Multi-Head Attention
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = torch.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [44]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        query, key, value = [
            l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for l, x in zip(self.linears, (query, key, value))
        ]

        x, self.attn = attention(query, key, value, mask, self.dropout)
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

In [45]:
# Position-wise Feed Forward
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [46]:
# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

In [47]:
# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [48]:
# Encoder and Decoder Stacks
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [49]:
# Embeddings with Positional Encoding
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [50]:
# Full model
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [51]:
# Generator
class Generator(nn.Module):
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

# Mask to block future positions
def subsequent_mask(size):
    attn_shape = (1, size, size)
    mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return mask == 0

In [87]:
# Build full model
def make_model(vocab_size, N=6, d_model=768, d_ff=3072, h=8, dropout=0.2):
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, attn, ff, dropout), N),
        Decoder(DecoderLayer(d_model, attn, attn, ff, dropout), N),
        nn.Sequential(Embeddings(d_model, vocab_size), position),
        nn.Sequential(Embeddings(d_model, vocab_size), position),
        Generator(d_model, vocab_size)
    )
    return model


In [55]:
def subsequent_mask(size):
    "Mask out subsequent positions (for auto-regressive decoding)"
    attn_shape = (1, size, size)
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return subsequent_mask == 0


In [53]:
from torch.utils.data import Dataset, DataLoader

class ArxivDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        src = torch.tensor(item["input_ids"])
        tgt = torch.tensor(item["labels"])
        return src, tgt

# Create dataset and dataloader
dataset = ArxivDataset(dataset)  # ← your preprocessed HuggingFace dataset
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [88]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Get vocab size and pad ID
vocab_size = tokenizer.get_vocab_size()
pad_token_id = tokenizer.token_to_id("<pad>")

# Build model
model = make_model(vocab_size).to("cuda" if torch.cuda.is_available() else "cpu")

# Loss and optimizer
loss_fn = nn.NLLLoss(ignore_index=pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 50

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0

    for src, tgt in tqdm(train_loader):
     src = src.to(device)
     tgt = tgt.to(device)

     tgt_input = tgt[:, :-1]
     tgt_output = tgt[:, 1:]

    # Create masks
     src_mask = (src != pad_token_id).unsqueeze(-2)  # shape: [batch, 1, seq_len]
     tgt_mask = (tgt_input != pad_token_id).unsqueeze(-2)  # shape: [batch, 1, tgt_len]
     tgt_mask = tgt_mask & subsequent_mask(tgt_input.size(-1)).to(device)

    # Forward pass with masks
     out = model(src, tgt_input, src_mask, tgt_mask)
     logits = model.generator(out)

     logits = logits.view(-1, logits.size(-1))
     tgt_output = tgt_output.contiguous().view(-1)

     loss = loss_fn(logits, tgt_output)
     optimizer.zero_grad()
     loss.backward()
     optimizer.step()

     total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Average loss: {avg_loss:.4f}")



Epoch 1/50


100%|██████████| 254/254 [00:32<00:00,  7.71it/s]


Average loss: 6.9558

Epoch 2/50


100%|██████████| 254/254 [00:32<00:00,  7.75it/s]


Average loss: 5.7299

Epoch 3/50


100%|██████████| 254/254 [00:32<00:00,  7.78it/s]


Average loss: 5.3794

Epoch 4/50


100%|██████████| 254/254 [00:32<00:00,  7.80it/s]


Average loss: 5.1046

Epoch 5/50


100%|██████████| 254/254 [00:32<00:00,  7.81it/s]


Average loss: 4.8720

Epoch 6/50


100%|██████████| 254/254 [00:32<00:00,  7.78it/s]


Average loss: 4.6610

Epoch 7/50


100%|██████████| 254/254 [00:32<00:00,  7.78it/s]


Average loss: 4.4675

Epoch 8/50


100%|██████████| 254/254 [00:32<00:00,  7.81it/s]


Average loss: 4.2920

Epoch 9/50


100%|██████████| 254/254 [00:32<00:00,  7.77it/s]


Average loss: 4.1247

Epoch 10/50


100%|██████████| 254/254 [00:32<00:00,  7.81it/s]


Average loss: 3.9667

Epoch 11/50


100%|██████████| 254/254 [00:32<00:00,  7.82it/s]


Average loss: 3.8236

Epoch 12/50


100%|██████████| 254/254 [00:32<00:00,  7.77it/s]


Average loss: 3.6873

Epoch 13/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 3.5615

Epoch 14/50


100%|██████████| 254/254 [00:32<00:00,  7.81it/s]


Average loss: 3.4354

Epoch 15/50


100%|██████████| 254/254 [00:32<00:00,  7.81it/s]


Average loss: 3.3142

Epoch 16/50


100%|██████████| 254/254 [00:32<00:00,  7.76it/s]


Average loss: 3.2044

Epoch 17/50


100%|██████████| 254/254 [00:32<00:00,  7.76it/s]


Average loss: 3.0922

Epoch 18/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 2.9892

Epoch 19/50


100%|██████████| 254/254 [00:32<00:00,  7.82it/s]


Average loss: 2.8902

Epoch 20/50


100%|██████████| 254/254 [00:32<00:00,  7.77it/s]


Average loss: 2.7890

Epoch 21/50


100%|██████████| 254/254 [00:32<00:00,  7.81it/s]


Average loss: 2.6951

Epoch 22/50


100%|██████████| 254/254 [00:32<00:00,  7.84it/s]


Average loss: 2.6025

Epoch 23/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 2.5116

Epoch 24/50


100%|██████████| 254/254 [00:32<00:00,  7.80it/s]


Average loss: 2.4167

Epoch 25/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 2.3312

Epoch 26/50


100%|██████████| 254/254 [00:32<00:00,  7.78it/s]


Average loss: 2.2414

Epoch 27/50


100%|██████████| 254/254 [00:32<00:00,  7.83it/s]


Average loss: 2.1555

Epoch 28/50


100%|██████████| 254/254 [00:32<00:00,  7.80it/s]


Average loss: 2.0715

Epoch 29/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 1.9855

Epoch 30/50


100%|██████████| 254/254 [00:32<00:00,  7.82it/s]


Average loss: 1.9006

Epoch 31/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 1.8189

Epoch 32/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 1.7381

Epoch 33/50


100%|██████████| 254/254 [00:32<00:00,  7.80it/s]


Average loss: 1.6520

Epoch 34/50


100%|██████████| 254/254 [00:32<00:00,  7.80it/s]


Average loss: 1.5734

Epoch 35/50


100%|██████████| 254/254 [00:32<00:00,  7.77it/s]


Average loss: 1.4972

Epoch 36/50


100%|██████████| 254/254 [00:32<00:00,  7.80it/s]


Average loss: 1.4237

Epoch 37/50


100%|██████████| 254/254 [00:32<00:00,  7.82it/s]


Average loss: 1.3459

Epoch 38/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 1.2804

Epoch 39/50


100%|██████████| 254/254 [00:32<00:00,  7.78it/s]


Average loss: 1.2106

Epoch 40/50


100%|██████████| 254/254 [00:32<00:00,  7.76it/s]


Average loss: 1.1476

Epoch 41/50


100%|██████████| 254/254 [00:33<00:00,  7.58it/s]


Average loss: 1.0802

Epoch 42/50


100%|██████████| 254/254 [00:33<00:00,  7.69it/s]


Average loss: 1.0174

Epoch 43/50


100%|██████████| 254/254 [00:32<00:00,  7.73it/s]


Average loss: 0.9569

Epoch 44/50


100%|██████████| 254/254 [00:32<00:00,  7.71it/s]


Average loss: 0.9067

Epoch 45/50


100%|██████████| 254/254 [00:32<00:00,  7.77it/s]


Average loss: 0.8559

Epoch 46/50


100%|██████████| 254/254 [00:32<00:00,  7.80it/s]


Average loss: 0.8053

Epoch 47/50


100%|██████████| 254/254 [00:32<00:00,  7.82it/s]


Average loss: 0.7576

Epoch 48/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]


Average loss: 0.7153

Epoch 49/50


100%|██████████| 254/254 [00:32<00:00,  7.78it/s]


Average loss: 0.6735

Epoch 50/50


100%|██████████| 254/254 [00:32<00:00,  7.79it/s]

Average loss: 0.6408





In [89]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_symbol):
    model.eval()
    memory = model.encode(src, src_mask)

    # Start with <s> token
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src).to(src.device)

    for _ in range(max_len - 1):
        tgt_mask = subsequent_mask(ys.size(1)).to(src.device).unsqueeze(1)
        out = model.decode(memory, src_mask, ys, tgt_mask)

        # Use sampling instead of argmax
        probs = torch.softmax(model.generator(out[:, -1]), dim=-1)
        next_word = torch.multinomial(probs, num_samples=1)

        ys = torch.cat([ys, next_word], dim=1)

        # Stop decoding if </s> token is generated
        if next_word.item() == eos_symbol:
            break

    return ys


In [90]:
def generate_summary(model, input_text, tokenizer, max_len=64):
    model.eval()

    # Tokenize the input text (article)
    input_ids = tokenizer.encode(input_text).ids[:128]
    input_ids += [tokenizer.token_to_id("<pad>")] * (128 - len(input_ids))
    src = torch.tensor([input_ids]).to(next(model.parameters()).device)
    src_mask = (src != tokenizer.token_to_id("<pad>")).unsqueeze(-2)

    # Special token IDs
    start_symbol = tokenizer.token_to_id("<s>")
    eos_symbol = tokenizer.token_to_id("</s>")

    # Decode
    decoded_ids = greedy_decode(model, src, src_mask, max_len, start_symbol, eos_symbol)

    # Remove padding and decode to string
    token_ids = [t for t in decoded_ids[0].tolist() if t not in [
    tokenizer.token_to_id("<pad>"),
    tokenizer.token_to_id("<s>"),
    tokenizer.token_to_id("</s>")]]

    return tokenizer.decode(token_ids)

In [76]:
from datasets import load_dataset
raw_dataset = load_dataset("scientific_papers", "arxiv", split="train[:1%]")


In [77]:
sample_abstract = raw_dataset[1]["article"]
generated_title = generate_summary(model, sample_abstract, tokenizer)
print("Generated Title:", generated_title)


Generated Title:  a model large class - electron systems with @xmath1 and in the _ 
 the @xmath1


In [1]:
from datasets import load_dataset
raw_dataset = load_dataset("scientific_papers", "arxiv", split="train[:1%]")

for i in range(5):
    print(f"\nSample {i+1}")
    article = raw_dataset[i]["article"]
    target = raw_dataset[i]["abstract"]
    predicted = generate_summary(model, article, tokenizer)

    print(" Reference:", target[:200], "...")
    print(" Generated:", predicted)
    print("-" * 80)


ModuleNotFoundError: No module named 'datasets'

In [85]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

def evaluate_model(model, tokenizer, dataset, n=5):
    for i in range(n):
        article = dataset[i]["article"]
        reference = dataset[i]["abstract"]
        generated = generate_summary(model, article, tokenizer)

        scores = scorer.score(reference, generated)

        print(f"\nSample {i+1}")
        print(f" Reference: {reference[:150]}...")
        print(f" Generated: {generated}")
        print(f" ROUGE-1: {scores['rouge1'].fmeasure:.4f} | ROUGE-L: {scores['rougeL'].fmeasure:.4f}")
        print("-" * 80)


In [92]:
evaluate_model(model, tokenizer, raw_dataset, n=5)



Sample 1
🔹 Reference:  additive models play an important role in semiparametric statistics . 
 this paper gives learning rates for regularized kernel based methods for addi...
🔸 Generated:  because of major technology for regularized
📈 ROUGE-1: 0.0444 | ROUGE-L: 0.0296
--------------------------------------------------------------------------------

Sample 2
🔹 Reference:  we have studied the leptonic decay @xmath0 , via the decay channel @xmath1 , using a sample of tagged @xmath2 decays collected near the @xmath3 peak ...
🔸 Generated:  we like events which includes @xmath1 , where the @
📈 ROUGE-1: 0.1270 | ROUGE-L: 0.1270
--------------------------------------------------------------------------------

Sample 3
🔹 Reference:  in 84 , 258 ( 2000 ) , mateos conjectured that current reversal in a classical deterministic ratchet is associated with bifurcations from chaotic to ...
🔸 Generated:  at describing the signal comes from chaotic to moderate up to periodic regimes . 
 this is . comp