### Summarization and Q&A Using Transformer

- we need encoder-decoder architecture
- Helpful Resources
	•	The Annotated Transformer (PyTorch):
	•	https://nlp.seas.harvard.edu/2018/04/03/attention.html


### TODO:
- collect data:
    - collect all the ids
    - use https://arxiv.org/pdf/{id} to download pdf
    - read and save all the pdf articles in to csv file.

In [87]:
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax
import copy
import math
import os
import pandas as pd
import time
import numpy as np

### Data Preparation

In [None]:
# Download latest version
# path = kagglehub.dataset_download("Cornell-University/arxiv")
# files = os.listdir(path)

In [5]:
# df = pd.read_json(path+'/arxiv-metadata-oai-snapshot.json', lines=True)
# print("First 5 records:", df.head())

### Model Architecture


In [2]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
    
    # what are these masks for why do we need two masks?
    def forward(self, src, tgt, src_mask, tgt_mask):
        # take in and process maksed src and tgt sequences
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [3]:
class Generator(nn.Module):
    '''
    Standard linear + softmax generation step
    '''
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)
    
    def forward(self, x):
        # TODO: which activation function is the best for summarization?
        return log_softmax(self.proj(x), dim=-1)

In [4]:
def clones(module, N):
    # Produce N identical layers
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [7]:
class LayerNorm(nn.Module):
    '''Construct a layernorm module (See citation for details).'''
    # TODO: what is the purpose of eps?
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps
    
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [8]:
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        'pass the input (and mask) through each layer in turn'
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


In [9]:
# TODO: what is this function trying to do? what is the input and output?
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # why add X to the tensor?
        return x + self.dropout(sublayer(self.norm(x)))

In [10]:
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)


In [11]:
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
    
    def forward(self, x, memory, src_mask, tgt_mask):
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [13]:
def subsequent_mask(size):
    attn_shape = (1, size, size)
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)

    return subsequent_mask == 0

In [14]:
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = torch.softmax(scores, dim=-1)

    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [15]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout = 0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        query, key, value = [
            l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for l, x in zip(self.linears, (query, key, value))
        ]

        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)

        x = (x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k))

        del query
        del key
        del value

        return self.linears[-1](x)


In [16]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(self.w_1(x).relu()))

In [17]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [18]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [19]:
def make_model(src_vocab, tgt_vocab, N=6, d_model= 512, d_ff=2048, h=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab)
    )
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    
    return model

### Testing the model

In [20]:
def inference_test():
    test_model = make_model(11, 11, 2)
    test_model.eval()
    src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    src_mask = torch.ones(1, 1, 10)

    memory = test_model.encode(src, src_mask)
    ys = torch.zeros(1, 1).type_as(src.data)

    for i in range(9):
        out = test_model.decode(memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data))
        prob = test_model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat([ys, torch.empty(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    
    print("Example Untrained Model Prediction:", ys)

In [None]:
for _ in range (10):
    inference_test()

Example Untrained Model Prediction: tensor([[0, 5, 3, 3, 3, 3, 3, 3, 3, 3]])
Example Untrained Model Prediction: tensor([[0, 5, 5, 5, 5, 5, 5, 5, 5, 5]])
Example Untrained Model Prediction: tensor([[0, 1, 5, 7, 5, 7, 5, 7, 5, 7]])
Example Untrained Model Prediction: tensor([[0, 1, 7, 4, 3, 0, 1, 7, 2, 5]])
Example Untrained Model Prediction: tensor([[0, 8, 6, 8, 6, 8, 6, 8, 6, 8]])
Example Untrained Model Prediction: tensor([[0, 3, 8, 2, 9, 2, 9, 2, 9, 2]])
Example Untrained Model Prediction: tensor([[0, 2, 6, 2, 6, 2, 6, 2, 6, 2]])
Example Untrained Model Prediction: tensor([[ 0,  6,  6,  6,  6,  6, 10,  6,  6,  6]])
Example Untrained Model Prediction: tensor([[0, 8, 4, 5, 5, 5, 1, 1, 1, 1]])
Example Untrained Model Prediction: tensor([[0, 6, 8, 4, 4, 3, 1, 7, 1, 6]])


# Training

### Data preparation

In [None]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

In [38]:
# Split data to train(0.8) & test(0.2)
billsum_split = billsum.train_test_split(test_size=0.2)
print('sample', billsum_split["train"][0])
print('training dataset size:', len(billsum_split['train']))
print('testing dataset size:', len(billsum_split['test']))

sample {'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 35554 of the Vehicle Code, as amended by Section 2 of Chapter 263 of the Statutes of 2014, is amended to read:\n35554.\n(a) (1) Notwithstanding Section 35550, the maximum gross weight on any one axle of a bus shall not exceed 20,500 pounds.\n(2) This subdivision does not apply to a transit bus procured through a solicitation process pursuant to which a solicitation was issued before January 1, 2016. This subdivision does not apply to a bus purchased during an option period in a multiyear contract to purchase transit buses that is entered into before January 1, 2016, by a publicly owned or operated transit system, or an operator of a transit system under contract with a publicly owned or operated transit system, provided, however, that the option period does not exceed five years from the date of the original contract, or extend beyond January 1, 2021, whichever is earlier.\n(b) A transi

### data tokenization

In [None]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [41]:
tokenized_billsum = billsum_split.map(preprocess_function, batched=True)

Map: 100%|██████████| 989/989 [00:02<00:00, 415.65 examples/s]
Map: 100%|██████████| 248/248 [00:00<00:00, 439.06 examples/s]


In [45]:
print('', tokenized_billsum.shape)
print('', tokenized_billsum['test'][0].keys())

 {'train': (989, 6), 'test': (248, 6)}
 dict_keys(['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'])


In [48]:
print('', tokenized_billsum['test'][0]['input_ids'])
print('', tokenized_billsum['test'][0]['labels'])

 [21603, 10, 37, 151, 13, 8, 1015, 13, 1826, 103, 3, 35, 2708, 38, 6963, 10, 180, 3073, 9562, 1300, 7491, 209, 41, 287, 526, 4733, 28, 5568, 3, 15442, 11434, 61, 19, 974, 12, 8647, 314, 13, 2733, 209, 13, 6022, 3, 17864, 13, 8, 1685, 11, 6859, 3636, 6, 12, 608, 10, 7491, 1300, 23549, 5, 3, 15442, 11434, 5, 37, 28204, 12902, 11, 15884, 7, 66, 13, 8, 826, 10, 41, 9, 61, 94, 19, 2196, 24, 80, 16, 2391, 3165, 1826, 29, 7, 65, 8363, 6, 11, 8, 2302, 33, 6937, 7313, 5, 37, 1805, 381, 13, 273, 3, 2544, 1342, 33, 4161, 57, 8363, 19, 7752, 11, 5024, 12, 36, 231, 1146, 116, 2945, 53, 16, 8, 20588, 13, 686, 209, 8363, 11, 73, 25930, 18716, 26, 7671, 257, 138, 8363, 5, 41, 115, 61, 1826, 65, 8, 4016, 381, 13, 2041, 126, 1488, 13, 8363, 16, 8, 907, 1323, 5, 41, 75, 61, 37, 20588, 13, 8363, 859, 7, 17, 66, 1826, 29, 7, 65, 1936, 3538, 1093, 147, 8, 657, 5112, 5, 41, 26, 61, 2035, 209, 14912, 770, 151, 16, 1826, 43, 4880, 18999, 346, 1422, 6, 3, 9, 1706, 24, 19, 3, 9, 30073, 12, 423, 3, 26558, 686, 20

In [69]:
src_vocab = [item['text'].split() for item in tokenized_billsum['train']]
flattened_src = set([item for sublist in src_vocab for item in sublist])
print(len(flattened_src))

target_vocab = [item['text'].split() for item in tokenized_billsum['test']]
flattened_target = set([item for sublist in target_vocab for item in sublist])
print(len(flattened_target))


48464
23619


In [None]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_billsum['train'][0]['input_ids'])
# tokens = tokenizer.convert_ids_to_tokens([0])
# Join tokens without any separator
joined_text = ''.join(tokens)
# Replace the marker with a space and strip any leading/trailing spaces
final_text = joined_text.replace('▁', ' ').strip()
print(final_text)

<pad>


In [None]:
# Batching the data
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

input_tokens_src = [input_ids['input_ids'] for input_ids in tokenized_billsum['train']]
input_tokens_target = [label['labels'] for label in tokenized_billsum['train']]
print(len(input_tokens_src))
print(len(input_tokens_target))

# Suppose input_tokens_src and input_tokens_target are lists of lists of token IDs.
# For example:
# input_tokens_src = [[1, 2, 3], [4, 5]]
# input_tokens_target = [[6, 7, 8], [9, 10]]

# Convert each list into a tensor
src_tensors = [torch.tensor(seq, dtype=torch.long) for seq in input_tokens_src]
target_tensors = [torch.tensor(seq, dtype=torch.long) for seq in input_tokens_target]

# Pad the sequences (e.g., using padding_value=0, and setting batch_first=True)
padded_src = pad_sequence(src_tensors, batch_first=True, padding_value=0)
padded_target = pad_sequence(target_tensors, batch_first=True, padding_value=0)

# Create a TensorDataset using the padded tensors
train_dataset = TensorDataset(padded_src, padded_target)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
 

989
989


### Training the system

In [93]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Model
src_vocab_size = len(flattened_src)
target_vocab_size = len(flattened_target)
model = make_model(src_vocab_size, target_vocab_size, 1).to("mps" if torch.backends.mps.is_available() else "cpu")

# Loss & Optimizer
loss_fn = nn.NLLLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
epochs = 10

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0

    for src, tgt, src_mask, tgt_mask in tqdm(train_loader):
        src, tgt, src_mask, tgt_mask = src.to(device), tgt.to(device), src_mask.to(device), tgt_mask.to(device)

        # Shift target for decoder input/output
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_len = tgt_input.size(1)
        tgt_mask = subsequent_mask(tgt_len).to(device)      # shape: [1, tgt_len, tgt_len]
        tgt_mask = tgt_mask.unsqueeze(1)

        out = model(src, tgt_input, src_mask, tgt_mask)
        logits = model.generator(out)

        logits = logits.view(-1, logits.size(-1))
        tgt_output = tgt_output.contiguous().view(-1)

        loss = loss_fn(logits, tgt_output)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Average loss: {avg_loss:.4f}")


Epoch 1/10


  0%|          | 0/31 [00:00<?, ?it/s]


ValueError: not enough values to unpack (expected 4, got 2)