In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import math


In [25]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output


In [26]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))


In [27]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x


In [28]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x


In [29]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones((seq_length, seq_length), device=tgt.device)).unsqueeze(0).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output


In [30]:
data = pd.read_csv("/content/drive/MyDrive/samsum-train.csv")
dialogues = [str(d) for d in data['dialogue'].tolist()]
summaries = [str(s) for s in data['summary'].tolist()]


In [31]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_SRC_LEN = 128
MAX_TGT_LEN = 64

def tokenize_and_pad(texts, max_length, tokenizer):
    return tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )

input_encodings = tokenize_and_pad(dialogues, MAX_SRC_LEN, tokenizer)
target_encodings = tokenize_and_pad(summaries, MAX_TGT_LEN, tokenizer)


In [32]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_encodings["input_ids"], target_encodings["input_ids"], test_size=0.1
)

class TextSummarizationDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input': self.inputs[idx],
            'label': self.labels[idx]
        }

train_dataset = TextSummarizationDataset(train_inputs, train_labels)
val_dataset = TextSummarizationDataset(val_inputs, val_labels)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


In [33]:

src_vocab_size = tokenizer.vocab_size
tgt_vocab_size = tokenizer.vocab_size
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 4096
dropout = 0.1

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [34]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    model.to(device)
    best_loss = float('inf')
    patience_counter = 0
    patience_threshold = 3

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            src = batch['input'].to(device)
            tgt = batch['label'].to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            optimizer.zero_grad()
            predictions = model(src, tgt_input)
            predictions = predictions.reshape(-1, predictions.size(-1))
            tgt_output = tgt_output.reshape(-1)
            loss = criterion(predictions, tgt_output)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)


        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                src = batch['input'].to(device)
                tgt = batch['label'].to(device)
                tgt_input = tgt[:, :-1]
                tgt_output = tgt[:, 1:]

                predictions = model(src, tgt_input)
                predictions = predictions.reshape(-1, predictions.size(-1))
                tgt_output = tgt_output.reshape(-1)
                loss = criterion(predictions, tgt_output)

                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_loss:
          best_loss = avg_val_loss
          torch.save(model.state_dict(), "best_transformer_model.pth")
        else:
            patience_counter += 1
            if patience_counter >= patience_threshold:
                print("Early stopping")
                break

    print("Training complete.")

In [35]:
def generate_summary(model, src, tokenizer, max_len, device):
    model.eval()
    src = src.to(device)
    src_mask = (src != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
    src_embedded = model.dropout(model.positional_encoding(model.encoder_embedding(src)))

    with torch.no_grad():
        enc_output = src_embedded
        for enc_layer in model.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

    tgt_tokens = torch.tensor([[tokenizer.cls_token_id]], device=device)
    for _ in range(max_len):
        tgt_mask = torch.tril(torch.ones((tgt_tokens.size(1), tgt_tokens.size(1)), device=device)).bool().unsqueeze(0)
        tgt_embedded = model.dropout(model.positional_encoding(model.decoder_embedding(tgt_tokens)))
        dec_output = tgt_embedded

        for dec_layer in model.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        predictions = model.fc(dec_output[:, -1, :])
        next_token = predictions.argmax(dim=-1).unsqueeze(0)


        if next_token.item() == tokenizer.sep_token_id:
            break

        tgt_tokens = torch.cat([tgt_tokens, next_token], dim=1)

    return tokenizer.decode(tgt_tokens.squeeze().tolist(), skip_special_tokens=True)


In [36]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=68691d78a8d2b13554165ef40fae46846ea0561bda12456209e31fe34efe983f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [37]:
from rouge_score import rouge_scorer

def evaluate_model(model, data_loader, tokenizer, max_len, device):
    model.to(device)
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    with torch.no_grad():
        for batch in data_loader:
            src = batch['input'].to(device)
            tgt = batch['label'].to(device)
            for i in range(src.size(0)):
                src_sentence = tokenizer.decode(src[i].tolist(), skip_special_tokens=True)
                tgt_sentence = tokenizer.decode(tgt[i].tolist(), skip_special_tokens=True)
                generated_summary = generate_summary(model, src[i].unsqueeze(0), tokenizer, max_len, device)
                scores = scorer.score(tgt_sentence, generated_summary)
                for key in rouge_scores:
                    rouge_scores[key].append(scores[key].fmeasure)

    avg_scores = {key: sum(values) / len(values) for key, values in rouge_scores.items()}
    return avg_scores


In [38]:
train_model(
    model, train_loader, val_loader, criterion, optimizer,
    num_epochs=60, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)


Epoch 1/60, Train Loss: 6.2285, Val Loss: 5.3040
Epoch 2/60, Train Loss: 5.1003, Val Loss: 4.9770
Epoch 3/60, Train Loss: 4.7997, Val Loss: 4.7986
Epoch 4/60, Train Loss: 4.5873, Val Loss: 4.6843
Epoch 5/60, Train Loss: 4.3777, Val Loss: 4.5007
Epoch 6/60, Train Loss: 4.1128, Val Loss: 4.3349
Epoch 7/60, Train Loss: 3.8636, Val Loss: 4.2128
Epoch 8/60, Train Loss: 3.6263, Val Loss: 4.1178
Epoch 9/60, Train Loss: 3.4092, Val Loss: 4.0555
Epoch 10/60, Train Loss: 3.2043, Val Loss: 4.0066
Epoch 11/60, Train Loss: 3.0040, Val Loss: 3.9875
Epoch 12/60, Train Loss: 2.8124, Val Loss: 4.0029
Epoch 13/60, Train Loss: 2.6222, Val Loss: 4.0128
Epoch 14/60, Train Loss: 2.4361, Val Loss: 4.0290
Early stopping
Training complete.


In [39]:
avg_scores = evaluate_model(
    model, val_loader, tokenizer, max_len=MAX_TGT_LEN,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
print(avg_scores)


{'rouge1': 0.306224923270959, 'rouge2': 0.08354588017017366, 'rougeL': 0.24818029458176016}


In [40]:
import os

checkpoint_path = "/content/best_transformer_model.pth"
assert os.path.exists(checkpoint_path), "Checkpoint file does not exist!"

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
state_dict = torch.load(checkpoint_path, map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model.load_state_dict(state_dict)
model.eval()

  state_dict = torch.load(checkpoint_path, map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"))


Transformer(
  (encoder_embedding): Embedding(30522, 512)
  (decoder_embedding): Embedding(30522, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderLayer(

In [50]:
def inference(model, input_text, tokenizer, max_input_len, max_output_len, device):

    input_encoding = tokenizer(
        input_text,
        max_length=max_input_len,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    ).to(device)

    input_ids = input_encoding['input_ids']

    src_mask = (input_ids != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)

    with torch.no_grad():
        src_embedded = model.dropout(model.positional_encoding(model.encoder_embedding(input_ids)))
        enc_output = src_embedded
        for enc_layer in model.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

    tgt_tokens = torch.tensor([[tokenizer.cls_token_id]], device=device)

    for _ in range(max_output_len):
        tgt_mask = torch.tril(torch.ones((tgt_tokens.size(1), tgt_tokens.size(1)), device=device)).bool().unsqueeze(0)
        tgt_embedded = model.dropout(model.positional_encoding(model.decoder_embedding(tgt_tokens)))
        dec_output = tgt_embedded

        for dec_layer in model.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        predictions = model.fc(dec_output[:, -1, :])
        next_token = predictions.argmax(dim=-1).unsqueeze(0)

        if next_token.item() == tokenizer.sep_token_id:
            break

        tgt_tokens = torch.cat([tgt_tokens, next_token], dim=1)

    summary = tokenizer.decode(tgt_tokens.squeeze().tolist(), skip_special_tokens=True)

    return summary


In [59]:
# Input text
input_text = """Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him ðŸ™‚
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye"""

# Generate summary
summary = inference(model=model,input_text=input_text,tokenizer=tokenizer,max_input_len= 500, max_output_len= 100,device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

print("Generated Summary:", summary)

Generated Summary: amanda is looking for her mother. amanda is not sure if she can't help her.


Generated Summary : amanda is looking for her mother. amanda is not sure if she can't help her.

Actual Summary: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

### 1. **Loss**:
   - **Training Loss**: The training loss steadily decreases throughout the epochs, showing effective learning by the model.
   - **Validation Loss**: The validation loss decreases consistently until Epoch 11, after which it plateaus and slightly increases. Early stopping was appropriately used to prevent overfitting.

### 2. **ROUGE Scores**:
   - **ROUGE-1**: 0.3062
   - **ROUGE-2**: 0.0835
   - **ROUGE-L**: 0.2482
   These scores show a slight improvement over the earlier results, indicating better capture of key unigrams (ROUGE-1) and structural coherence (ROUGE-L). However, the low ROUGE-2 score suggests that capturing consecutive bigrams is still challenging.

### 3. **Evaluation Observations**:
   - **Relevance**: The model demonstrates a moderate ability to generate relevant summaries, as reflected by the improved ROUGE-1 and ROUGE-L scores.
   - **Coherence**: The logical structure and clarity of the summaries have marginally improved, but the summaries still might miss some detailed connections (low ROUGE-2).
   - **Conciseness**: The summaries remain concise and capture essential content, but some key details may still be missing.

These results indicate progress in the model's ability to generate summaries, with better recall and structural alignment compared to earlier attempts. Let me know if you need further details or insights!