<a href="https://colab.research.google.com/github/tannisthamaiti/AIWeekend-Project/blob/main/Transformer_chatbot_Question.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets torch transformers
!wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
!unzip -qq cornell_movie_dialogs_corpus.zip
!rm cornell_movie_dialogs_corpus.zip
!mkdir datasets
!mv cornell\ movie-dialogs\ corpus/movie_conversations.txt ./datasets
!mv cornell\ movie-dialogs\ corpus/movie_lines.txt ./datasets

In [None]:
import ast
import json
import math
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.utils.data import Dataset, DataLoader

## 1) Data Processing

In [None]:
# data processing
max_len = 25

# Characters deleted by remove_punc. A raw string keeps the backslash literal
# without relying on the invalid escape sequence "\,".
_PUNCTUATION = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# Translation table built once; maps every punctuation character to "delete".
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)


def remove_punc(string):
    """Return ``string`` lower-cased with punctuation characters removed.

    Only the characters listed in ``_PUNCTUATION`` are dropped; whitespace and
    every other character (for example ``+``, ``=`` and ``|``) are kept.

    Args:
        string: Text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    # A single C-level pass replaces the per-character string concatenation.
    return string.translate(_PUNCTUATION_TABLE).lower()

In [None]:
# Locations of the two corpus files inside ./datasets.
corpus_movie_conv = './datasets/movie_conversations.txt'
corpus_movie_lines = './datasets/movie_lines.txt'

# Both files are decoded as ISO-8859-1 and read fully into memory,
# one list element per line (trailing newlines are kept).
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as conv_file:
    conv = conv_file.readlines()

with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as lines_file:
    lines = lines_file.readlines()

## Check first 5 lines

In [None]:
# Preview: print at most the first five raw records from `lines`.
lines_dic = {}
line_iter = iter(lines)

for _ in range(5):
    try:
        line = next(line_iter)
    except StopIteration:
        # Fewer than five records are available; stop early.
        break
    else:
        print(line)

In [None]:
# Preview: print at most the first five raw records from `conv`.
conv_dic = {}
conv_iter = iter(conv)

for _ in range(5):
    try:
        line = next(conv_iter)
    except StopIteration:
        # Fewer than five records are available; stop early.
        break
    else:
        print(line)

In [None]:
# Field separator used by both corpus files.
FIELD_SEP = " +++$+++ "

# extract text
# Map each line id (first field) to its utterance text (last field).
lines_dic = {}
for line in lines:
    objects = line.split(FIELD_SEP)
    lines_dic[objects[0]] = objects[-1]

# generate question answer pairs
# Every two consecutive utterances of a conversation form one
# [question_words, reply_words] pair, each side truncated to max_len words.
pairs = []
for con in conv:
    # The last field is a list literal of line ids. literal_eval parses it
    # without executing arbitrary code, unlike eval.
    ids = ast.literal_eval(con.split(FIELD_SEP)[-1].strip())
    for first_id, second_id in zip(ids, ids[1:]):
        first = remove_punc(lines_dic[first_id].strip())
        second = remove_punc(lines_dic[second_id].strip())
        pairs.append([first.split()[:max_len], second.split()[:max_len]])

# sample
print(pairs[20])

In [None]:
# Frequency threshold applied when the vocabulary is built from word_freq.
min_word_freq = 5

# Count every word occurrence across the question side and the reply side
# of all pairs.
word_freq = Counter(
    word
    for pair in pairs
    for sentence in (pair[0], pair[1])
    for word in sentence
)

min_word_freq: sets a threshold to ignore rare words (those appearing ≤ 5
times).

word_freq: a Counter() that keeps track of how often each word appears in questions and answers.

pair[0]: the question (list of words).

pair[1]: the reply (list of words).

update(...): adds to the count for each word.



## Create the Vocabulary (Word Map)
Filters out rare words.

word_map: assigns a unique integer ID to each word, starting from 1.

Then, adds special tokens:

In [None]:
# Keep only the words counted strictly more than min_word_freq times.
words = [word for word, count in word_freq.items() if count > min_word_freq]
# Number the kept words consecutively starting at 1; id 0 is left unassigned
# here (the masking code later in the notebook compares against 0 for padding).
word_map = {word: index for index, word in enumerate(words, start=1)}

📌 Special tokens are useful for sequence models:

`<unk>` for unknown words

`<start>` & `<end>` for indicating sequence boundaries

`<pad>` ensures sequences are the same length (batching)

In [None]:
# TODO (exercise): replace each None with that token's integer id.
# Regular words already occupy ids 1..len(words) in word_map, so the special
# tokens need ids outside that range. The masking code in this notebook tests
# `!= 0` to detect padding, so <pad> is presumably meant to be 0 — confirm.
word_map['<unk>'] = None
word_map['<start>'] = None
word_map['<end>'] = None
word_map['<pad>'] = None

# Vocabulary size including the four special tokens added above.
print("Total words are: {}".format(len(word_map)))



🔹 3. Encoding Functions
Converts list of words → list of integers (based on word_map).

Question Encoder:
Replaces each word with its index from word_map.

Pads to fixed length max_len.

In [None]:
# encode sentences based on word map
def encode_question(words, word_map):
    """Encode a question (list of words) as a list of integer ids.

    Exercise stub: currently returns ``None``.

    Intended behaviour per the accompanying notes: replace each word with its
    id from ``word_map`` and pad the result to the fixed length ``max_len``.
    Words missing from ``word_map`` presumably map to ``<unk>`` — confirm.
    """
    # TODO (exercise): build the padded id list here.
    enc_c = None
    return enc_c

Adds `<start>` and `<end>` tokens.

Pads to max_len.

💡 max_len is the fixed length of sequences for the model.

In [None]:
def encode_reply(words, word_map):
    """Encode a reply (list of words) as a list of integer ids.

    Exercise stub: currently returns ``None``.

    Intended behaviour per the accompanying notes: wrap the word ids with the
    ``<start>`` and ``<end>`` token ids, then pad. The sample output later in
    the notebook shows replies two positions longer than questions, so the
    padded word part is presumably ``max_len`` long — confirm.
    """
    # TODO (exercise): build the <start> ... <end> + padding id list here.
    enc_c = None
    return enc_c

Applies the encoders to all question-answer pairs.

Final result: pairs_encoded is a list of [encoded_question, encoded_reply] pairs — ready for training.

In [None]:
# Encode both sides of every pair; each entry is [question_ids, reply_ids].
pairs_encoded = [
    [encode_question(question_words, word_map), encode_reply(reply_words, word_map)]
    for question_words, reply_words in pairs
]

In [None]:
# dataset and dataloader
class Dataset(torch.utils.data.Dataset):
    """Dataset over pre-encoded ``[question_ids, reply_ids]`` pairs.

    The base class is named by its fully qualified path because this class
    reuses the name ``Dataset``. With ``class Dataset(Dataset)``, re-running
    the cell would subclass the previously defined custom class instead of
    the torch base class.

    Args:
        pairs: Indexable collection whose items hold the question ids at
            position 0 and the reply ids at position 1.
    """

    def __init__(self, pairs):
        self.pairs = pairs
        self.dataset_size = len(self.pairs)

    def __getitem__(self, i):
        """Return the ``i``-th (question, reply) pair as two LongTensors."""
        question = torch.LongTensor(self.pairs[i][0])
        reply = torch.LongTensor(self.pairs[i][1])
        return question, reply

    def __len__(self):
        """Return the number of pairs."""
        return self.dataset_size

# Wrap the encoded pairs in a shuffled loader of 32-example batches.
train_dataset = Dataset(pairs_encoded)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

# Pull a single batch to inspect the tensor shapes.
first_batch = next(iter(train_loader))
question, reply = first_batch
print("Question: ", question.size())
print("Answer: ", reply.size())

## 2) Masking

Mask all the pad tokens (value 0) in the batch to ensure the model does not treat padding as input.

The look-ahead mask hides the future tokens in a sequence; pad tokens are masked out as well. For example, to predict the third word, only the first and second words are used.

## For training a Transformer, we need to mask:

Padding tokens (so the model ignores them)

Future tokens in the decoder input (so it can't "cheat" and see the next word)

Target tokens for calculating loss

In [None]:
# 5x5 lower-triangular matrix of ones (uint8): row i has ones in columns 0..i.
mask_test = torch.tril(torch.ones(5, 5)).to(torch.uint8)
# Show it with a leading singleton dimension, i.e. shape (1, 5, 5).
print(mask_test[None])

So the decoder can only attend to previous and current tokens, not future ones.

2. ✅ question_mask

`question != 0`: masks out `<pad>` tokens (index 0)

Unsqueezes shape to (batch_size, 1, 1, seq_len) for broadcasting in multi-head attention

3. ✅ reply_input_mask

Removes padding and applies the triangular future mask

Final shape: (batch_size, 1, seq_len, seq_len)

This ensures the decoder can only attend to valid previous words.

4. ✅ reply_target_mask

Used during training for loss calculation

Shape: (batch_size, seq_len)

Marks valid tokens (ignores `<pad>`)



In [None]:
# create mask
def create_masks(question, reply_input, reply_target, device='cpu'):
    """Build the attention and loss masks for one batch.

    Exercise skeleton: the ``None`` assignments are placeholders. As written
    the function raises ``AttributeError`` at ``question_mask.to(device)``
    because ``question_mask`` is still ``None`` at that point.

    Args:
        question: Batch of encoded questions; per the inline notes, shape
            (m, max_words) with 0 used for padding.
        reply_input: Batch of decoder-input ids (the reply without its last
            position, going by the cell that calls this function).
        reply_target: Batch of target ids (the reply without its first
            position, going by the cell that calls this function).
        device: Device the question mask is moved to.

    Returns:
        Tuple ``(question_mask, reply_input_mask, reply_target_mask)``.
    """

    def subsequent_mask(size):
        # (max_words, max_words)
        # binary triangle: transposing the upper triangle gives a lower
        # triangle, so row i has ones in columns 0..i
        mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
        # (1, max_words, max_words)
        return mask.unsqueeze(0)

    # TODO (exercise): True where the question token is not padding.
    # boolean(m, max_words)
    question_mask = None

    # NOTE(review): `.to(device)` runs before the second placeholder below,
    # which then overwrites its result. Once filled in, the second assignment
    # must build on the moved tensor, or the move is lost — confirm the
    # intended order.
    # (m, 1, 1, max_words)
    question_mask = question_mask.to(device)
    # TODO (exercise): add the two singleton dimensions for broadcasting.
    question_mask = None
    # TODO (exercise): True where the decoder-input token is not padding.
    # boolean(m, max_words)
    reply_input_mask = None
    # TODO (exercise): add one singleton dimension so it broadcasts against
    # the triangular mask.
    # (m, 1, max_words)
    reply_input_mask = None

    # only include triangle and non-pad token
    # (m, max_words, max_words)
    reply_input_mask = reply_input_mask & subsequent_mask(reply_input.size(-1)).type_as(reply_input_mask.data)
    # NOTE(review): the sample output in this notebook lists reply_input_mask
    # as (m, 1, max_words, max_words), but nothing after the `&` adds that
    # singleton dimension; a final unsqueeze(1) appears to be missing — confirm.
    # (batch_size, max_words)
    reply_target_mask = reply_target != 0

    return question_mask, reply_input_mask, reply_target_mask

This is typical in sequence modeling where input is like:


**Input to decoder**:      `<start>` how are
<br>
**Target to predict**:     how are you `<end>`

In [None]:
# Decoder input drops the last position of each reply; the target drops the
# first, so the two sequences are offset by one token.
reply_input, reply_target = reply[:, :-1], reply[:, 1:]
print('Reply Target Size: ', reply_target.size())

# Create mask and add dimensions, then report the shape of each mask.
question_mask, reply_input_mask, reply_target_mask = create_masks(question, reply_input, reply_target)
for label, mask in (
    ('question_mask Size: ', question_mask),
    ('reply_input_mask Size: ', reply_input_mask),
    ('reply_target_mask Size: ', reply_target_mask),
):
    print(label, mask.size())

```
Reply Target Size:  torch.Size([32, 26])
question_mask Size:  torch.Size([32, 1, 1, 25])
reply_input_mask Size:  torch.Size([32, 1, 26, 26])
reply_target_mask Size:  torch.Size([32, 26])
```

## 3) Positional Embedding
3.1 Positional Embedding
Since this model doesn't contain any recurrence or convolution, positional encoding is added to give the model some information about the relative position of the words in the sentence.

The positional encoding vector is added to the embedding vector. Embeddings represent a token in a d-dimensional space where tokens with similar meaning will be closer to each other.

Implement based on [this notebook](https://github.com/tannisthamaiti/AIWeekend-Project/blob/main/Positional_Encoding.ipynb)

## 🔁 forward(self, embedding, layer_idx)
embedding: the input tensor, either word indices (if layer_idx == 0) or already embedded vectors.

layer_idx: the current layer number in the Transformer (used for layer-wise encoding).

✅ 1. Embedding the input if it's the first layer

Applies word embedding only once, at the first layer.

Scales the embedding vector (standard in Transformer) to stabilize training.

✅ 2. Add positional encoding

self.pe: shape (1, max_len, d_model)

Adds position information (word position in the sequence).

Automatically broadcasts across the batch dimension.

✅ 3. Add layer encoding (temporal embedding per layer)

self.te: shape (1, num_layers, d_model)

This allows the model to differentiate between layers (like a form of layer-specific bias).

Repeats the layer encoding across the sequence length dimension to match shape.

✅ 4. Dropout

Adds regularization to prevent overfitting.

In [None]:
# Positional Embedding
class Embeddings(nn.Module):
    """
    Implements embeddings of the words and adds their positional encodings.

    Exercise skeleton: each ``None`` below is a placeholder to be filled in.
    As written the constructor fails, because ``create_positional_encoding``
    is called with ``None`` for both sizes.
    """
    def __init__(self, vocab_size, d_model, max_len=50, num_layers=6):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        # NOTE(review): this placeholder is overwritten by the nn.Dropout
        # assignment at the end of __init__, so it looks redundant — confirm.
        self.dropout = None
        # TODO (exercise): token embedding table for vocab_size tokens and
        # d_model features.
        self.embed = None
        # TODO (exercise): one encoding row per sequence position.
        # (1, max_len, d_model)
        self.pe = self.create_positional_encoding(None, None)
        # TODO (exercise): one encoding row per layer index.
        # (1, num_layers, d_model)
        self.te = self.create_positional_encoding(None, None)
        self.dropout = nn.Dropout(0.1)

    def create_positional_encoding(self, max_len, d_model, device='cpu'):
        """Return a (1, max_len, d_model) sinusoidal encoding tensor.

        Even feature columns hold a sine and the following odd column holds a
        cosine. ``d_model`` must be even: for an odd value the write to
        column ``i + 1`` falls outside the tensor on the last step.
        """
        pe = torch.zeros(max_len, d_model).to(device)
        # for each position of the word
        for pos in range(max_len):
            # for each dimension of the each position
            for i in range(0, d_model, 2):
                # NOTE(review): `i` already advances in steps of 2, so the
                # exponents 2*i/d_model and 2*(i+1)/d_model look different from
                # the usual formulation, where a sin/cos pair shares one
                # frequency — confirm this is intended.
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        # (1, max_len, d_model)
        pe = pe.unsqueeze(0)
        return pe

    def forward(self, embedding, layer_idx):
        """Embed the tokens (first layer only) and add both encodings.

        Args:
            embedding: Token ids when ``layer_idx == 0``; otherwise presumably
                the already embedded (m, max_len, d_model) tensor — confirm.
            layer_idx: Index of the layer this call is made for.

        Returns:
            Currently ``None`` is never reached: the stub raises on the first
            ``+= None``. The intended result is (m, max_len, d_model).
        """
        # create embed weight during first layer
        # (m, max_len) --> (m, max_len, d_model)
        if layer_idx == 0:
            # scaling helps in stabilizing and improving the convergence properties
            # TODO (exercise): look up the embeddings and apply the scaling.
            embedding = None

        ### Positional Embedding
        # pe will automatically be expanded with the same batch size as encoded_words
        # TODO (exercise): add the slice of self.pe matching the sequence length.
        # (m, max_len, d_model)
        embedding += None

        # TODO (exercise): add the row of self.te for layer_idx, repeated
        # along the sequence dimension.
        # te: (1, num_layers, d_model) --> (1, 1, d_model) --> (1, max_len, d_model)
        # (m, max_len, d_model)
        embedding += None
        # TODO (exercise): apply dropout.
        embedding = None
        return embedding

In [None]:
embed_model = Embeddings(len(word_map), 512)
result = embed_model.forward(question, 0)
print(result.size())

```
torch.Size([32, 25, 512])
```