In [1]:
import torch
import numpy as np
from gensim.models import KeyedVectors
from collections import Counter
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from torch.nn import LayerNorm,attention,Embedding
import math
import nltk
import re

In [2]:
# def load_pairs_from_file(file_path):
#     pairs = []
#     with open(file_path, 'r', encoding='utf-8') as f:
#         lines = f.readlines()
#         for i in range(0, len(lines), 3):  # every 3 lines: input, response, blank
#             if i + 1 < len(lines):
#                 input_line = lines[i].strip().replace("Input: ", "")
#                 response_line = lines[i + 1].strip().replace("Response: ", "")
#                 pairs.append((input_line, response_line))
#     return pairs

# def load_pairs_from_file(file_path):
#     with open(file_path, 'r', encoding='utf-8') as f:
#         lines = f.read()
            
#     return lines

def load_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

def split_into_pairs(text):
    utterances = [utt.strip() for utt in text.split('__eou__') if utt.strip()]
    pairs = [(utterances[i], utterances[i + 1]) for i in range(len(utterances) - 1)]
    return pairs

# Load datasets
train_pairs = split_into_pairs(load_text_from_file("dataset/dialogues_train.txt"))
val_pairs   = split_into_pairs(load_text_from_file("dataset/dialogues_validation.txt"))
test_pairs  = split_into_pairs(load_text_from_file("dataset/dialogues_test.txt"))

print(f"Train: {len(train_pairs)} | Val: {len(val_pairs)} | Test: {len(test_pairs)}")


Train: 87169 | Val: 8068 | Test: 7739


In [3]:
print(train_pairs[:10])

[('Say , Jim , how about going for a few beers after dinner ?', 'You know that is tempting but is really not good for our fitness .'), ('You know that is tempting but is really not good for our fitness .', 'What do you mean ? It will help us to relax .'), ('What do you mean ? It will help us to relax .', "Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?"), ("Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?", "I guess you are right.But what shall we do ? I don't feel like sitting at home ."), ("I guess you are right.But what shall we do ? I don't feel like sitting at home .", 'I suggest a walk over to the gym where we can play singsong and meet some of our friends .'), ('I suggest a walk over to the gym where we can play singsong and meet some of our friends .', "That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them ."), ("That's a

## Build Vocab

In [None]:
SPECIAL_TOKENS = ['<pad>', '<sos>', '<eos>', '<unk>']

def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if not token.isdigit()]
    return tokens 


def build_vocab (pairs,min_frequency = 5):
    counter  = Counter()
    for input, resp in pairs:
        counter.update(tokenize(input))
        counter.update(tokenize(resp))
    vocab_words = [word for word, freq in counter.items() if freq >= min_frequency]
    vocab = SPECIAL_TOKENS + sorted(vocab_words)
    
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

In [7]:
# SPECIAL_TOKENS = ['<pad>', '<sos>', '<eos>', '<unk>']
# def preprocess_text(text):
#     text = text.lower()
#     text = re.sub(r"[^a-z0-9\s]", "", text)
#     tokens = nltk.word_tokenize(text)
#     return tokens 


# def build_vocab(text):
#     token = preprocess_text(text)
#     vocab = SPECIAL_TOKENS + sorted(token)
#     word2idx = {word: idx for idx, word in enumerate(vocab)}
#     idx2word = {idx: word for word, idx in word2idx.items()}
#     return word2idx, idx2word

In [None]:
word2idx, idx2word = build_vocab(train_pairs, min_frequency=5)
print(f"Vocab size: {len(word2idx)}")

Vocab size: 21514


In [54]:
print(word2idx)



## Load Fasttext 

In [9]:
fasttext_model = KeyedVectors.load_word2vec_format('fasttext file/cc.en.300.vec', binary=False)
# https://fasttext.cc/docs/en/crawl-vectors.html choose English choose text .vec file

In [10]:
embedding_dim = 300
embedding_matrix = np.zeros((len(word2idx),embedding_dim))

for words,idx in word2idx.items():
    if words in fasttext_model:
        embedding_matrix[idx] = fasttext_model[words]
    else:
        embedding_matrix[idx] = np.random.normal(scale= 0.6, size = (embedding_dim) )

fasttext_embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)

## Encoding

In [11]:
def encoded_sentences(sentence, word2idx):
    tokens = tokenize(sentence)
    return [word2idx.get(token, word2idx['<unk>']) for token in tokens]

def encode_pairs(pairs, word2idx):
    encoded = []
    for input_text, resp_text in pairs:
        inp_ids = encoded_sentences(input_text, word2idx)
        res_ids = [word2idx['<sos>']] + encoded_sentences(resp_text, word2idx) + [word2idx['<eos>']]
        encoded.append((inp_ids, res_ids))
    return encoded

def decode_ids(ids, idx2word):
    return [idx2word.get(i, '<unk>') for i in ids]

In [12]:
encoded_train = encode_pairs(train_pairs, word2idx)
encoded_val   = encode_pairs(val_pairs, word2idx)
encoded_test  = encode_pairs(test_pairs, word2idx)

print("Input:", encoded_train[0][0])
print("Response:", encoded_train[0][1])
print("Input tokens:", decode_ids(encoded_train[0][0], idx2word))
print("Response tokens:", decode_ids(encoded_train[0][1], idx2word))


Input: [16368, 10321, 9403, 929, 8342, 7770, 899, 7400, 2361, 1201, 5738]
Response: [1, 21374, 10627, 18938, 10101, 18825, 3188, 10101, 15270, 12919, 8357, 7770, 13428, 7549, 2]
Input tokens: ['say', 'jim', 'how', 'about', 'going', 'for', 'a', 'few', 'beers', 'after', 'dinner']
Response tokens: ['<sos>', 'you', 'know', 'that', 'is', 'tempting', 'but', 'is', 'really', 'not', 'good', 'for', 'our', 'fitness', '<eos>']


##Custom Dataset

In [13]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class ChatDataset(Dataset):
    def __init__(self, pairs):
        """
        pairs: list of (input_ids, response_ids), where
        - input_ids: tokenized input sentence
        - response_ids: [<sos>] + tokenized response + [<eos>]
        """
        # Filter out too-short responses (less than <sos> + word + <eos>)
        self.pairs = [pair for pair in pairs if len(pair[1]) >= 3]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]  # returns (input_ids, response_ids)


def collate_fn(batch):
    input_seqs, target_seqs = zip(*batch)

    # Convert to tensors
    input_seqs = [torch.tensor(seq, dtype=torch.long) for seq in input_seqs]
    tgt_inputs = [torch.tensor(seq[:-1], dtype=torch.long) for seq in target_seqs]  # decoder input (no <eos>)
    tgt_outputs = [torch.tensor(seq[1:], dtype=torch.long) for seq in target_seqs]  # decoder target (no <sos>)

    # Pad sequences
    input_padded = pad_sequence(input_seqs, batch_first=True, padding_value=word2idx['<pad>'])
    tgt_input_padded = pad_sequence(tgt_inputs, batch_first=True, padding_value=word2idx['<pad>'])
    tgt_output_padded = pad_sequence(tgt_outputs, batch_first=True, padding_value=word2idx['<pad>'])

    return input_padded, tgt_input_padded, tgt_output_padded


In [14]:
from torch.utils.data import DataLoader

train_dataset = ChatDataset(encoded_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Inspect one batch
for src, tgt_inp, tgt_out in train_loader:
    print("Encoder input:", src.shape)       # [batch_size, src_seq_len]
    print("Decoder input:", tgt_inp.shape)   # [batch_size, tgt_seq_len]
    print("Decoder target:", tgt_out.shape)  # [batch_size, tgt_seq_len]
    break


Encoder input: torch.Size([32, 42])
Decoder input: torch.Size([32, 41])
Decoder target: torch.Size([32, 41])


In [15]:
batch_size = 16

train_dataset = ChatDataset(encoded_train)
val_dataset   = ChatDataset(encoded_val)
test_dataset  = ChatDataset(encoded_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

## Model Building

In [16]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)


In [19]:
class TransformerChatbot(nn.Module):
    def __init__(self, fasttext_embeddings ,vocab_size,pad_idx ,emb_dim = 300, nhead = 6, num_layers = 3, dim_ff = 512, dropout =0.2):
        super().__init__()
        self.embedding_layer = nn.Embedding.from_pretrained(fasttext_embeddings, freeze=False)
        self.pos_encoder = PositionalEncoding(emb_dim)

        self.transformer = nn.Transformer(
            d_model=emb_dim,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(emb_dim, vocab_size)
        self.pad_idx = pad_idx

    def forward(self, src, tgt):
        # Padding masks
        src_key_padding_mask = (src == self.pad_idx)
        tgt_key_padding_mask = (tgt == self.pad_idx)

        # Causal mask for decoder
        tgt_seq_len = tgt.size(1)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_seq_len).to(src.device)

        # Embedding + Positional Encoding
        src_emb = self.pos_encoder(self.embedding_layer(src))
        tgt_emb = self.pos_encoder(self.embedding_layer(tgt))

        # Transformer
        output = self.transformer(
            src=src_emb,
            tgt=tgt_emb,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            tgt_mask=tgt_mask
        )

        return self.fc_out(output)  # shape [batch, tgt_len, vocab_size]


## Training Loop

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model = TransformerChatbot(
    vocab_size=len(word2idx),
    emb_dim=300,
    nhead=4,
    num_layers=4,
    dim_ff=256,
    dropout=0.1,
    pad_idx=word2idx['<pad>'],
    fasttext_embeddings=fasttext_embeddings
).to(device)


In [21]:
model

TransformerChatbot(
  (embedding_layer): Embedding(21514, 300)
  (pos_encoder): PositionalEncoding()
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
          )
          (linear1): Linear(in_features=300, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=256, out_features=300, bias=True)
          (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
 

In [22]:
pad_idx = word2idx['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(model.parameters(),lr = 3e-4)

In [23]:
def train (model, loader, optimizer, criterion):
    model.train()
    total_loss = 0

    for src, tgt_inp, tgt_out in loader:
        src, tgt_inp, tgt_out = src.to(device), tgt_inp.to(device), tgt_out.to(device)

        optimizer.zero_grad()
        output = model(src, tgt_inp)  # [B, T, V]
        loss = criterion(output.view(-1, output.size(-1)), tgt_out.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)

@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    for src, tgt_inp, tgt_out in loader:
        src, tgt_inp, tgt_out = src.to(device), tgt_inp.to(device), tgt_out.to(device)
        output = model(src, tgt_inp)
        loss = criterion(output.view(-1, output.size(-1)), tgt_out.view(-1))
        total_loss += loss.item()
    return total_loss / len(loader)

@torch.no_grad()
def test(model, loader, criterion):
    model.eval()
    total_loss = 0
    for src, tgt_inp, tgt_out in loader:
        src, tgt_inp, tgt_out = src.to(device), tgt_inp.to(device), tgt_out.to(device)
        output = model(src, tgt_inp)
        loss = criterion(output.view(-1, output.size(-1)), tgt_out.view(-1))
        total_loss += loss.item()
    return total_loss / len(loader)



In [24]:
epochs = 1

for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")




: 

In [None]:
test_loss = test(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}")


In [None]:
def generate_response(model, input_sentence, word2idx, idx2word, max_len=50):
    model.eval()

    # 1. Tokenize and numericalize input
    tokens = input_sentence.strip().split()  # Or use your tokenizer
    input_ids = [word2idx.get(tok, word2idx['<unk>']) for tok in tokens]
    src = torch.tensor([input_ids], dtype=torch.long).to(device)

    # 2. Start with <sos> token for decoder input
    generated_ids = [word2idx['<sos>']]
    for _ in range(max_len):
        tgt = torch.tensor([generated_ids], dtype=torch.long).to(device)

        # Forward pass
        with torch.no_grad():
            output = model(src, tgt)  # [1, T, V]
            next_token_logits = output[0, -1, :]  # last token's logits
            next_token_id = torch.argmax(next_token_logits).item()

        # Stop if <eos>
        if next_token_id == word2idx['<eos>']:
            break

        generated_ids.append(next_token_id)

    # 3. Decode token ids to words
    generated_words = [idx2word[idx] for idx in generated_ids[1:]]  # remove <sos>
    return ' '.join(generated_words)


In [None]:
while True:
    user_input = input("Input: ")
    if user_input.lower() in ['exit', 'quit']:
        break
    response = generate_response(model, user_input, word2idx, idx2word)
    print("Response:", response)


In [None]:
torch.save(model.state_dict(), 'transformer_chatbot.pt')