In [None]:
# Paths of the two corpus files read below: the conversation index (lists of line ids)
# and the utterance file (line id -> text).
# NOTE(review): the /content/ prefix looks like a Colab upload directory — adjust when running elsewhere.
corpus_movie_conv = r'/content/movie_conversations.txt'
corpus_movie_lines = r'/content/movie_lines.txt'

In [None]:
!pip install keras-nlp

In [None]:
from keras_nlp.layers import TokenAndPositionEmbedding

In [None]:
import tensorflow

tensorflow.__version__

'2.17.0'

In [None]:
!python --version

Python 3.10.12


In [None]:
# Read the conversation index: one record per line, fields separated by ' +++$+++ '.
# The explicit encoding makes the read independent of the platform default and matches
# the encoding used for the utterance file.
with open(corpus_movie_conv, 'r', encoding='latin-1') as f:
    conversation = f.readlines()

In [None]:
# Read the utterance file decoded as latin-1; every element of `dialogues` is one raw
# record line (trailing newline included).
with open(corpus_movie_lines, 'r',encoding='latin-1') as c:
    dialogues = c.readlines()

In [None]:
conversation[0].split(' +++$+++ ')

['u0', 'u2', 'm0', "['L194', 'L195', 'L196', 'L197']\n"]

In [None]:
dialogues[0].split(' +++$+++ ')[-1]

'They do not!\n'

In [None]:
# Map every line id (first field of a record) to its raw utterance text (last field,
# trailing newline still attached).
lines_dict = {
  fields[0]: fields[-1]
  for fields in (record.split(' +++$+++ ') for record in dialogues)
}

In [None]:
lines_dict

In [None]:
def remove_punctuations(text):
  '''
  Return `text` lower-cased with punctuation characters removed.

  The removed set is all ASCII punctuation except + = ` and | .
  Whitespace and every other character are passed through unchanged.
  '''
  # The backslash is spelled as an explicit escape: a bare backslash-comma would be an
  # invalid escape sequence, which newer Python versions warn about at compile time.
  punctuations = '!()-[]{};:\'"\\,<>./?@#$%^&*_~'
  # join() builds the result in one pass instead of re-allocating the string per character.
  return ''.join(char for char in text if char not in punctuations).lower()

In [None]:
remove_punctuations('kdkwmoo@#$')

'kdkwmoo'

In [None]:
import ast

maxlen = 30      # maximum number of words kept per utterance

pairs = []

for conv in conversation:
  # The last field is the text of a Python list of line ids, e.g. "['L194', 'L195']\n".
  # literal_eval parses that literal without executing code that comes from the file.
  line_ids = ast.literal_eval(conv.split(' +++$+++ ')[-1].strip())

  # Each pair of consecutive utterances becomes one [question_words, reply_words] sample.
  for question_id,reply_id in zip(line_ids,line_ids[1:]):
    question_words = remove_punctuations(lines_dict[question_id]).split()[:maxlen]
    reply_words = remove_punctuations(lines_dict[reply_id]).split()[:maxlen]
    pairs.append([question_words,reply_words])

In [None]:
pairs[0]

In [None]:
len(pairs)

221616

*Word 2 Index mapping*

In [None]:
from collections import Counter

In [None]:
# help(Counter)

In [None]:
# Word frequencies over the question side and the reply side of every pair.
counter = Counter(
  word
  for pair in pairs
  for side in (pair[0],pair[1])
  for word in side
)

In [None]:
# Vocabulary: keep only the words that occur more than 5 times.
words = [word for word,occurrences in counter.items() if occurrences > 5]

In [None]:
words

In [None]:
# Vocabulary words are numbered from 1 upwards, in the order they appear in `words`.
word2indx = {word: position for position,word in enumerate(words,start=1)}

In [None]:
# The special tokens take the ids right after the vocabulary words; padding is id 0.
# setdefault keeps the ids stable when this cell is executed more than once: a plain
# assignment would recompute them from the already-extended dictionary, giving all three
# tokens the same id, one that lies beyond the vocabulary size.
word2indx.setdefault('<UNKW>', len(word2indx) + 1)
word2indx.setdefault('<start>', len(word2indx) + 1)
word2indx.setdefault('<end>', len(word2indx) + 1)
word2indx.setdefault('<pad>', 0)

*encoding and padding*

In [None]:
def encode_questions(words,word2indx,max_len=None):
  '''
  Turn a tokenised question into a fixed-length list of word ids.

  words     --> list of word strings
  word2indx --> word to id mapping; must contain '<UNKW>' and '<pad>'
  max_len   --> length of the result; defaults to the notebook-level `maxlen`

  Words missing from the mapping become '<UNKW>'. Short inputs are right-padded with
  '<pad>' and long inputs are cut, so the result always holds exactly max_len ids.
  '''
  if max_len is None:
    max_len = maxlen

  unknown = word2indx['<UNKW>']
  encoded = [word2indx.get(word,unknown) for word in words[:max_len]]
  # The pad count can never be negative here; taking abs() of the difference would
  # instead append padding to an input that is already too long.
  encoded += [word2indx['<pad>']]*(max_len-len(encoded))

  return encoded

In [None]:
def encode_reply(words,word2indx,max_len=None):
  '''
  Turn a tokenised reply into a fixed-length list of word ids wrapped in '<start>' / '<end>'.

  words     --> list of word strings
  word2indx --> word to id mapping; must contain '<UNKW>', '<start>', '<end>' and '<pad>'
  max_len   --> maximum number of reply words; defaults to the notebook-level `maxlen`

  Layout of the result: <start>, up to max_len word ids, <end>, then '<pad>' ids.
  The result always holds exactly max_len + 2 ids.
  '''
  if max_len is None:
    max_len = maxlen

  unknown = word2indx['<UNKW>']
  body = [word2indx.get(word,unknown) for word in words[:max_len]]
  # Over-long input is cut above, so the pad count is never negative and no abs() is needed.
  padding = [word2indx['<pad>']]*(max_len-len(body))

  return [word2indx['<start>']] + body + [word2indx['<end>']] + padding

In [None]:
encode_reply(pairs[0][1],word2indx)

In [None]:
# Encode every [question_words, reply_words] pair into two fixed-length id lists.
encoded_pairs = [
  [encode_questions(pair[0],word2indx), encode_reply(pair[1],word2indx)]
  for pair in pairs
]

In [None]:
import torch

In [None]:
from torch.utils.data import Dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
  '''
  Map-style dataset over encoded [question_ids, reply_ids] pairs.

  pairs --> sequence of [question_ids, reply_ids]; defaults to the notebook-level
            `encoded_pairs`, so a bare `Dataset()` keeps working.

  The base class is written out in full because this class reuses the name `Dataset`:
  inheriting from the bare name would make the class its own parent as soon as the cell
  is executed a second time.
  '''
  def __init__(self,pairs=None):
    self.pairs = encoded_pairs if pairs is None else pairs
    # The size is taken from the data actually served, not from another global list.
    self.dataset_size = len(self.pairs)

  def __getitem__(self,index):
    '''Return (question, reply) as two LongTensors.'''
    sample = self.pairs[index]
    question = torch.LongTensor(sample[0])
    reply = torch.LongTensor(sample[1])

    return question,reply

  def __len__(self):
    return self.dataset_size

***DataLoader***

*DataLoader automatically splits the dataset into smaller batches of data, which can be processed in parallel*

*DataLoader can load data in parallel using multiple worker subprocesses (specified by the num_workers parameter). This speeds up the data loading process, especially when working with large datasets.*

In [None]:
# Shuffled batches of 32 (question, reply) tensor pairs, loaded by 2 workers.
# NOTE(review): pin_memory=True only helps host-to-GPU copies, and the device line further
# down is commented out — confirm whether it is still wanted.
data_loader = torch.utils.data.DataLoader(Dataset(),batch_size=32,shuffle=True,num_workers=2,pin_memory=True)

*Masking*

In [None]:
torch.triu(torch.ones(4,4)).transpose(0,1).unsqueeze(0).shape

torch.Size([1, 4, 4])

In [None]:
def masking(question,reply_input,reply_target):
  '''
  Build the three boolean masks used during training (True = real token, id 0 = padding).

  question      --> (batch_size,question_len) word ids
  reply_input   --> (batch_size,reply_len) word ids fed to the decoder
  reply_target  --> (batch_size,reply_len) word ids the decoder should predict

  returns
    question_mask     --> (batch_size,1,1,question_len)
    reply_input_mask  --> (batch_size,1,reply_len,reply_len); padding mask AND causal mask
    reply_target_mask --> (batch_size,reply_len)
  '''
  # (batch_size,num_words) --> (batch_size,1,1,num_words)
  question_mask = (question!=0).unsqueeze(1).unsqueeze(2)

  # Causal ("subsequent") mask: row i allows positions 0..i only, i.e. a LOWER-triangular matrix
  # [1,0,0,0]
  # [1,1,0,0]
  # [1,1,1,0]
  # [1,1,1,1]
  reply_len = reply_input.size(-1)
  causal_mask = torch.ones(reply_len,reply_len).tril().type(dtype=torch.uint8).unsqueeze(0)

  # (batch_size,1,reply_len) & (1,reply_len,reply_len) --> (batch_size,reply_len,reply_len)
  padding_mask = (reply_input!=0).unsqueeze(1)
  reply_input_mask = padding_mask & causal_mask.type_as(padding_mask.data)
  reply_input_mask = reply_input_mask.unsqueeze(1)

  reply_target_mask = reply_target!=0

  return question_mask,reply_input_mask,reply_target_mask

*Embeddings*

In [None]:
from torch import nn
import math

In [None]:
input_ = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])

nn.Embedding(12, 4)(input_)

tensor([[[-1.3097,  1.0914,  0.7815,  0.9753],
         [ 0.4986,  1.5327,  1.3669,  0.3893],
         [ 0.3557,  1.0900, -0.2915,  0.3765],
         [-2.4833, -1.4254,  0.2277,  1.2642]],

        [[ 0.3557,  1.0900, -0.2915,  0.3765],
         [ 0.2111,  0.1213, -0.8335,  1.9618],
         [ 0.4986,  1.5327,  1.3669,  0.3893],
         [-0.4673,  0.2228, -0.4245,  1.7784]]], grad_fn=<EmbeddingBackward0>)

In [None]:
m = torch.randn(2, 4)

nn.Linear(4,5)(m)

tensor([[-0.2552,  0.7170, -1.2658, -1.4811, -1.0806],
        [-0.4875, -0.3928, -0.4954,  0.4320,  0.6854]],
       grad_fn=<AddmmBackward0>)

In [None]:
# help(nn.Module)

In [None]:
class Embeddings(nn.Module):
  '''
  Word embeddings scaled by sqrt(dim_model), plus fixed sinusoidal positional encodings,
  followed by dropout.

  vocab_size --> number of rows in the embedding table
  dim_model  --> width of every embedding vector
  max_len    --> longest sequence the positional table covers
  '''
  def __init__(self,vocab_size,dim_model,max_len=50):
    super(Embeddings,self).__init__()
    self.dim_model = dim_model
    self.max_len = max_len
    self.embeddings = nn.Embedding(vocab_size,dim_model)
    self.dropout = nn.Dropout(0.1)
    # As a buffer the table follows the module through .to(device)/.cuda().
    # persistent=False keeps it out of the state_dict, so saved keys stay what they were
    # when the table was a plain attribute.
    self.register_buffer('pos_encoding',self.positional_encoding(max_len,dim_model),persistent=False)

  def positional_encoding(self,max_len,dim_model):
    '''
    Return the sinusoidal table, shape (1,max_len,dim_model):

      PE[pos,2k]   = sin(pos / 10000^(2k/dim_model))
      PE[pos,2k+1] = cos(pos / 10000^(2k/dim_model))

    Both members of a (sin, cos) pair share one frequency.
    '''
    positions = torch.arange(max_len,dtype=torch.float32).unsqueeze(1)        # (max_len,1)
    even_dims = torch.arange(0,dim_model,2,dtype=torch.float32)               # the values 2k
    inv_freq = torch.exp(even_dims*(-math.log(10000.0)/dim_model))            # 10000^(-2k/dim_model)
    angles = positions*inv_freq                                               # (max_len,ceil(dim_model/2))

    pos_encoding = torch.zeros(max_len,dim_model)
    pos_encoding[:,0::2] = torch.sin(angles)
    # an odd dim_model has one cosine column fewer than sine columns
    pos_encoding[:,1::2] = torch.cos(angles[:,:dim_model//2])

    return pos_encoding.unsqueeze(0)            # adding dimension for batch size

  def forward(self,encoded_word):
    '''
    encoded_word --> (batch_size,num_words) word ids
    returns      --> (batch_size,num_words,dim_model)

    Raises ValueError when num_words exceeds max_len.
    '''
    num_words = encoded_word.shape[1]
    if num_words > self.max_len:
      raise ValueError(f'sequence length {num_words} exceeds max_len {self.max_len}')

    word_embeddings = self.embeddings(encoded_word)*math.sqrt(self.dim_model)
    word_embeddings = word_embeddings + self.pos_encoding[:,:num_words]

    return self.dropout(word_embeddings)

*Multihead Attention*

In [None]:
class MultiHeadAttention(nn.Module):
  '''
  Scaled dot-product attention computed in `heads` parallel sub-spaces of width
  dim_model//heads, then merged back through a final linear layer.
  '''
  def __init__(self,dim_model,heads):
    super(MultiHeadAttention,self).__init__()
    assert dim_model%heads == 0
    self.dim_model = dim_model
    self.heads = heads
    self.dim_head = dim_model//heads
    self.query = nn.Linear(dim_model,dim_model)
    self.key = nn.Linear(dim_model,dim_model)
    self.value = nn.Linear(dim_model,dim_model)
    self.concat = nn.Linear(dim_model,dim_model)               # mixes the outputs of all heads
    self.dropout = nn.Dropout(0.1)

  def _split_heads(self,projected,batch_size):
    # (batch_size,len,dim_model) --> (batch_size,len,heads,dim_head) --> (batch_size,heads,len,dim_head)
    return projected.view(batch_size,-1,self.heads,self.dim_head).transpose(1,2)

  def forward(self,Q,K,V,mask):
    '''
    Q,K,V --> (batch_size,len,dim_model); K and V share their length
    mask  --> broadcastable to (batch_size,heads,query_len,key_len); 0 marks positions to ignore
    returns (batch_size,query_len,dim_model)
    '''
    batch_size = Q.shape[0]

    queries = self._split_heads(self.query(Q),batch_size)
    keys = self._split_heads(self.key(K),batch_size)
    values = self._split_heads(self.value(V),batch_size)

    # softmax(Q.KT / sqrt(dim_head)) --> (batch_size,heads,query_len,key_len)
    attention = torch.matmul(queries,keys.transpose(-2,-1))/math.sqrt(self.dim_head)
    # masked positions get a large negative score, so softmax gives them (almost) no weight
    attention = attention.masked_fill(mask==0,-1e9)
    attention = self.dropout(torch.softmax(attention,dim=-1))

    # weighted sum of the values --> (batch_size,heads,query_len,dim_head)
    context = torch.matmul(attention,values)

    # (batch_size,heads,query_len,dim_head) --> (batch_size,query_len,heads*dim_head)
    context = context.transpose(1,2).contiguous().view(batch_size,-1,self.heads*self.dim_head)

    return self.concat(context)

*Feed Forward Layer*

In [None]:
class FeedForward(nn.Module):
  '''
  Position-wise feed-forward network: Linear(dim_model,middle_dim) -> ReLU -> dropout ->
  Linear(middle_dim,dim_model).
  '''
  def __init__(self,dim_model,middle_dim=2048):
    super(FeedForward,self).__init__()
    self.dim_model = dim_model
    self.layer1 = nn.Linear(dim_model,middle_dim)
    self.layer2 = nn.Linear(middle_dim,dim_model)
    self.dropout = nn.Dropout(0.1)

  def forward(self,input_):
    '''input_ --> (...,dim_model); the result has the same shape.'''
    hidden = self.dropout(self.layer1(input_).relu())

    return self.layer2(hidden)

*Encoder*

In [None]:
class Encoder(nn.Module):
  '''
  One encoder layer: self-attention, then a position-wise feed-forward network, each
  wrapped in a residual connection followed by layer normalisation.

  vocab_size and max_len are accepted to keep the constructor signature and are not used.
  '''
  def __init__(self,vocab_size,max_len,dim_model,heads):
    super(Encoder,self).__init__()
    self.dim_model = dim_model
    self.self_attention = MultiHeadAttention(self.dim_model,heads)
    self.feed_forward = FeedForward(self.dim_model)
    # The norms are built once here so their weights are registered with the module, reach the
    # optimizer and get trained. Building them inside forward() would hand every call a
    # freshly initialised, untrained layer.
    self.attention_norm = nn.LayerNorm(self.dim_model)
    self.feed_forward_norm = nn.LayerNorm(self.dim_model)
    self.dropout = nn.Dropout(0.1)

  def forward(self,embeddings,mask):
    '''
    embeddings --> (batch_size,num_words,dim_model)
    mask       --> padding mask passed on to the attention layer
    returns    --> (batch_size,num_words,dim_model)
    '''
    encoded = self.self_attention(embeddings,embeddings,embeddings,mask)
    encoded = self.attention_norm(encoded+embeddings)

    # the dropped-out tensor is what goes into the feed-forward network
    feedforward = self.feed_forward(self.dropout(encoded))

    final_encoded = self.feed_forward_norm(encoded+feedforward)

    return final_encoded

*Decoder*

In [None]:
class Decoder(nn.Module):
  '''
  One decoder layer: masked self-attention, attention over the encoder output, then a
  position-wise feed-forward network, each wrapped in a residual connection followed by
  layer normalisation.

  vocab_size and max_len are accepted to keep the constructor signature and are not used.
  '''
  def __init__(self,vocab_size,max_len,dim_model,heads):
    super(Decoder,self).__init__()
    self.dim_model = dim_model
    self.self_attention = MultiHeadAttention(self.dim_model,heads)
    self.source_attention = MultiHeadAttention(self.dim_model,heads)
    self.feed_forward = FeedForward(self.dim_model)
    # The norms are built once here so their weights are registered with the module, reach the
    # optimizer and get trained. Building them inside forward() would hand every call a
    # freshly initialised, untrained layer.
    self.self_attention_norm = nn.LayerNorm(self.dim_model)
    self.source_attention_norm = nn.LayerNorm(self.dim_model)
    self.feed_forward_norm = nn.LayerNorm(self.dim_model)
    self.dropout = nn.Dropout(0.1)

  def forward(self,encoded,embeddings,source_mask,target_mask):
    '''
    encoded     --> encoder output, (batch_size,question_len,dim_model)
    embeddings  --> reply embeddings, (batch_size,reply_len,dim_model)
    source_mask --> mask applied when attending over `encoded`
    target_mask --> mask applied in the self-attention over `embeddings`
    returns     --> (batch_size,reply_len,dim_model)
    '''
    decoded_1 = self.self_attention(embeddings,embeddings,embeddings,target_mask)
    decoded_1 = self.self_attention_norm(decoded_1+embeddings)

    # queries come from the reply, keys and values from the encoded question
    decoded = self.source_attention(decoded_1,encoded,encoded,source_mask)
    decoded = self.source_attention_norm(decoded+decoded_1)

    # the dropped-out tensor is what goes into the feed-forward network
    feedforward = self.feed_forward(self.dropout(decoded))

    final_decoded = self.feed_forward_norm(feedforward+decoded)

    return final_decoded

*Transformer*

In [None]:
class Transformer(nn.Module):
  '''
  Encoder-decoder transformer over one shared vocabulary.

  dim_model  --> model width
  max_len    --> longest sequence supported by the positional encodings
  heads      --> number of attention heads per layer
  num_layers --> number of encoder layers and of decoder layers
  word2index --> word to id mapping; its length is the vocabulary size
  '''
  def __init__(self,dim_model,max_len,heads,num_layers,word2index):
    super(Transformer,self).__init__()
    self.dim_model = dim_model
    self.vocab_size = len(word2index)
    # max_len is forwarded so the positional table really covers the requested length
    self.embeddings = Embeddings(self.vocab_size,self.dim_model,max_len)
    self.encoder = nn.ModuleList([Encoder(self.vocab_size,max_len,self.dim_model,heads) for _ in range(num_layers)])
    self.decoder = nn.ModuleList([Decoder(self.vocab_size,max_len,self.dim_model,heads) for _ in range(num_layers)])
    self.logits = nn.Linear(self.dim_model,self.vocab_size)

  def encode(self,question,question_mask):
    '''Embed the question ids and pass them through every encoder layer in turn.'''
    encode_embedding = self.embeddings(question)
    for layer in self.encoder:
      # each layer consumes the output of the previous one, not the raw embeddings
      encode_embedding = layer(encode_embedding,question_mask)

    return encode_embedding

  def decode(self,reply_target,encode_embedding,question_mask,reply_target_mask):
    '''
    Embed the reply ids given so far and pass them through every decoder layer in turn.
    Returns the decoder states, (batch_size,reply_len,dim_model); no vocabulary projection.
    '''
    target_embedding = self.embeddings(reply_target)
    for layer in self.decoder:
      target_embedding = layer(encode_embedding,target_embedding,question_mask,reply_target_mask)

    return target_embedding

  def forward(self,question,question_mask,reply_target,reply_target_mask):
    '''
    Returns log-probabilities over the vocabulary, (batch_size,reply_len,vocab_size).
    nn.KLDivLoss expects its input in log space, so the log-softmax is applied before returning.
    '''
    encoded = self.encode(question,question_mask)
    decoded = self.decode(reply_target,encoded,question_mask,reply_target_mask)
    logits = self.logits(decoded)
    output = nn.functional.log_softmax(logits,dim=2)

    return output

*Adam Warmup*

In [None]:
class AdamWarmup:
  '''
  Learning-rate schedule wrapped around an optimizer:

    lr = dim_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

  The rate rises linearly for warmup_steps steps, then decays with the inverse square root
  of the step number.

  dim_model    --> model width used to scale the rate
  warmup_steps --> number of steps of linear warm-up
  optimizer    --> wrapped optimizer; every entry of its param_groups gets 'lr' overwritten
  '''
  def __init__(self,dim_model,warmup_steps,optimizer):
    self.dim_model = dim_model
    self.warmup_steps = warmup_steps
    self.current_step = 0
    self.learning_rate = 0
    self.optimizer = optimizer

  def get_rate(self):
    '''Learning rate for the current step; 0.0 before the first step.'''
    if self.current_step == 0:
      # the limit of the formula at step 0; evaluating it directly would divide by zero
      return 0.0

    decay = 1/math.sqrt(self.current_step)
    warmup = self.current_step*(1/math.sqrt(self.warmup_steps**3))

    return (1/math.sqrt(self.dim_model))*min(decay,warmup)

  def zero_grad(self):
    '''Clear the gradients held by the wrapped optimizer.'''
    self.optimizer.zero_grad()

  def step(self):
    '''Advance the schedule by one step, write the new rate, then step the optimizer.'''
    self.current_step += 1
    rate = self.get_rate()
    for p in self.optimizer.param_groups:
      p['lr'] = rate

    self.learning_rate = rate
    self.optimizer.step()

*KL Divergence Loss*

In [None]:
class KL_Loss(nn.Module):
  '''
  KL-divergence loss against label-smoothed targets, averaged over non-padding positions.

  size_of_vectors --> vocabulary size
  alpha           --> smoothing mass: the true word gets 1-alpha, the remaining alpha is
                      spread evenly over the other size_of_vectors-1 words
  '''
  def __init__(self,size_of_vectors,alpha):
    super(KL_Loss,self).__init__()
    # reduction='none' keeps one value per (position,word), so padding can be masked out
    # before anything is averaged
    self.loss = nn.KLDivLoss(reduction='none')
    self.size_of_vectors = size_of_vectors
    self.alpha = alpha

  def minimize_loss(self,predictions,target,target_mask):
    '''
    predictions --> (batch_size,max_words,vocab_size) log-probabilities or raw logits
    target --> (batch_size,max_words) word ids
    target_mask --> (batch_size,max_words), non-zero where the target is a real word

    returns a scalar tensor: mean per-word KL divergence over the unmasked positions
    (0 when every position is masked)
    '''
    predictions = predictions.reshape(-1,predictions.shape[-1])   # (batch_size*max_words,vocab_size)
    # log_softmax leaves log-probabilities unchanged and normalises raw logits
    predictions = torch.log_softmax(predictions,dim=-1)
    target = target.reshape(-1)                                   # (batch_size*max_words)
    target_mask = target_mask.reshape(-1).float()                 # (batch_size*max_words)

    # smoothed target distribution: alpha/(size-1) everywhere, 1-alpha at the true word
    labels = torch.full_like(predictions.detach(),self.alpha/(self.size_of_vectors-1))
    # scatter_ writes in place along dim 1; the index needs the same number of dims as labels
    labels.scatter_(1,target.unsqueeze(1),1-self.alpha)

    Loss = self.loss(predictions,labels).sum(dim=1)               # one value per position
    # clamp avoids 0/0 when a batch holds nothing but padding
    Loss = (Loss*target_mask).sum()/target_mask.sum().clamp(min=1)

    return Loss

  def forward(self,predictions,target,target_mask):
    '''Same as minimize_loss, so the module can also be called directly.'''
    return self.minimize_loss(predictions,target,target_mask)

*Initializing the transformer, training, and evaluating*

In [None]:
import torch

In [None]:
# Hyper-parameters of the model built below.
model_dimension = 512      # width of embeddings and of every layer
max_length = 50            # passed to Transformer as max_len
heads = 8                  # attention heads per layer
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_layers = 3             # number of encoder layers and of decoder layers

transformer = Transformer(dim_model=model_dimension,max_len=max_length,heads=heads,num_layers=num_layers,word2index=word2indx)
# transformer = transformer
# lr=0 is only a placeholder: AdamWarmup.step() overwrites the lr of every param group
# before each optimizer step.
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(dim_model=model_dimension,warmup_steps=4000,optimizer=adam_optimizer)
# vocabulary size and alpha=0.2
kl_loss = KL_Loss(len(word2indx),0.2)

In [None]:
from tqdm.notebook import tqdm

def train(data_loader,transformer,optimizer,loss,epochs):
  '''
  Run one pass over data_loader, updating transformer after every batch.

  data_loader --> yields (question, reply) LongTensor batches
  transformer --> model called as transformer(question, question_mask, reply_input, reply_input_mask)
  optimizer   --> the warm-up wrapper (an object with .optimizer and .step()). A bare torch
                  optimizer is still accepted: the update is then driven by the notebook-level
                  `transformer_optimizer`, which keeps existing calls working.
  loss        --> object exposing minimize_loss(predictions, target, target_mask)
  epochs      --> index of the epoch being run; only used in the progress line (the plural
                  name is kept so existing calls keep working)

  returns the sample-weighted mean loss of the pass, or None when the loader is empty
  '''
  scheduler = optimizer if hasattr(optimizer,'optimizer') else transformer_optimizer

  transformer.train()
  sum_loss = 0
  count = 0

  for i,(question,reply) in enumerate(data_loader):
    samples = question.shape[0]

    # decoder input: every position except the last one
    reply_input = reply[:,:-1]

    # prediction target: the same sequence shifted left by one (drops <start>)
    reply_target = reply[:,1:]

    # initializing mask
    question_mask,input_mask,target_mask = masking(question,reply_input,reply_target)

    # forward propagation
    output = transformer(question,question_mask,reply_input,input_mask)

    # calculate loss
    Loss = loss.minimize_loss(output,reply_target,target_mask)

    # Back propagation
    scheduler.optimizer.zero_grad()
    Loss.backward()
    scheduler.step()

    sum_loss += Loss.item()*samples                     # .item() gives value rather than tensor
    count += samples

    if i % 100 == 0:
      tqdm.write(f"Epoch [{epochs}][{i}/{len(data_loader)}] Loss: {sum_loss / count:.3f}")

  return sum_loss/count if count else None

In [None]:
def evaluate(transformer,question,question_mask,max_len,word2index):
  '''
  Greedily decode a reply to one encoded question.

  transformer   --> model exposing eval(), encode(), decode() and logits()
  question      --> (1,num_words) LongTensor of word ids
  question_mask --> padding mask for the question
  max_len       --> maximum number of words to generate
  word2index    --> word to id mapping; must contain '<start>', '<end>' and '<pad>'

  returns the generated words joined by single spaces ('' when '<end>' is predicted first)
  '''
  transformer.eval()

  # creating index to word mapping
  indx2word = {index:word for word,index in word2index.items()}

  start_token = word2index['<start>']
  end_token = word2index['<end>']
  words = torch.LongTensor([[start_token]])

  # inference only: no gradients are needed
  with torch.no_grad():
    encoded = transformer.encode(question,question_mask)

    for _ in range(max_len):
      size = words.shape[1]
      # lower-triangular mask: position i only sees positions 0..i
      word_mask = torch.tril(torch.ones(size,size)).type(dtype=torch.uint8)
      word_mask = word_mask.unsqueeze(0).unsqueeze(0)
      decoded = transformer.decode(words,encoded,question_mask,word_mask)

      # only the state of the last position is needed to pick the next word
      prediction = transformer.logits(decoded[:,-1])
      next_word = torch.max(prediction,dim=1)[1].item()

      if next_word == end_token:
        break

      # as more and more word are added words array increases -->(1,len(words))
      words = torch.cat([words,torch.LongTensor([[next_word]])],dim=1)

  # creating sentence -- done once, after decoding has finished
  special_tokens = {start_token,end_token,word2index['<pad>']}
  wordindex = [w for w in words.squeeze(0).tolist() if w not in special_tokens]

  sentence = " ".join(indx2word[w] for w in wordindex)

  return sentence

In [None]:
epochs = 1

for epoch in range(epochs):
  # train() is handed the warm-up wrapper: the bare Adam optimizer is created with lr=0 and
  # only learns through the rate the wrapper writes before each step.
  train(data_loader,transformer,transformer_optimizer,kl_loss,epoch)
  state = {'epoch':epoch,'transformer':transformer,'transformer_optimizer':transformer_optimizer}

  # NOTE(review): this pickles whole objects, so loading the checkpoint needs the same class
  # definitions in scope; saving transformer.state_dict() would be more portable.
  torch.save(state,'checkpoint.pth.tar')



Epoch [0][0/6926] Loss: nan


In [None]:
while True:
  question = input('ask me anything! : ')
  question = question.lower().strip()
  # the text is already lower-cased, so only lower-case commands need checking
  if question in ('quit','end'):
    break

  # same normalisation as the training data: punctuation removed, at most maxlen words
  question_words = remove_punctuations(question).split()[:maxlen]
  if not question_words:
    # nothing to encode; an empty tensor cannot be fed to the model
    continue

  encode_question = [word2indx.get(word,word2indx['<UNKW>']) for word in question_words]
  question = torch.LongTensor(encode_question).unsqueeze(0)
  question_mask = (question!=0).unsqueeze(1).unsqueeze(1)

  sentence = evaluate(transformer,question,question_mask,10,word2indx)
  print(sentence)