In [1]:
from collections import Counter
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.utils.data
import math
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
import re
import random
import os
from torch.utils.tensorboard.writer import SummaryWriter
import shutil

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# def encode_question(words, word_map):
#     enc_c = [word_map.get(word, word_map['<unk>']) for word in words]
#     return enc_c

In [3]:
# def encode_reply(words, word_map):
#     enc_c = [word_map['<start>']] + [word_map.get(word, word_map['<unk>']) for word in words] + \
#     [word_map['<end>']]
#     return enc_c

In [4]:
# def pad(words, word_map):
#     enc_c = words + [word_map['<pad>']] * (100 - len(words))
#     return enc_c

In [5]:
class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-token dataset over one long sequence of token ids.

    The file at ``path`` is read as text, surrounding double quotes are
    stripped, and the remainder is split on whitespace into integers.
    Sample ``i`` is the window ``pairs[i:i + bach]`` paired with the same
    window shifted one position to the right.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class rebinds the name ``Dataset``; inheriting from the bare name would
    make the class subclass its own previous definition when the cell is
    re-run.

    Args:
        path: Location of the whitespace-separated token id file.
        seq_len: Number of tokens per window (stored as ``self.bach``).

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """

    def __init__(self, path='clean_data/encoded_graph.json', seq_len=100):
        if seq_len <= 0:
            raise ValueError("seq_len must be a positive integer")

        with open(path, 'r') as file:
            data_str = file.read().strip('"')
        self.pairs = [int(x) for x in data_str.split()]
        # Attribute name kept for backward compatibility with existing code.
        self.bach = seq_len
        # Each sample needs one extra token for the shifted target; clamp at
        # zero so a corpus shorter than the window gives an empty dataset
        # instead of a negative length.
        self.dataset_size = max(0, len(self.pairs) - self.bach)

    def __getitem__(self, i):
        """Return ``(input_window, target_window)`` as two LongTensors."""
        # Raising IndexError lets plain iteration terminate and prevents
        # silently returning truncated windows past the end of the data.
        if not 0 <= i < self.dataset_size:
            raise IndexError("index {} out of range for {} samples".format(i, self.dataset_size))
        question = torch.LongTensor(self.pairs[i:i + self.bach])
        reply = torch.LongTensor(self.pairs[i + 1:i + self.bach + 1])
        return question, reply

    def __len__(self):
        return self.dataset_size

In [6]:
# Build the training dataset; the constructor reads the encoded token file from disk.
train_dataset = Dataset()

In [7]:
# Peek at one (input window, shifted target window) pair as a sanity check.
input, target = train_dataset[1]

In [8]:
# Shuffled mini-batches of 128 windows, collated into pinned host memory.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    pin_memory=True,
)

In [9]:
def create_masks(reply_input, reply_target, pad_id=0):
    """Build the attention mask for the input and the loss mask for the target.

    Args:
        reply_input: Integer tensor of token ids, shape (batch_size, max_words).
        reply_target: Integer tensor of token ids, shape (batch_size, max_words).
        pad_id: Token id treated as padding (defaults to 0).

    Returns:
        A tuple ``(reply_input_mask, reply_target_mask)``:
        * ``reply_input_mask`` -- bool, shape (batch_size, 1, max_words, max_words).
          Entry ``[b, 0, i, j]`` is True when position ``j`` is not padding and
          ``j <= i``, i.e. position ``i`` may attend to position ``j``.
        * ``reply_target_mask`` -- bool, shape (batch_size, max_words); True
          where the target token is not padding.
    """
    size = reply_input.size(-1)

    # Lower-triangular (causal) mask built directly as bool on the input's
    # device, so no CPU allocation or host-to-device copy happens per call.
    positions = torch.arange(size, device=reply_input.device)
    causal = positions.unsqueeze(0) <= positions.unsqueeze(1)   # (max_words, max_words)

    not_pad = (reply_input != pad_id).unsqueeze(1)              # (batch_size, 1, max_words)
    reply_input_mask = not_pad & causal.unsqueeze(0)            # (batch_size, max_words, max_words)
    reply_input_mask = reply_input_mask.unsqueeze(1)            # (batch_size, 1, max_words, max_words)
    reply_target_mask = reply_target != pad_id                  # (batch_size, max_words)

    return reply_input_mask, reply_target_mask

In [10]:
class Embeddings(nn.Module):
    """Token embeddings scaled by sqrt(d_model) plus fixed positional encodings.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, vocab_size, d_model, max_len = 1020):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(0.1)
        self.embed = nn.Embedding(vocab_size, d_model)
        # Registered as a buffer so it follows the module through .to()/.cuda()
        # instead of depending on a global `device` at construction time.
        # persistent=False keeps it out of state_dict, so state_dict keys are
        # the same as when `pe` was a plain attribute.
        self.register_buffer("pe", self.create_positinal_encoding(max_len, self.d_model), persistent=False)

    def create_positinal_encoding(self, max_len, d_model):
        """Return the positional-encoding table of shape (1, max_len, d_model).

        NOTE(review): the exponents used here are ``2*i/d_model`` for the sine
        columns and ``2*(i+1)/d_model`` for the cosine columns, where ``i`` is
        already the even column index. That differs from the usual sinusoidal
        formulation; the values are kept exactly as before so that weights
        trained against this table remain usable.
        """
        # Computed in float64 and cast at the end, mirroring the precision of
        # the scalar math.sin / math.cos evaluation this replaces.
        positions = torch.arange(max_len, dtype=torch.float64).unsqueeze(1)   # (max_len, 1)
        even_idx = torch.arange(0, d_model, 2, dtype=torch.float64)           # i = 0, 2, 4, ...
        sin_div = torch.pow(10000.0, (2 * even_idx) / d_model)
        cos_div = torch.pow(10000.0, (2 * (even_idx + 1)) / d_model)

        pe = torch.zeros(max_len, d_model, dtype=torch.float64)
        pe[:, 0::2] = torch.sin(positions / sin_div)
        # With an odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(positions / cos_div)[:, : d_model // 2]
        return pe.float().unsqueeze(0)   # include the batch dimension

    def forward(self, encoded_words):
        """Embed ``encoded_words`` (batch_size, seq_len) -> (batch_size, seq_len, d_model)."""
        embedding = self.embed(encoded_words) * math.sqrt(self.d_model)
        # pe is broadcast over the batch dimension.
        embedding = embedding + self.pe[:, :embedding.size(1)]
        return self.dropout(embedding)

In [11]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention followed by an output projection."""

    def __init__(self, heads, d_model):
        super().__init__()
        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """(batch_size, len, d_model) -> (batch_size, heads, len, d_k)."""
        batch_size = projected.shape[0]
        return projected.view(batch_size, -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask):
        """
        query, key, value: (batch_size, len, d_model)
        mask: broadcastable to (batch_size, heads, query_len, key_len);
              positions where mask == 0 are excluded from attention.
        Returns a tensor of shape (batch_size, query_len, d_model).
        """
        q = self._split_heads(self.query(query))
        k = self._split_heads(self.key(key))
        v = self._split_heads(self.value(value))

        # (batch_size, heads, query_len, key_len)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.dropout(F.softmax(scores, dim=-1))

        # (batch_size, heads, query_len, d_k)
        context = torch.matmul(weights, v)
        batch_size = context.shape[0]
        # Move heads next to d_k and merge them back into d_model.
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.heads * self.d_k)
        return self.concat(merged)

In [12]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, middle_dim = 2048):
        super().__init__()
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map (..., d_model) to (..., d_model) through the hidden layer."""
        hidden = self.fc1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

In [13]:
class EncoderLayer(nn.Module):
    """Self-attention block followed by a feed-forward block.

    Each sub-block is wrapped as ``layernorm(dropout(sublayer(x)) + x)``.
    The same LayerNorm instance is applied after both sub-blocks.
    """

    def __init__(self, d_model, heads):
        super().__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, mask):
        """Return the encoded sequence, same shape as ``embeddings``."""
        # Attention sub-block with residual connection.
        attended = self.self_multihead(embeddings, embeddings, embeddings, mask)
        attended = self.dropout(attended)
        normed_attention = self.layernorm(attended + embeddings)

        # Feed-forward sub-block with residual connection.
        transformed = self.feed_forward(normed_attention)
        transformed = self.dropout(transformed)
        return self.layernorm(transformed + normed_attention)

In [14]:
class Transformer(nn.Module):
    """Stack of EncoderLayer blocks with a projection to vocabulary log-probabilities.

    Args:
        d_model: Hidden width of the model.
        heads: Number of attention heads per layer.
        num_layers: Number of stacked layers.
        word_map: Vocabulary mapping; only its length is used here.
    """

    def __init__(self, d_model, heads, num_layers, word_map):
        super().__init__()

        self.d_model = d_model
        self.vocab_size = len(word_map)
        self.embed = Embeddings(self.vocab_size, d_model)
        layers = [EncoderLayer(d_model, heads) for _ in range(num_layers)]
        self.encoder = nn.ModuleList(layers)
        self.logit = nn.Linear(d_model, self.vocab_size)

    def encode(self, src_words, src_mask):
        """Embed the token ids and pass them through every layer in order."""
        hidden = self.embed(src_words)
        for block in self.encoder:
            hidden = block(hidden, src_mask)
        return hidden

    def forward(self, src_words, src_mask):
        """Return log-probabilities over the vocabulary for each position."""
        logits = self.logit(self.encode(src_words, src_mask))
        return F.log_softmax(logits, dim = 2)

In [15]:
class AdamWarmup:
    """Wrap an optimizer with a warm-up then inverse-square-root learning-rate schedule.

    The rate at step ``s`` is
    ``model_size**-0.5 * min(s**-0.5, s * warmup_steps**-1.5)``.

    Args:
        model_size: Model width used to scale the rate.
        warmup_steps: Number of steps over which the rate rises linearly.
        optimizer: Object exposing ``param_groups`` and ``step()``.
    """

    def __init__(self, model_size, warmup_steps, optimizer):

        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0

    def get_lr(self):
        """Return the learning rate for ``self.current_step``.

        Before the first ``step()`` the rate is 0.0 (matching the initial
        ``self.lr``) rather than raising ZeroDivisionError from ``0 ** -0.5``.
        """
        step = self.current_step
        if step <= 0:
            return 0.0
        scale = self.model_size ** (-0.5)
        decay = step ** (-0.5)
        if self.warmup_steps <= 0:
            # No warm-up phase: avoid 0 ** -1.5 and use the decay term only.
            return scale * decay
        return scale * min(decay, step * self.warmup_steps ** (-1.5))

    def step(self):
        """Advance one step: set the new rate on every param group, then step the optimizer."""
        # Increment the number of steps each time we call the step function
        self.current_step += 1
        lr = self.get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        # update the learning rate
        self.lr = lr
        self.optimizer.step()

In [16]:
class LossWithLS(nn.Module):
    """KL-divergence loss against label-smoothed targets, averaged over unmasked positions.

    Args:
        size: Vocabulary size (number of classes).
        smooth: Probability mass spread evenly over the ``size - 1`` wrong classes.
    """

    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        # reduction='none' is the current spelling of the deprecated
        # size_average=False, reduce=False combination.
        self.criterion = nn.KLDivLoss(reduction='none')
        self.confidence = 1.0 - smooth
        self.smooth = smooth
        self.size = size

    def forward(self, prediction, target, mask):
        """
        prediction of shape: (batch_size, max_words, vocab_size), log-probabilities
        target and mask of shape: (batch_size, max_words)

        Returns a scalar: the summed per-position loss divided by the number of
        unmasked positions (0 when every position is masked).
        """
        # reshape (instead of view) also accepts non-contiguous inputs.
        prediction = prediction.reshape(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        target = target.reshape(-1)                                # (batch_size * max_words)
        mask = mask.float().reshape(-1)                            # (batch_size * max_words)

        # The smoothed label distribution is a constant; build it without autograd.
        with torch.no_grad():
            labels = torch.full_like(prediction, self.smooth / (self.size - 1))
            labels.scatter_(1, target.unsqueeze(1), self.confidence)

        loss = self.criterion(prediction, labels)    # (batch_size * max_words, vocab_size)
        # Clamp the denominator so an all-masked batch yields 0 instead of NaN.
        loss = (loss.sum(1) * mask).sum() / mask.sum().clamp(min=1)
        return loss

In [17]:
# Model and training hyper-parameters.
d_model, heads, num_layers = 512, 8, 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 10

# Vocabulary mapping; its length sets the embedding and output sizes.
with open('clean_data/word_map.json', 'r') as j:
    word_map = json.load(j)

transformer = Transformer(
    d_model=d_model,
    heads=heads,
    num_layers=num_layers,
    word_map=word_map,
).to(device)

# The lr given to Adam is only a placeholder: AdamWarmup.step() overwrites
# the learning rate of every param group before each optimizer step.
adam_optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0005,
    betas=(0.9, 0.98),
    eps=1e-9,
)
transformer_optimizer = AdamWarmup(
    model_size=d_model,
    warmup_steps=4000,
    optimizer=adam_optimizer,
)
criterion = LossWithLS(len(word_map), 0.1)



In [18]:
# Number of mini-batches per epoch; train() uses it for the progress message
# and for the TensorBoard global-step offset.
num_iter = len(train_loader)
num_iter

20187

In [19]:
# Resume from a previously saved model object.
# NOTE(review): the checkpoint is a pickled nn.Module, and unpickling executes
# arbitrary code, so only load files you created yourself. weights_only=False
# is stated explicitly because a full-module pickle cannot be read in
# weights-only mode.
transformer = torch.load("model_save/checkpoint_5.pth.tar", map_location=device, weights_only=False)
transformer = transformer.to(device)

# The optimizer created earlier holds the parameters of the model object that
# was just replaced. Rebuild it (and its warm-up wrapper) so optimizer steps
# update the loaded model.
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)

In [20]:
def train(train_loader, transformer, criterion, epoch):
    """Run one training epoch and log the running mean loss to TensorBoard.

    Relies on module-level names: ``device``, ``epochs``, ``create_masks`` and
    ``transformer_optimizer``. The wrapped optimizer must have been built from
    the parameters of the ``transformer`` passed in.

    Args:
        train_loader: Iterable of ``(question, reply)`` LongTensor batches.
        transformer: Model called as ``transformer(question, mask)``.
        criterion: Loss called as ``criterion(output, reply, target_mask)``.
        epoch: Zero-based epoch index, used for the progress message and the
            TensorBoard global step.
    """
    log_path = "tensorboard"
    # Clear old logs only when a run starts. The global step below is offset
    # by the epoch index, so later epochs are meant to extend the same curve
    # rather than wipe it.
    if epoch == 0 and os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path, exist_ok=True)
    writer = SummaryWriter(log_path)
    transformer.train()

    total_batches = len(train_loader)
    # Running sum replaces np.mean over a growing list (quadratic over an epoch).
    running_loss = 0.0

    try:
        for i, (question, reply) in enumerate(train_loader):

            # Move to device
            question = question.to(device)
            reply = reply.to(device)

            # Causal/padding mask for the input, padding mask for the loss
            reply_input_mask, reply_target_mask = create_masks(question, reply)

            # Get the transformer outputs
            out = transformer(question, reply_input_mask)

            # Compute the loss
            loss = criterion(out, reply, reply_target_mask)
            transformer_optimizer.optimizer.zero_grad()
            # Backprop
            loss.backward()
            transformer_optimizer.step()

            loss_value = loss.item()
            running_loss += loss_value

            print("Epoch {}/{} | Iteration {}/{} | Loss value : {}".format(epoch + 1 , epochs , i , total_batches, loss_value))
            writer.add_scalar("Train/Loss" , running_loss / (i + 1), epoch*total_batches + i)
    finally:
        # Flush pending events and release the file handle even if the epoch
        # is interrupted.
        writer.close()

In [None]:
# Run the full schedule: one pass over train_loader per epoch.
# Per-epoch checkpointing below is currently disabled (commented out).
for epoch in range(epochs):
    train(train_loader, transformer, criterion, epoch)
    # state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
    # file_name = os.path.join('model_save', 'checkpoint_' + str(epoch) + '.pth.tar')
    # torch.save(state, file_name)

Epoch 1/10 | Iteration 0/20187 | Loss value : 1.0963610410690308
Epoch 1/10 | Iteration 1/20187 | Loss value : 1.0964769124984741
Epoch 1/10 | Iteration 2/20187 | Loss value : 1.137054204940796
Epoch 1/10 | Iteration 3/20187 | Loss value : 1.1022404432296753
Epoch 1/10 | Iteration 4/20187 | Loss value : 1.102308750152588
Epoch 1/10 | Iteration 5/20187 | Loss value : 1.106292724609375
Epoch 1/10 | Iteration 6/20187 | Loss value : 1.0935697555541992
Epoch 1/10 | Iteration 7/20187 | Loss value : 1.1028720140457153
Epoch 1/10 | Iteration 8/20187 | Loss value : 1.1042364835739136
Epoch 1/10 | Iteration 9/20187 | Loss value : 1.1409332752227783
Epoch 1/10 | Iteration 10/20187 | Loss value : 1.1462090015411377
Epoch 1/10 | Iteration 11/20187 | Loss value : 1.1002463102340698
Epoch 1/10 | Iteration 12/20187 | Loss value : 1.1356818675994873
Epoch 1/10 | Iteration 13/20187 | Loss value : 1.129370093345642
Epoch 1/10 | Iteration 14/20187 | Loss value : 1.134246826171875
Epoch 1/10 | Iteration 15

In [93]:
# NOTE(review): `state` is assembled but not written; only the model object is
# saved below, which is the format the checkpoint-loading cell reads back.
state = {'epoch': epoch, 'transformer': transformer, 'transformer_optimizer': transformer_optimizer}
# Create the target directory if needed so the save does not fail on a fresh checkout.
os.makedirs('model_save', exist_ok=True)
file_name = os.path.join('model_save', 'checkpoint_5.pth.tar')
torch.save(transformer, file_name)

In [291]:
# with open('New folder/word_map.json', 'r') as file:
#     word_map = json.load(file)

In [23]:
def encode_question(words, word_map):
    """Map each word (lower-cased) to its id, falling back to the '<unk>' id."""
    encoded = []
    for word in words:
        # The '<unk>' entry is looked up per word, so an empty input never touches it.
        encoded.append(word_map.get(word.lower(), word_map['<unk>']))
    return encoded

In [77]:
question_text = "i'm really worried about you , where have you been?"
# Whitespace tokenization on the lower-cased text.
question_tokens = question_text.lower().split()
# Look up each token (falling back to the '<unk>' id when present) and drop
# tokens that resolve to None because neither entry exists.
encoded_question = list(
    filter(
        lambda token_id: token_id is not None,
        (word_map.get(token, word_map.get("<unk>")) for token in question_tokens),
    )
)
# Terminate the prompt with the '</s>' id.
encoded_question += [word_map['</s>']]

In [78]:
# Inverse vocabulary: token id -> token string, used to decode generated ids.
rev_word_map = {index: token for token, index in word_map.items()}

In [79]:
# Wrap the encoded prompt in a batch of one, allocated on the target device.
data = torch.tensor([encoded_question], dtype=torch.long, device=device)
# reply_input_mask, _ = create_masks(data, data)  # Generate mask
# out = transformer(data, reply_input_mask)

In [80]:
max_length = 50  # Maximum number of tokens to generate
# NOTE(review): the prompt is terminated with word_map['</s>'], while the stop
# token is looked up under '</end>'. If that key is absent, end_token is None
# and generation always runs for max_length steps -- confirm the intended key.
end_token = word_map.get('</end>')
hoanthanh = []  # ids of the generated tokens, in order

# Extend a separate name so `data` keeps the original prompt and re-running
# this cell starts from the same input every time.
generated = data

# Inference must run with dropout disabled and without building an autograd
# graph. train() switches the model back to training mode when it is called.
transformer.eval()
with torch.no_grad():
    for _ in range(max_length):
        reply_input_mask, _ = create_masks(generated, generated)  # Mask for the current sequence
        out = transformer(generated, reply_input_mask)  # Log-probabilities per position
        prob = torch.softmax(out[:, -1, :], dim=-1)  # Distribution for the next token
        next_word = torch.argmax(prob, dim=1).item()  # Greedy choice

        # Stop once the end token is predicted
        if next_word == end_token:
            break

        # Append the predicted token to the sequence
        generated = torch.cat([generated, torch.tensor([[next_word]], device=device)], dim=1)
        hoanthanh.append(next_word)

# Decoding the generated sequence
decoded_reply = ' '.join([rev_word_map.get(idx, '<unk>') for idx in hoanthanh])

In [81]:
# Display the generated reply text.
decoded_reply

'You dont believe in anything ?'