# 52100674 TranThiVen


You are given a source code for chatbot problem using Tensorflow.
Convert the code into pytorch.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import random

In [3]:
# Read the CSV file
df = pd.read_csv('chatbot.csv')
questions = list(df['question'])
answers = list(df['answer'])
print(questions[:2])
print(answers[:2])

['have you read the communist', 'what is a government']
['yes, marx had made some interesting observations.', 'ideally it is a representative of the people.']


In [4]:
SOS_token = 0
EOS_token = 1
class Vocab:
    def __init__(self):
        self.word2index = {"": SOS_token, "": EOS_token}
        self.index2word = {SOS_token: "", EOS_token: ""}
        self.words_count = len(self.word2index)

    def add_words(self, sentence):
        for word in sentence.split(" "):
            if word not in self.word2index:
                self.word2index[word] = self.words_count
                self.index2word[self.words_count] = word
                self.words_count += 1

In [5]:
import nltk
import pandas as pd
import string
import torch
import torchtext

In [7]:
stemmer = nltk.stem.snowball.SnowballStemmer('english')
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

def getDict(dataPipe):
    data_dict = {
        'question': [],
        'answer': []
    }
    for _, question, answers, _ in dataPipe:
        data_dict['question'].append(question)
        data_dict['answer'].append(answers[0])
    return data_dict

In [8]:
def loadDF(path):
    # load data
    train_data, val_data = torchtext.datasets.SQuAD1(path)
    # convert dataPipe to dictionary
    train_dict, val_dict = getDict(train_data), getDict(val_data)
    # convert Dictionaries to Pandas DataFrame
    train_df = pd.DataFrame(train_dict)
    validation_df = pd.DataFrame(val_dict)
    return train_df.append(validation_df)

In [9]:
def prepare_text(sentence):
    # clean text and tokenize it
    sentence = ''.join([s.lower() for s in sentence if s not in string.punctuation])
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sentence)
    return tokens

In [10]:
def prepare_text(sentence):
    # clean text and tokenize it
    sentence = ''.join([s.lower() for s in sentence if s not in string.punctuation])
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sentence)
    return tokens

In [11]:
def prepare_text(sentence):
    # clean text and tokenize it
    sentence = ''.join([s.lower() for s in sentence if s not in string.punctuation])
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sentence)
    return tokens

In [12]:
def toTensor(vocab, sentence):
    # convert list of words "sentence" to a torch tensor of indices
    indices = [vocab.word2index[word] for word in sentence.split(' ')]
    indices.append(vocab.word2index[''])
    return torch.Tensor(indices).long().to(device).view(-1, 1)

In [13]:
def getPairs(df):
    # convert df to list of pairs
    temp1 = df['question'].apply(lambda x: " ".join(x) ).to_list()
    temp2 = df['answer'].apply(lambda x: " ".join(x) ).to_list()
    return [list(i) for i in zip(temp1, temp2)]

In [14]:
def getMaxLen(pairs):
    max_src = 0
    max_trg = 0
    for p in pairs:
        max_src = len(p[0].split()) if len(p[0].split()) > max_src else max_src
        max_trg = len(p[1].split()) if len(p[1].split()) > max_trg else max_trg
    return max_src, max_trg

In [16]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(self.input_size, self.hidden_size)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)

    def forward(self, x, hidden, cell_state):
        x = self.embedding(x)
        x = x.view(1, 1, -1)
        x, (hidden, cell_state) = self.lstm(x, (hidden, cell_state))
        return x, hidden, cell_state

In [17]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(output_size, self.hidden_size)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim= 1)

    def forward(self, x, hidden, cell_state):
        x = self.embedding(x)
        x = x.view(1, 1, -1)
        x, (hidden, cell_state) = self.lstm(x, (hidden, cell_state))
        x = self.softmax(self.fc(x[0]))
        return x, hidden, cell_state

In [18]:
class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Seq2Seq, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.encoder = Encoder(self.input_size, self.hidden_size)
        self.decoder = Decoder(self.hidden_size, self.output_size)

    def forward(self, src, trg, src_len, trg_len, teacher_force=1):
        output = {
            'decoder_output':[]
        }
        encoder_hidden = torch.zeros([1, 1, self.hidden_size]).to(device) # 1 = number of LSTM layers
        cell_state = torch.zeros([1, 1, self.hidden_size]).to(device)
        for i in range(src_len):
            encoder_output, encoder_hidden, cell_state = self.encoder(src[i], encoder_hidden, cell_state)

        decoder_input = torch.Tensor([[0]]).long().to(device) # 0 = SOS_token
        decoder_hidden = encoder_hidden
        for i in range(trg_len):
            decoder_output, decoder_hidden, cell_state = self.decoder(decoder_input, decoder_hidden, cell_state)
            output['decoder_output'].append(decoder_output)
            if self.training: # Model not in eval mode
                decoder_input = target_tensor[i] if random.random() > teacher_force else decoder_output.argmax(1) # teacher forcing
            else:
                _, top_index = decoder_output.data.topk(1)
                decoder_input = top_index.squeeze().detach()
        return output

In [19]:
import torch
import torch.nn as nn
from sklearn.model_selection import KFold

In [20]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
def train(source_data, target_data, model, epochs, batch_size, print_every, learning_rate):
    model.to(device)
    total_training_loss = 0
    total_valid_loss = 0
    loss = 0
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    # use cross validation
    kf = KFold(n_splits=epochs, shuffle=True)

    for e, (train_index, test_index) in enumerate(kf.split(source_data), 1):
        model.train()
        for i in range(0, len(train_index)):
            src = source_data[i]
            trg = target_data[i]
            output = model(src, trg, src.size(0), trg.size(0))
            current_loss = 0
            for (s, t) in zip(output["decoder_output"], trg):
                current_loss += criterion(s, t)
            loss += current_loss
            total_training_loss += (current_loss.item() / trg.size(0)) # add the iteration loss

            if i % batch_size == 0 or i == (len(train_index)-1):
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                loss = 0

        # validation set
        model.eval()
        for i in range(0, len(test_index)):
            src = source_data[i]
            trg = target_data[i]
            output = model(src, trg, src.size(0), trg.size(0))
            current_loss = 0
            for (s, t) in zip(output["decoder_output"], trg):
                current_loss += criterion(s, t)
            total_valid_loss += (current_loss.item() / trg.size(0)) # add the iteration loss

        if e % print_every == 0:
            training_loss_average = total_training_loss / (len(train_index)*print_every)
            validation_loss_average = total_valid_loss / (len(test_index)*print_every)
            print("{}/{} Epoch  -  Training Loss = {:.4f}  -  Validation Loss = {:.4f}".format(e, epochs, training_loss_average, validation_loss_average))
            total_training_loss = 0
            total_valid_loss = 0

In [21]:
def evaluate(src, Q_vocab, A_vocab, model, target_max_len):
    try:
        src = toTensor(Q_vocab, " ".join(prepare_text(src)))
    except:
        print("Error: Word Encountered Not In The Vocabulary.")
        return
    answer_words = []
    output = model(src, None, src.size(0), target_max_len)
    for tensor in output['decoder_output']:
        _, top_token = tensor.data.topk(1)
        if top_token.item() == 1:
            break
        else:
            word = A_vocab.index2word[top_token.item()]
            answer_words.append(word)
    print("<", ' '.join(answer_words), "\n")

In [22]:
data_df = pd.read_csv('chatbot.csv')
data_df['question'] = data_df['question'].apply(prepare_text)
data_df['answer'] = data_df['answer'].apply(prepare_text)
print(data_df['question'][1])

['what', 'is', 'a', 'government']


In [23]:
pairs = getPairs(data_df)

In [24]:
max_src, max_trg = getMaxLen(pairs)
max_trg, max_src

(34, 15)

In [25]:
Q_vocab = Vocab()
A_vocab = Vocab()

for pair in pairs:
    Q_vocab.add_words(pair[0])
    A_vocab.add_words(pair[1])

In [29]:
source_data = [toTensor(Q_vocab, pair[0]) for pair in pairs]
target_data = [toTensor(A_vocab, pair[1]) for pair in pairs]

In [30]:
learning_rate = 0.01
hidden_size = 128 # encoder and decoder hidden size
batch_size = 50
epochs = 100

In [31]:
seq2seq = Seq2Seq(Q_vocab.words_count, hidden_size, A_vocab.words_count)

train(source_data = source_data,
      target_data = target_data,
      model = seq2seq,
      print_every = 5,
      epochs = epochs,
      learning_rate = learning_rate,
      batch_size = batch_size)

5/100 Epoch  -  Training Loss = 6.5679  -  Validation Loss = 6.6180
10/100 Epoch  -  Training Loss = 5.5237  -  Validation Loss = 6.1712
15/100 Epoch  -  Training Loss = 5.1105  -  Validation Loss = 5.7772
20/100 Epoch  -  Training Loss = 4.9115  -  Validation Loss = 5.5417
25/100 Epoch  -  Training Loss = 4.7085  -  Validation Loss = 5.4076
30/100 Epoch  -  Training Loss = 4.5295  -  Validation Loss = 5.2074
35/100 Epoch  -  Training Loss = 4.3283  -  Validation Loss = 5.0661
40/100 Epoch  -  Training Loss = 4.1272  -  Validation Loss = 4.8144
45/100 Epoch  -  Training Loss = 3.9183  -  Validation Loss = 4.6252
50/100 Epoch  -  Training Loss = 3.7110  -  Validation Loss = 4.3881
55/100 Epoch  -  Training Loss = 3.4980  -  Validation Loss = 4.1797
60/100 Epoch  -  Training Loss = 3.2978  -  Validation Loss = 3.9765
65/100 Epoch  -  Training Loss = 3.0963  -  Validation Loss = 3.8268
70/100 Epoch  -  Training Loss = 2.9758  -  Validation Loss = 3.5663
75/100 Epoch  -  Training Loss = 2.

In [None]:
import torch
model_path = 'seq2seq.pt'
torch.save(seq2seq, model_path)
seq2seq = torch.load(model_path, map_location=torch.device('mps'))
seq2seq.eval()

In [34]:
print("Type 'exit' to finish the chat.\n", "-"*30, '\n')
while (True):
    src = input("> ")
    if src.strip() == "exit":
        break
    evaluate(src, Q_vocab, A_vocab, seq2seq, max_trg)

Type 'exit' to finish the chat.
 ------------------------------ 

> hello
< greetings 

> his
Error: Word Encountered Not In The Vocabulary.
> name
< i you mean the the 

> exit
