In [4]:
%load_ext autoreload
%autoreload 2

In [96]:
# Device-independent code
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

In [400]:
# Common Variables
MAX_LENGTH = 20
MIN_LENGTH = 3

# Load Dataset

In [401]:
import pickle
with open('all_responses_simple.pkl', 'rb') as f:
    pairs = pickle.load(f)

# Preprocess

In [402]:
len(pairs[0][0].split())

10

In [403]:
# filter
pairs = [pair for pair in pairs if (len(pair[0].split()) < MAX_LENGTH and len(pair[1].split(' ')) < MAX_LENGTH)]
pairs = [pair for pair in pairs if (len(pair[0].split()) > MIN_LENGTH and len(pair[1].split(' ')) > MIN_LENGTH)]
print(len(pairs))

2651


In [404]:
pairs

[['I should have known that you would be here...Professor McGonagall.',
  'Good evening, Professor Dumbledore. Are the rumours true, Albus?'],
 ['Good evening, Professor Dumbledore. Are the rumours true, Albus?',
  "I'm afraid so, Professor. The good, and the bad."],
 ['Hagrid is bringing him.',
  'Do you think it wise to trust Hagrid with something as important as this?'],
 ['Do you think it wise to trust Hagrid with something as important as this?',
  'Ah, Professor, I would trust Hagrid with my life.'],
 ['Ah, Professor, I would trust Hagrid with my life.',
  'Professor Dumbledore, Sir. Professor McGonagall.'],
 ['Professor Dumbledore, Sir. Professor McGonagall.',
  'No problems, I trust, Hagrid?'],
 ['The only family he has.',
  "This boy will be famous. There won't be a child in our world who doesn't know his name."],
 ["This boy will be famous. There won't be a child in our world who doesn't know his name.",
  "Exactly. He's better off growing up away from all that. Until he is r

## Check for multiple sentences

In [405]:
from nltk.tokenize import sent_tokenize
print(pairs[0][0])
print(sent_tokenize(pairs[0][0]))

I should have known that you would be here...Professor McGonagall.
['I should have known that you would be here...Professor McGonagall.']


In [406]:
import re

def preprocess(text):
    # remove punctuation and numbers with a regular expression
    text = re.sub(r'[.?!,:;\'()\-\n\d]','', text.lower().strip())
    return text

In [363]:
# tag input sequence with START and END for decoder
def tag_sequence(text):
    bos = "<START> "
    eos = " <END>"
    tagged_text = bos + text + eos
    return tagged_text

In [407]:
for index, pair in enumerate(pairs):
    # preprocess
    pair[0] = preprocess(pair[0])
    pair[1] = preprocess(pair[1])
    # tag sequences
    #pair[0] = tag_sequence(pair[0])


In [408]:
pairs[50:55]

[['welcome back mr potter welcome back',
  'doris crockford mr potter i cant believe im meeting you at last'],
 ['doris crockford mr potter i cant believe im meeting you at last',
  'harry ppotter ccant tell you how pleased i am to meet you'],
 ['oh nice to meet you',
  'ffearfully fascinating subject nnot that you need it eeh potter heheh'],
 ['ffearfully fascinating subject nnot that you need it eeh potter heheh',
  'yes well must be going now lots to buy heh'],
 ['see harry youre famous',
  'but why am i famous hagrid all those people back there how is it they know who i am']]

## Create Vocab

- Add each unique word to vocab set

In [409]:
from nltk import word_tokenize

def create_vocab(doc):
    vocab = {}

    for pair in doc:
        sent1 = pair[0]
        sent2 = pair[1]

        for t in sent1.split():
            if t in vocab:
                vocab[t] += 1
            else:
                vocab[t] = 1
        
        for t in sent2.split():
            if t in vocab:
                vocab[t] += 1
            else:
                vocab[t] = 1
        
    return vocab

In [410]:
vocab = create_vocab(pairs)

In [411]:
PAD_TOKEN = "<PAD>"
END_TOKEN = "<EOS>"
START_TOKEN = "<SOS>"

if PAD_TOKEN not in vocab:
    vocab[PAD_TOKEN] = 1
if START_TOKEN not in vocab:
    vocab[START_TOKEN] = 1
if END_TOKEN not in vocab:
    vocab[END_TOKEN] = 1

In [412]:
word2idx = {}
idx2word = {}

for i, word in enumerate(list(vocab.keys())):
    word2idx[word] = i
    idx2word[i] = word

In [413]:
print(pairs[:5])

[['i should have known that you would be hereprofessor mcgonagall', 'good evening professor dumbledore are the rumours true albus'], ['good evening professor dumbledore are the rumours true albus', 'im afraid so professor the good and the bad'], ['hagrid is bringing him', 'do you think it wise to trust hagrid with something as important as this'], ['do you think it wise to trust hagrid with something as important as this', 'ah professor i would trust hagrid with my life'], ['ah professor i would trust hagrid with my life', 'professor dumbledore sir professor mcgonagall']]


In [414]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

# Try Alternate Pairings

In [415]:
pairs1 = [pairs[i] for i in range(len(pairs)) if i % 2 == 0]
pairs2 = [pairs[i] for i in range(len(pairs)) if i % 2 == 1]

In [417]:
pairs1[0:5]

[['i should have known that you would be hereprofessor mcgonagall',
  'good evening professor dumbledore are the rumours true albus'],
 ['hagrid is bringing him',
  'do you think it wise to trust hagrid with something as important as this'],
 ['ah professor i would trust hagrid with my life',
  'professor dumbledore sir professor mcgonagall'],
 ['the only family he has',
  'this boy will be famous there wont be a child in our world who doesnt know his name'],
 ['exactly hes better off growing up away from all that until he is ready',
  'there there hagrid its not really goodbye after all']]

In [418]:
pairs2[0:5]

[['good evening professor dumbledore are the rumours true albus',
  'im afraid so professor the good and the bad'],
 ['do you think it wise to trust hagrid with something as important as this',
  'ah professor i would trust hagrid with my life'],
 ['professor dumbledore sir professor mcgonagall',
  'no problems i trust hagrid'],
 ['this boy will be famous there wont be a child in our world who doesnt know his name',
  'exactly hes better off growing up away from all that until he is ready'],
 ['up get up  now', 'wake up cousin were going to the zoo']]

In [452]:
# Encoding: convert words -> numbers using word2idx
converted_pairs = []

for pair in pairs2:
    sent1 = pair[0]
    sent2 = pair[1]
    sent1_converted = []
    sent2_converted = []

    for word in sent1.split(' '):
        if len(word) > 0:
            sent1_converted.append(word2idx[word])
    for word in sent2.split(' '):
        if len(word) > 0:
            sent2_converted.append(word2idx[word])

    converted_pairs.append([sent1_converted, sent2_converted])

print(converted_pairs[0:5])

[[[10, 11, 12, 13, 14, 15, 16, 17, 18], [19, 20, 21, 12, 15, 10, 22, 15, 23]], [[28, 5, 29, 30, 31, 32, 33, 24, 34, 35, 36, 37, 36, 38], [39, 12, 0, 6, 33, 24, 34, 40, 41]], [[12, 13, 42, 12, 9], [43, 44, 0, 33, 24]], [[38, 49, 50, 7, 51, 52, 53, 7, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63], [64, 65, 66, 67, 68, 69, 70, 71, 72, 4, 73, 47, 25, 74]], [[69, 80, 69, 81], [82, 69, 83, 84, 85, 32, 15, 86]]]


In [453]:
class TranslationDataset(Dataset):
    def __init__(self, pairs, word2idx):
        self.pairs = pairs
        self.word2idx = word2idx

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        eng, ita = self.pairs[idx]
        phrase = torch.tensor([self.word2idx[word] for word in eng.split() if len(word) > 1]
                                  + [self.word2idx[END_TOKEN]], dtype=torch.long)
        response = torch.tensor([self.word2idx[word] for word in ita.split() if len(word) > 1]
                                  + [self.word2idx[END_TOKEN]], dtype=torch.long)
        return phrase, response

# Custom collate function to handle padding
def collate_fn(batch):
    eng_batch, ita_batch = zip(*batch)
    eng_batch_padded = pad_sequence(eng_batch, batch_first=True, padding_value=word2idx[PAD_TOKEN])
    ita_batch_padded = pad_sequence(ita_batch, batch_first=True, padding_value=word2idx[PAD_TOKEN])
    return eng_batch_padded, ita_batch_padded

In [454]:
# Create the dataset and DataLoader
sequence_pair_dataset = TranslationDataset(pairs2, word2idx)
batch_size = 32
sequence_pair_dataloader = DataLoader(sequence_pair_dataset, batch_size=batch_size,
                                    shuffle=True,  drop_last=True, collate_fn=collate_fn)

print("samples: ", len(sequence_pair_dataset))
print("batches: ", len(sequence_pair_dataloader))

samples:  1325
batches:  41


In [455]:
# Example: iterating over the DataLoader
for encoder_input, decoder_output in sequence_pair_dataloader:
    print("Encoder batch:", encoder_input)
    print("Decoder batch:", decoder_output)
    break # remove this to iterate over the whole dataset

Encoder batch: tensor([[2846,  362,  233, 2028, 3716, 3714, 3714, 3714, 3714, 3714, 3714, 3714,
         3714, 3714, 3714, 3714, 3714, 3714, 3714, 3714],
        [  21,   75,   76, 2736,  131,   42, 3716, 3714, 3714, 3714, 3714, 3714,
         3714, 3714, 3714, 3714, 3714, 3714, 3714, 3714],
        [ 234,  366,  328,    5,   56,   88, 3716, 3714, 3714, 3714, 3714, 3714,
         3714, 3714, 3714, 3714, 3714, 3714, 3714, 3714],
        [ 111,    5,  539,    9,   84,   76,  807,   32,  349,   15, 1367, 1167,
           94, 1368, 3716, 3714, 3714, 3714, 3714, 3714],
        [ 118,  193, 1360,   43, 2104,   43, 2105,  227, 2106, 3716, 3714, 3714,
         3714, 3714, 3714, 3714, 3714, 3714, 3714, 3714],
        [  76,  760,  203,  176,   27,   14,    5, 3716, 3714, 3714, 3714, 3714,
         3714, 3714, 3714, 3714, 3714, 3714, 3714, 3714],
        [   5,   14,   76,  257, 3153,  819, 3716, 3714, 3714, 3714, 3714, 3714,
         3714, 3714, 3714, 3714, 3714, 3714, 3714, 3714],
        [ 12

# Seq2Seq Model using LSTM

In [456]:
import torch
import torch.nn as nn

In [457]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers,
                            batch_first=True)

    def forward(self, x):
        # Reversing the sequence of indices
        x = torch.flip(x, [1])
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

In [458]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x)
        out, (hidden, cell) = self.lstm(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

In [459]:
# Hyperparameters
vocab_size = len(vocab)
embed_size = 256
hidden_size = 512
num_layers = 2

# Initialize the models
encoder = Encoder(vocab_size, embed_size, hidden_size, num_layers).to(DEVICE)
decoder = Decoder(vocab_size, embed_size, hidden_size, num_layers).to(DEVICE)

# Train Model

In [460]:
len(sequence_pair_dataloader)

41

In [461]:
import torch.optim as optim
import torch.nn as nn
import random

# Loss Function (exclude padding)
loss_fn = nn.CrossEntropyLoss(ignore_index=word2idx[PAD_TOKEN])

# Optimizers
encoder_optimizer = optim.AdamW(encoder.parameters())
decoder_optimizer = optim.AdamW(decoder.parameters())

# Number of epochs
num_epochs = 50

# Training Loop
encoder.train()
decoder.train()

for epoch in range(num_epochs):
    print(f'epoch: {epoch}')
    for i, (input_tensor, target_tensor) in enumerate(sequence_pair_dataloader):
        input_tensor, target_tensor = input_tensor.to(DEVICE), target_tensor.to(DEVICE)
        # Zero gradients of both optimizers
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        target_length = target_tensor.size(1)

        # Encoder
        _, encoder_hidden, encoder_cell = encoder(input_tensor)

        # Decoder
        decoder_input = torch.full((batch_size, 1), word2idx[START_TOKEN], dtype=torch.long).to(DEVICE)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        # Randomly select a word index from the target sequence
        random_word_index = random.randint(0, target_length - 1)

        loss = 0

        for di in range(target_length):
            logits, decoder_hidden, decoder_cell  = decoder(decoder_input, decoder_hidden, decoder_cell)
            loss += loss_fn(logits, target_tensor[:,di])
            decoder_input = target_tensor[:, di].reshape(batch_size, 1)  # Teacher forcing

        # Backpropagation
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()


epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
epoch: 20
epoch: 21
epoch: 22
epoch: 23
epoch: 24
epoch: 25
epoch: 26
epoch: 27
epoch: 28
epoch: 29
epoch: 30
epoch: 31
epoch: 32
epoch: 33
epoch: 34
epoch: 35
epoch: 36
epoch: 37
epoch: 38
epoch: 39
epoch: 40
epoch: 41
epoch: 42
epoch: 43
epoch: 44
epoch: 45
epoch: 46
epoch: 47
epoch: 48
epoch: 49


In [462]:
# Test model
def translate(encoder, decoder, sentence, word2idx, max_length=15):
    encoder.eval()
    decoder.eval()
    with torch.inference_mode():
        # Tokenize and encode the sentence
        input_tensor = torch.tensor([word2idx[word] for word in sentence.split() if len(word) > 1]
                                    + [word2idx[END_TOKEN]], dtype=torch.long)
        input_tensor = input_tensor.view(1, -1).to(DEVICE)  # batch_first=True

        # Pass the input through the encoder
        _, encoder_hidden, encoder_cell = encoder(input_tensor)

        # Initialize the decoder input with the SOS token
        decoder_input = torch.tensor([[word2idx[START_TOKEN]]], dtype=torch.long)  # SOS
        # Initialize the hidden state of the decoder with the encoder's hidden state
        decoder_hidden, decoder_cell = encoder_hidden, encoder_cell

        # Decoding the sentence
        decoded_words = []
        last_word = torch.tensor([[word2idx[START_TOKEN]]]).to(DEVICE)
        for di in range(max_length):
            logits, decoder_hidden, decoder_cell = decoder(last_word, decoder_hidden, decoder_cell)
            next_token = logits.argmax(dim=1) # greedy
            last_word = torch.tensor([[next_token]]).to(DEVICE)
            if next_token.item() == word2idx[END_TOKEN]:
                break
            else:
                decoded_words.append(idx2word.get(next_token.item()))
        return ' '.join(decoded_words)

In [469]:
sentence = "okay what do you suggest"
translated_sentence = translate(encoder, decoder, sentence, word2idx)
print("Response:", translated_sentence)

Response: no what do they leave you with his cup now


In [464]:
import pickle
# pickle model
full_simple_model_v1 = {'encoder': encoder,
               'decoder': decoder
            }

with open('full_simple_model_v1.pkl', 'wb') as f:
    pickle.dump(full_simple_model_v1, f)

In [470]:
class User:

    def __init__(self, name, age) -> None:
        self.name = name
        self.age = age
        self.likes = []
        self.dislikes = []
    
    def add_like(self, like):
        self.likes.append(like)
    
    def add_dislike(self, dislike):
        self.dislikes.append(dislike)

In [476]:
user_base = {}
with open('user_base.pkl', 'wb') as f:
    pickle.dump(user_base, f)

In [479]:
import secrets
from nltk import word_tokenize
from nltk import pos_tag

bot_name = "muggle-bot"

def get_user():
    global name
    print(f'{bot_name}: Hello! What is your name?')
    name = input()

    old_user_greetings = ['Welcome back ', 'Good to see you again, ', 'Long time no see, ']
    new_user_greetings = ['Hello there, ', 'Hi,', 'Nice to meet you,']

    # Load data (deserialize)
    with open('user_base.pkl', 'rb') as handle:
        user_base = pickle.load(handle)

    if name in user_base:
        user_obj = user_base[name]
        if user_obj.likes:
            print(f'{bot_name}: {secrets.choice(old_user_greetings)} {name}! Hope to talk more about {secrets.choice(user_obj.likes)} today!')
        else:
            print(f'{bot_name}: {secrets.choice(old_user_greetings)}{name}! How can I help you today?')
    else:
        # construct new Person object and store information
        print(f'{bot_name}: {secrets.choice(new_user_greetings)}, {name}! How old are you? (Enter a number)')
        age = input(f'{name}: ')
        user_obj = User(name, age)
        user_base[name] = user_obj
    return (user_base, user_obj)

def chatbot_loop(user_base, user_obj):
    bot_response = ""
    wizard_mode = False
    print(f"Hello {name}. Great meeting you! I can talk to you, but I can also connect you with a wizard.")
    print(f"Enter \'wizard\' to talk to a wizard from the Harry Potter world.")
    print(f"Enter \'quit\' to talk to a wizard from the Harry Potter world.")

    like_phrases = ['i like', 'i love', 'i adore', 'my favorite']
    dislike_phrases = ['i hate', 'i do not like', 'i dislike', 'not particularly']

    # retrieve model
    with open('full_simple_model_v1.pkl', 'rb') as f:
        model = pickle.dump(full_simple_model_v1, f)
    encoder = model['encoder']
    decoder = model['decoder']
    while True:
        # user input
        user_input = input(f'{name}: ')

        # if it contains like or dislike message from user
        if not wizard_mode:
            for l in like_phrases:
                if l in user_input.lower():
                    # add to user likes
                    tokens = word_tokenize(user_input)
                    tags = pos_tag(tokens)
                    for i in reversed(tags):
                        if i[1] == 'NN' or i[1] == 'NNS':
                            user_obj.add_like(i[0])
                            break
                    print(f'{bot_name}: Haha who does\'t')
                    break

            for l in dislike_phrases:
                if l in user_input:
                    # add to user likes
                    tokens = word_tokenize(user_input)
                    tags = pos_tag(tokens)
                    for i in reversed(tags):
                        if i[1] == 'NN' or i[1] == 'NNS':
                            if i[0] in user_obj.dislikes:
                                print("I am aware! We don't have to talk about it further.")
                                break
                            user_obj.add_dislike(i[0])
                            print(f'{bot_name}: I will keep that in mind!')
                            break
                    break

        if user_input == 'quit':
            # save info before quitting
            # pickle updated dictionary
            with open('user_base.pkl', 'wb') as handle:
                pickle.dump(user_base, handle, protocol=pickle.HIGHEST_PROTOCOL)
            break
        if user_input == 'wizard':
            print('You are connected with a wizard...')
            print(f'wizard: Greetings...')
            wizard_mode = True
            continue

        if wizard_mode == True:
            bot_response = translate(encoder, decoder, user_input, word2idx)
            print(f'wizard: {bot_response}')
        else:
            print(f'{bot_name}: {bot_response}')

    print(f'{bot_name}: Have a nice day!')

if __name__ == "__main__":
    user_base, user_obj = get_user()
    chatbot_loop(user_base, user_obj, encoder, decoder)

muggle-bot: Hello! What is your name?
muggle-bot: Hello there, , sanjith! How old are you? (Enter a number)
Hello sanjith. Great meeting you! I can talk to you, but I can also connect you with a wizard.
Enter 'wizard' to talk to a wizard from the Harry Potter world.
Enter 'quit' to talk to a wizard from the Harry Potter world.
wizard: 
muggle-bot: 
wizard: 
muggle-bot: 
wizard: 
muggle-bot: 
