In [1]:
!pip install -U gensim

Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: gensim in /opt/conda/lib/python3.7/site-packages (4.1.2)


In [2]:
import gensim
import nltk
from nltk.corpus import brown
import numpy as np
import pandas as pd
import gzip
from nltk.stem.porter import *
from nltk.stem import *
from nltk.tokenize import RegexpTokenizer
import torch.utils.data
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd

nltk.download('brown')
nltk.download('punkt')
model = gensim.models.Word2Vec(brown.sents())
model.save('brown.embedding')

w2v = gensim.models.Word2Vec.load('brown.embedding')
print("Before adding tokens: ", len(w2v.wv))
w2v.wv.add_vectors(['SOS','EOS'],[np.zeros(100),np.zeros(100)])
print("After adding tokens: ", len(w2v.wv))

embedding_weights = torch.FloatTensor(w2v.wv.vectors)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('qa_Home_and_Kitchen.json.gz')

def prepare_text(sentence):
    
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    
    tokens.append('EOS')
    tokens.insert(0,'SOS')
    for x in range(10-len(tokens)):
        tokens += '0'
        
    
    return tokens

def filter_pairs(src, trg):
    tokenizer = RegexpTokenizer(r'\w+')
    return len(tokenizer.tokenize(src)) < 8 and len(tokenizer.tokenize(trg)) < 8



def train_test_split(SRC, TRG):
    
    tf = []
    for x in zip(SRC, TRG):
        if filter_pairs(x[0],x[1]):
            tf.append(True)
        else:
            tf.append(False)
    
    SRC = SRC[tf]
    TRG = TRG[tf]
            
    
    SRC_clean = [prepare_text(sentence) for sentence in SRC]
    TRG_clean = [prepare_text(sentence) for sentence in TRG]
    
    SRC_train_size = int(0.8 * len(SRC_clean))
    SRC_test_size = len(SRC_clean) - SRC_train_size
    SRC_train_dataset, SRC_test_dataset = torch.utils.data.random_split(SRC_clean, [SRC_train_size, SRC_test_size])
    
    TRG_train_size = int(0.8 * len(TRG_clean))
    TRG_test_size = len(TRG_clean) - TRG_train_size
    TRG_train_dataset, TRG_test_dataset = torch.utils.data.random_split(TRG_clean, [TRG_train_size, TRG_test_size])
    
    return SRC_train_dataset, SRC_test_dataset, TRG_train_dataset, TRG_test_dataset

SRC_train_dataset, SRC_test_dataset, TRG_train_dataset, TRG_test_dataset = train_test_split(df['question'],df['answer'])
   

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Before adding tokens:  15173
After adding tokens:  15175


In [3]:
class Encoder(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        
        self.hidden = torch.zeros(1, 1, hidden_size)
        
        self.embedding = nn.Embedding(self.input_size, 100) # 100 corresponds to the embedding dimension
        self.embedding = self.embedding.from_pretrained(embedding_weights, freeze=False)
        
        self.lstm = nn.LSTM(100, self.hidden_size, 1) 
        
    
    def forward(self, i):
        
        o, (h, c) = self.lstm(self.embedding(i)) # i.view(1,1, self.input_size), self.embedding(i.view(1,1,self.input_size))
        
        return o, h, c
    

class Decoder(nn.Module):
      
    def __init__(self, hidden_size, output_size):
        
        super(Decoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.embedding = nn.Embedding(self.hidden_size, self.hidden_size)
        self.embedding = self.embedding.from_pretrained(embedding_weights, freeze=False)
        
        self.lstm = nn.LSTM(100, self.hidden_size)
        
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, i, h):
        
        output = self.embedding(i).view(1, 1, -1)
        output = F.relu(output)
        output, h = self.lstm(output, h)
        output = self.out(output.squeeze(0))
        output = self.softmax(output)
        
        return output, h
        
        

class Seq2Seq(nn.Module):
    
    def __init__(self, encoder_input_size, encoder_hidden_size, decoder_hidden_size, decoder_output_size):
        
        super(Seq2Seq, self).__init__()
        
        self.hidden1 = (torch.zeros(1, 1, encoder_hidden_size), torch.zeros(1, 1, encoder_hidden_size)) #(torch.zeros(1, 1, 20),torch.zeros(1, 1, 20))
        
        self.encoder = Encoder(encoder_input_size, encoder_hidden_size)
        
        self.hidden2 = (torch.zeros(1, 1, decoder_hidden_size),
                        torch.zeros(1, 1, decoder_hidden_size))
        
        self.decoder = Decoder(decoder_hidden_size,decoder_output_size)
    
    
    def forward(self, src, trg, train=False):
        
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_size
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size)

        encoder_outputs, hidden, cell = self.encoder(src)
        
        i = trg[0,:]
        for t in range(1, trg_len):
            
            output, (hidden, cell) = self.decoder(i, (hidden, cell))
            
            top1 = output.argmax(1)
            
            outputs[t] = output
            
            
            if train:
                i = trg[t]
            else:
                i = top1
        
        return outputs


def get_vector_index(word):
    
    try:
        val = w2v.wv.key_to_index[word]
    except:
        val = 0
    return val

def training_sentence_tensors(SRC, TRG):
    
    src_sentence_index = [get_vector_index(word) for word in SRC]
    trg_sentence_index = [get_vector_index(word) for word in TRG]
    src_tensor = torch.tensor(src_sentence_index, dtype=torch.long).view(-1, 1)
    trg_tensor = torch.tensor(trg_sentence_index, dtype=torch.long).view(-1, 1)
    return (src_tensor, trg_tensor)
    


In [4]:
import torch.optim as optim

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        


model = Seq2Seq(10, 20, 20, len(w2v.wv))
model.apply(init_weights)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.5)


for epoch in range(5):
    h = model.hidden1
    loss_avg = 0
    print("Epoch: ", epoch)
    for i in range(len(SRC_train_dataset)):
        
        model.zero_grad()

        src, trg = training_sentence_tensors(SRC_train_dataset[i], TRG_train_dataset[i])
        
        o = model(src, trg, train=True)
        
        o_dim = o.shape[-1]
        
        o = o[1:].view(-1, o_dim)
        trg = trg[1:].view(-1)
                
        loss = loss_function(o, trg)
        loss_avg += loss
        loss.backward()
        optimizer.step()

    print("Epoch Loss: ", loss_avg/len(SRC_train_dataset))
    loss_avg = 0

Epoch:  0
Epoch Loss:  tensor(2.2102, grad_fn=<DivBackward0>)
Epoch:  1
Epoch Loss:  tensor(1.9300, grad_fn=<DivBackward0>)
Epoch:  2
Epoch Loss:  tensor(1.8259, grad_fn=<DivBackward0>)
Epoch:  3
Epoch Loss:  tensor(1.7644, grad_fn=<DivBackward0>)
Epoch:  4
Epoch Loss:  tensor(1.7217, grad_fn=<DivBackward0>)


In [5]:
def generate_response(sentence):
    
    sentence = prepare_text(sentence)
    sentence_index = [get_vector_index(word) for word in sentence]
    tensor = torch.tensor(sentence_index, dtype=torch.long).view(-1, 1)
    response = model.forward(tensor, tensor)
    
    response = [int(o.argmax(1)[0]) for o in model(tensor, tensor)]
    response = [w2v.wv.index_to_key[x] for x in response]
    
    return response


def chat():
    
    while(True):
       # try:
            
            sentence = input("You: ")
            
            if sentence == 'exit':
                break
            else:
                response = generate_response(sentence)
                print("HomeHelper: ", ' '.join(response) )
                
        # except Exception as e:
          #  print(e)
          #  print("Error: Unknown vocabulary word.")
chat()

You: Hi
HomeHelper:  the Yes EOS 0 0 0 0 0 0 0
You: What can you do?
HomeHelper:  the Yes EOS 0 0 0 0 0 0 0
You: Help m
HomeHelper:  the Yes EOS 0 0 0 0 0 0 0
You: Bye
HomeHelper:  the Yes EOS 0 0 0 0 0 0 0


KeyboardInterrupt: 