In [18]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras 
from tqdm import tqdm
from keras.layers import Dense
import json 
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import unicodedata
from sklearn.model_selection import train_test_split

In [21]:
question  =[]
answer = []
with open("dialogs.txt",'r') as f :
    for line in f :
        line  =  line.split('\t')
        question.append(line[0])
        answer.append(line[1])
print(len(question) == len(answer))

True


In [22]:
question[:5]

['hi, how are you doing?',
 "i'm fine. how about yourself?",
 "i'm pretty good. thanks for asking.",
 'no problem. so how have you been?',
 "i've been great. what about you?"]

In [23]:
answer[:5]

["i'm fine. how about yourself?\n",
 "i'm pretty good. thanks for asking.\n",
 'no problem. so how have you been?\n',
 "i've been great. what about you?\n",
 "i've been good. i'm in school right now.\n"]

In [24]:
data = pd.DataFrame({"question" : question ,"answer":answer})
data.head()

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?\n
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.\n
2,i'm pretty good. thanks for asking.,no problem. so how have you been?\n
3,no problem. so how have you been?,i've been great. what about you?\n
4,i've been great. what about you?,i've been good. i'm in school right now.\n


In [25]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')

In [26]:
def clean_text(text):
    text = unicode_to_ascii(text.lower().strip())
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\r", "", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    text = re.sub("(\\W)"," ",text) 
    text = re.sub('\S*\d\S*\s*','', text)
    text =  "<sos> " +  text + " <eos>"
    return text

In [27]:
data["question"][0]

'hi, how are you doing?'

In [28]:
data["question"] = data.question.apply(clean_text)

In [29]:
data["question"][0]

'<sos> hi how are you doing <eos>'

In [40]:
data["answer"] = data.answer.apply(clean_text)

In [42]:
data["answer"] = data.answer.apply(clean_text)

In [44]:
question  = data.question.values.tolist()
answer =  data.answer.values.tolist()

In [46]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

    return tensor, lang_tokenizer

In [48]:
input_tensor , inp_lang  =  tokenize(question)

In [50]:
target_tensor , targ_lang  =  tokenize(answer)

In [52]:
#len(inp_question) ==  len(inp_answer)

In [54]:
def remove_tags(sentence):
    return sentence.split("<start>")[-1].split("<end>")[0]

In [56]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [58]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [60]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 22]), TensorShape([64, 24]))

In [62]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x,hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [64]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 22, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [65]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [68]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 22, 1)


In [70]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)
        return x, state, attention_weights

In [72]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 2359)


In [74]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [76]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['<sos>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)
             # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [78]:
questions  =[]
answers = []
with open("dialogs.txt",'r') as f :
    for line in f :
        line  =  line.split('\t')
        questions.append(line[0])
        answers.append(line[1])
print(len(question) == len(answer))

True


In [80]:
EPOCHS = 20

for epoch in tqdm(range(1, EPOCHS + 1), desc='Epochs', unit='epoch'):
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    if epoch % 4 == 0:
        print('Epoch:{:3d} Loss:{:.4f}'.format(epoch, total_loss / steps_per_epoch))

Epochs:  20%|█████████████▊                                                       | 4/20 [21:50<1:25:34, 320.89s/epoch]

Epoch:  4 Loss:1.4320


Epochs:  40%|███████████████████████████▌                                         | 8/20 [44:15<1:06:33, 332.75s/epoch]

Epoch:  8 Loss:1.2040


Epochs:  60%|████████████████████████████████████████▊                           | 12/20 [1:04:04<40:43, 305.40s/epoch]

Epoch: 12 Loss:1.0161


Epochs:  80%|██████████████████████████████████████████████████████▍             | 16/20 [1:25:51<21:09, 317.30s/epoch]

Epoch: 16 Loss:0.8192


Epochs: 100%|████████████████████████████████████████████████████████████████████| 20/20 [1:47:01<00:00, 321.09s/epoch]

Epoch: 20 Loss:0.6094





In [88]:
def evaluate(sentence):
    sentence = clean_text(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<sos>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<eos>':
            return remove_tags(result), remove_tags(sentence)
        dec_input = tf.expand_dims([predicted_id], 0)

    return remove_tags(result), remove_tags(sentence)
            

In [90]:
questions  =[]
answers = []
with open("dialogs.txt",'r') as f :
    for line in f :
        line  =  line.split('\t')
        questions.append(line[0])
        answers.append(line[1])
print(len(question) == len(answer))

True


In [92]:
def ask(sentence):
    result, sentence = evaluate(sentence)

    print('Question: %s' % (sentence))
    print('Predicted answer: {}'.format(result))
ask(questions[100])

Question: <sos> i believe so <eos>
Predicted answer: sos it is the mirror eos <eos> 


In [94]:
ask(questions[20])

Question: <sos> it is not bad there are a lot of people there <eos>
Predicted answer: sos well you are crying or complainers eos <eos> 


In [96]:
print(answers[20])

good luck with that.



In [98]:
ask(questions[10])

Question: <sos> good luck with school <eos>
Predicted answer: sos thank you very much eos <eos> 


In [13]:
print(answers[10])

thank you very much.



In [104]:
ask(questions[1])

Question: <sos> i am fine how about yourself <eos>
Predicted answer: sos i am not like it like it eos <eos> 


In [106]:
print(answers[1])

i'm pretty good. thanks for asking.



In [108]:
ask(questions[11])

Question: <sos> how is it going <eos>
Predicted answer: sos i am not sure eos <eos> 


In [112]:
print(answers[11])

i'm doing well. how about you?



the model is not trained to optimum level of precision that is the reason the pretraining is not preety successful...but with a heavy duty machines we can achive better accuracy if we increase the EPOCHS to 120...

In [119]:
import joblib

# Assuming you have a trained model called 'my_model'
joblib.dump(EPOCHS, 'my_model.pkl')


['my_model.pkl']

In [121]:
loaded_model = joblib.load('my_model.pkl')


In [123]:
%%capture captured_output
# Your code producing some output


In [125]:
with open('output.txt', 'w') as f:
    f.write(captured_output.stdout)


In [127]:
pip install pandoc

Collecting pandocNote: you may need to restart the kernel to use updated packages.

  Downloading pandoc-2.4.tar.gz (34 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting plumbum (from pandoc)
  Downloading plumbum-1.8.3-py3-none-any.whl.metadata (10 kB)
Downloading plumbum-1.8.3-py3-none-any.whl (127 kB)
   ---------------------------------------- 0.0/127.6 kB ? eta -:--:--
   --------- ----------------------------- 30.7/127.6 kB 660.6 kB/s eta 0:00:01
   ------------------------- ------------- 81.9/127.6 kB 762.6 kB/s eta 0:00:01
   ------------------------------------ - 122.9/127.6 kB 798.9 kB/s eta 0:00:01
   -------------------------------------- 127.6/127.6 kB 752.3 kB/s eta 0:00:00
Building wheels for collected packages: pandoc
  Building wheel for pandoc (setup.py): started
  Building wheel for pandoc (setup.py): finished with status 'done'
  Created wheel for pandoc: filename=pandoc-2.4-py3-none-any.whl size=348

In [129]:
ask(questions[1])

Question: <sos> i am fine how about yourself <eos>
Predicted answer: sos i am not like it like it eos <eos> 


In [131]:
print(answer[1])

<sos> sos i am pretty good thanks for asking eos <eos>


In [135]:
ask (questions[3])

Question: <sos> no problem so how have you been <eos>
Predicted answer: sos i have been working too eos <eos> 


In [139]:
print(answer[1])

<sos> sos i am pretty good thanks for asking eos <eos>


In [141]:
print(answer)

['<sos> sos i am fine how about yourself eos <eos>', '<sos> sos i am pretty good thanks for asking eos <eos>', '<sos> sos no problem so how have you been eos <eos>', '<sos> sos i have been great what about you eos <eos>', '<sos> sos i have been good i am in school right now eos <eos>', '<sos> sos what school do you go to eos <eos>', '<sos> sos i go to pcc eos <eos>', '<sos> sos do you like it there eos <eos>', '<sos> sos it is okay it is a really big campus eos <eos>', '<sos> sos good luck with school eos <eos>', '<sos> sos thank you very much eos <eos>', '<sos> sos i am doing well how about you eos <eos>', '<sos> sos never better thanks eos <eos>', '<sos> sos so how have you been lately eos <eos>', '<sos> sos i have actually been pretty good you eos <eos>', '<sos> sos i am actually in school right now eos <eos>', '<sos> sos which school do you attend eos <eos>', '<sos> sos i am attending pcc right now eos <eos>', '<sos> sos are you enjoying it there eos <eos>', '<sos> sos it is not ba

In [143]:
print(questions)

['hi, how are you doing?', "i'm fine. how about yourself?", "i'm pretty good. thanks for asking.", 'no problem. so how have you been?', "i've been great. what about you?", "i've been good. i'm in school right now.", 'what school do you go to?', 'i go to pcc.', 'do you like it there?', "it's okay. it's a really big campus.", 'good luck with school.', "how's it going?", "i'm doing well. how about you?", 'never better, thanks.', 'so how have you been lately?', "i've actually been pretty good. you?", "i'm actually in school right now.", 'which school do you attend?', "i'm attending pcc right now.", 'are you enjoying it there?', "it's not bad. there are a lot of people there.", 'good luck with that.', 'how are you doing today?', "i'm doing great. what about you?", "i'm absolutely lovely, thank you.", "everything's been good with you?", "i haven't been better. how about yourself?", 'i started school recently.', 'where are you going to school?', "i'm going to pcc.", 'how do you like it so far