**Data preprocessing**

In [None]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/topical_chat.csv')

In [None]:
data

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpfu...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper
...,...,...,...
188373,8628,"Wow, it does not seem like that long. Since I...",Surprised
188374,8628,"I havent seen that episode, I might google it...",Curious to dive deeper
188375,8628,I don't think I have either. That's an insane...,Curious to dive deeper
188376,8628,"I did, my little brother used to love Thomas ...",Happy


In [None]:
# Group messages by conversation ID
grouped_data = data.groupby('conversation_id')['message'].apply(list)

# Prepare pairs of questions and answers for each conversation
conversations = []
for _, messages in grouped_data.items():
    questions = messages[::2]  # Assuming questions are at even indices
    answers = messages[1::2]   # Assuming answers are at odd indices

    for question, answer in zip(questions, answers):
        conversations.append({'question': question, 'answer': answer})

# Print the first few conversations for verification
for i, conv in enumerate(conversations):
    print(f"Conversation {i + 1}:")
    print(f"Question: {conv['question']}")
    print(f"Answer: {conv['answer']}\n")
    if i >= 50:
        break

Conversation 1:
Question:  Are you a fan of Google or Microsoft?
Answer:  Both are excellent technology they are helpful in many ways. For the security purpose both are super.

Conversation 2:
Question:  I'm not  a huge fan of Google, but I use it a lot because I have to. I think they are a monopoly in some sense. 
Answer:  Google provides online related services and products, which includes online ads, search engine and cloud computing.

Conversation 3:
Question:  Yeah, their services are good. I'm just not a fan of intrusive they can be on our personal lives. 
Answer:  Google is leading the alphabet subsidiary and will continue to be the Umbrella company for Alphabet internet interest.

Conversation 4:
Question:  Did you know Google had hundreds of live goats to cut the grass in the past? 
Answer:  It is very interesting. Google provide "Chrome OS" which is a light weight OS. Google provided a lot of hardware mainly in 2010 to 2015. 

Conversation 5:
Question:  I like Google Chrome. 

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download necessary resources if not already downloaded
nltk.download('punkt')

# Define a function to clean and preprocess the text
def clean_and_preprocess(text):
    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Join tokens back into a cleaned sentence
    cleaned_text = ' '.join(tokens)

    return cleaned_text

# Clean and preprocess the questions and answers in conversations
cleaned_conversations = []
for conv in conversations:
    cleaned_question = clean_and_preprocess(conv['question'])
    cleaned_answer = clean_and_preprocess(conv['answer'])
    cleaned_conversations.append({'question': cleaned_question, 'answer': cleaned_answer})

# Print the first few cleaned conversations for verification
for i, conv in enumerate(cleaned_conversations):
    print(f"Conversation {i + 1}:")
    print(f"Cleaned Question: {conv['question']}")
    print(f"Cleaned Answer: {conv['answer']}\n")
    if i >= 4:  # Print the first 5 cleaned conversations
     break

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Conversation 1:
Cleaned Question: are you a fan of google or microsoft ?
Cleaned Answer: both are excellent technology they are helpful in many ways . for the security purpose both are super .

Conversation 2:
Cleaned Question: i 'm not a huge fan of google , but i use it a lot because i have to . i think they are a monopoly in some sense .
Cleaned Answer: google provides online related services and products , which includes online ads , search engine and cloud computing .

Conversation 3:
Cleaned Question: yeah , their services are good . i 'm just not a fan of intrusive they can be on our personal lives .
Cleaned Answer: google is leading the alphabet subsidiary and will continue to be the umbrella company for alphabet internet interest .

Conversation 4:
Cleaned Question: did you know google had hundreds of live goats to cut the grass in the past ?
Cleaned Answer: it is very interesting . google provide `` chrome os '' which is a light weight os . google provided a lot of hardware m

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle

# Combine all cleaned questions and answers into separate lists
questions = [conv['question'] for conv in cleaned_conversations]
answers = [conv['answer'] for conv in cleaned_conversations]

# Initialize tokenizers for questions and answers
tokenizer_ques = Tokenizer()  # Use <OOV> for out-of-vocabulary words
tokenizer_ans = Tokenizer()  # Use <OOV> for out-of-vocabulary words

# Fit the tokenizers on the text for questions and answers
tokenizer_ques.fit_on_texts(questions)
tokenizer_ans.fit_on_texts(answers)

# Add '<start>' and '<end>' tokens to the tokenizers' word_index for answers
tokenizer_ans.word_index['<start>'] = len(tokenizer_ans.word_index) + 1
tokenizer_ans.word_index['<end>'] = len(tokenizer_ans.word_index) + 2

# Save the tokenizers to files
with open('tokenizer_ques.pkl', 'wb') as tokenizer_ques_file:
    pickle.dump(tokenizer_ques, tokenizer_ques_file)
with open('tokenizer_ans.pkl', 'wb') as tokenizer_ans_file:
    pickle.dump(tokenizer_ans, tokenizer_ans_file)

# Convert text to sequences of word indices for questions and answers
sequences_ques = tokenizer_ques.texts_to_sequences(questions)
sequences_ans = tokenizer_ans.texts_to_sequences(answers)

# Find the maximum sequence length for questions and answers separately
max_seq_length_ques = max(len(seq) for seq in sequences_ques)
max_seq_length_ans = max(len(seq) for seq in sequences_ans)

# Pad sequences to make them of the same length for questions and answers
padded_sequences_ques = pad_sequences(sequences_ques, maxlen=max_seq_length_ques, padding='post', truncating='post')
padded_sequences_ans = pad_sequences(sequences_ans, maxlen=max_seq_length_ans, padding='post', truncating='post')

# Create input-output pairs for the encoder-decoder model for questions and answers
input_data_ques = padded_sequences_ques[:, :-1]  # Input is the question (remove the last token)
output_data_ans = padded_sequences_ans[:, 1:]   # Output is the answer (remove the first token)

# Convert input and output sequences to numpy arrays for questions and answers
input_data_ques = np.array(input_data_ques)
output_data_ans = np.array(output_data_ans)

# Print the vocabulary size for questions and answers separately
vocab_size_ques = len(tokenizer_ques.word_index)
vocab_size_ans = len(tokenizer_ans.word_index)
print(f"Vocabulary size for questions: {vocab_size_ques}")
print(f"Vocabulary size for answers: {vocab_size_ans}")

Vocabulary size for questions: 27213
Vocabulary size for answers: 28171


In [None]:
max_seq_length_ques

130

In [None]:
max_seq_length_ans

132

In [None]:
import pickle

# Create a dictionary to store the preprocessed data
preprocessed_data = {
    'input_data': input_data_ques,
    'output_data': output_data_ans,
    'tokenizer_ques': tokenizer_ques,  # Use tokenizer_ques
    'tokenizer_ans': tokenizer_ans,    # Use tokenizer_ans
    'max_seq_length_ques': max_seq_length_ques,  # Update variable names
    'max_seq_length_ans': max_seq_length_ans,    # Update variable names
    'vocab_size_ques': vocab_size_ques,  # Update variable names
    'vocab_size_ans': vocab_size_ans     # Update variable names
}

# Save the preprocessed data to a file using Pickle
with open('preprocessed_data.pkl', 'wb') as file:
    pickle.dump(preprocessed_data, file)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
encoder_input_train, encoder_input_val, decoder_input_train, decoder_input_val, decoder_output_train, decoder_output_val = train_test_split(
    input_data_ques, output_data_ans, output_data_ans, test_size=0.2, random_state=42
)

# Print the shapes of the training and validation sets
print("Shapes of training data:")
print(f"Encoder input: {encoder_input_train.shape}")
print(f"Decoder input: {decoder_input_train.shape}")
print(f"Decoder output: {decoder_output_train.shape}")

print("\nShapes of validation data:")
print(f"Encoder input: {encoder_input_val.shape}")
print(f"Decoder input: {decoder_input_val.shape}")
print(f"Decoder output: {decoder_output_val.shape}")


Shapes of training data:
Encoder input: (72939, 129)
Decoder input: (72939, 131)
Decoder output: (72939, 131)

Shapes of validation data:
Encoder input: (18235, 129)
Decoder input: (18235, 131)
Decoder output: (18235, 131)


In [None]:
import numpy as np
import tensorflow as tf

# Define a function for batch one-hot encoding
def batch_one_hot_encode(sequences, vocab_size):
    batch_size = 32  # Adjust this batch size as needed
    num_batches = len(sequences) // batch_size
    encoded_sequences = []

    for i in range(num_batches):
        batch = sequences[i * batch_size : (i + 1) * batch_size]
        encoded_batch = tf.one_hot(batch, depth=vocab_size)
        encoded_sequences.append(encoded_batch)

    # Handle the remaining sequences (if any)
    remaining = len(sequences) % batch_size
    if remaining > 0:
        batch = sequences[-remaining:]
        encoded_batch = tf.one_hot(batch, depth=vocab_size)
        encoded_sequences.append(encoded_batch)

    return tf.concat(encoded_sequences, axis=0)

# Example usage
decoder_output_train_encoded = batch_one_hot_encode(output_data_ans, vocab_size_ans)

In [None]:
import pickle
import numpy as np

# Load the tokenizers
with open('tokenizer_ques.pkl', 'rb') as tokenizer_ques_file:
    tokenizer_ques = pickle.load(tokenizer_ques_file)

with open('tokenizer_ans.pkl', 'rb') as tokenizer_ans_file:
    tokenizer_ans = pickle.load(tokenizer_ans_file)

# Load your sequences or data as needed
# For example:
# sequences_ques = load_sequences('encoder_sequences.pkl')
# sequences_ans = load_sequences('decoder_sequences.pkl')

# Verify the shapes of your data
print("Shapes of your data:")
print("Encoder input:", np.array(sequences_ques).shape)
print("Decoder input:", np.array(sequences_ans).shape)

# Sample inspection
for i in range(5):  # Print the first 5 samples
    print("\nSample", i + 1)
    print("Encoder Input:", tokenizer_ques.sequences_to_texts([sequences_ques[i]]))
    print("Decoder Input:", tokenizer_ans.sequences_to_texts([sequences_ans[i]]))


Shapes of your data:
Encoder input: (91174,)
Decoder input: (91174,)

Sample 1
Encoder Input: ['are you a fan of google or microsoft']
Decoder Input: ['both are excellent technology they are helpful in many ways for the security purpose both are super']

Sample 2
Encoder Input: ["i 'm not a huge fan of google but i use it a lot because i have to i think they are a monopoly in some sense"]
Decoder Input: ['google provides online related services and products which includes online ads search engine and cloud computing']

Sample 3
Encoder Input: ["yeah their services are good i 'm just not a fan of intrusive they can be on our personal lives"]
Decoder Input: ['google is leading the alphabet subsidiary and will continue to be the umbrella company for alphabet internet interest']

Sample 4
Encoder Input: ['did you know google had hundreds of live goats to cut the grass in the past']
Decoder Input: ["it is very interesting google provide chrome os '' which is a light weight os google provide

  print("Encoder input:", np.array(sequences_ques).shape)
  print("Decoder input:", np.array(sequences_ans).shape)


In [None]:
class MyDataset(tf.keras.utils.Sequence):
    def __init__(self, encoder_input, decoder_input, decoder_output, tknizer_ques, tknizer_ans, max_len):
        self.encoder_input = encoder_input.tolist()
        self.decoder_input = decoder_input.tolist()
        self.decoder_output = decoder_output.tolist()
        self.tknizer_ques = tknizer_ques
        self.tknizer_ans = tknizer_ans
        self.max_len = max_len

    def __len__(self):
        return len(self.encoder_input)

    def __getitem__(self, i):
        encoder_input = self.encoder_input[i]
        decoder_input = self.decoder_input[i]
        decoder_output = self.decoder_output[i]

        # Ensure the inputs are strings
        encoder_input = str(encoder_input)
        decoder_input = str(decoder_input)
        decoder_output = str(decoder_output)

        encoder_seq = self.tknizer_ques.texts_to_sequences([encoder_input])[0]
        decoder_inp_seq = self.tknizer_ans.texts_to_sequences([decoder_input])[0]
        decoder_out_seq = self.tknizer_ans.texts_to_sequences([decoder_output])[0]

        encoder_seq = pad_sequences([encoder_seq], maxlen=self.max_len, padding='post')[0]
        decoder_inp_seq = pad_sequences([decoder_inp_seq], maxlen=self.max_len, padding='post')[0]
        decoder_out_seq = pad_sequences([decoder_out_seq], maxlen=self.max_len, padding='post')[0]

        return [encoder_seq, decoder_inp_seq], decoder_out_seq

    def on_epoch_end(self):
        indices = np.arange(len(self.encoder_input))
        np.random.shuffle(indices)
        self.encoder_input = [self.encoder_input[i] for i in indices]
        self.decoder_input = [self.decoder_input[i] for i in indices]
        self.decoder_output = [self.decoder_output[i] for i in indices]

class Dataloder(tf.keras.utils.Sequence):
    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset))

    def __getitem__(self, i):
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        data = []
        for j in range(start, stop):
            data.append(self.dataset[j])

        encoder_seqs = np.stack([item[0][0] for item in data], axis=0)
        decoder_inp_seqs = np.stack([item[0][1] for item in data], axis=0)
        decoder_out_seqs = np.stack([item[1] for item in data], axis=0)

        return [encoder_seqs, decoder_inp_seqs], decoder_out_seqs

    def __len__(self):
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        self.indexes = np.random.permutation(self.indexes)

In [None]:
class Encoder(tf.keras.Model):
    '''
    Encoder model -- That takes a input sequence and returns encoder-outputs,encoder_final_state_h,encoder_final_state_c
    '''

    def __init__(self,inp_vocab_size,embedding_size,lstm_size,input_length):
        super().__init__()
        self.vocab_size = inp_vocab_size
        self.embedding_dim = embedding_size
        self.input_length = input_length
        self.lstm_size = lstm_size
        self.lstm_output = 0
        self.lstm_state_h=0
        self.lstm_state_c=0

    def build(self, input_shape):
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim,
                                   input_length=self.input_length,mask_zero=True, name="embedding_layer_encoder",     )
        self.lstm = LSTM(self.lstm_size, return_state=True, return_sequences=True, name="Encoder_LSTM")


    def call(self,input_sequence, training=True):
        '''
          This function takes a sequence input and the initial states of the encoder.
          Pass the input_sequence input to the Embedding layer, Pass the embedding layer ouput to encoder_lstm
          returns -- encoder_output, last time step's hidden and cell state
        '''
        input_embedd = self.embedding(input_sequence)
        #states=self.lstm_state_h,self.lstm_state_c
        self.lstm_output, self.lstm_state_h,self.lstm_state_c = self.lstm(input_embedd)
        return self.lstm_output, self.lstm_state_h,self.lstm_state_c


    def initialize_states(self,batch_size):
   #   '''
   #   Given a batch size it will return intial hidden state and intial cell state.
   #   If batch size is 32- Hidden state is zeros of size [32,lstm_units], cell state zeros is of size [32,lstm_units]
   #   '''
        #self.batch_size=batch_size
        hidden_state=np.zeros((batch_size,self.lstm_size), dtype=float, order='C')
        cell_state=np.zeros((batch_size, self.lstm_size), dtype=float, order='C')

        return hidden_state, cell_state

In [None]:
class Decoder(tf.keras.Model):
    '''
    Encoder model -- That takes a input sequence and returns output sequence
    '''

    def __init__(self,out_vocab_size,embedding_size,lstm_size,input_length):
        super().__init__()
        self.vocab_size = out_vocab_size
        self.embedding_size = embedding_size
        self.lstm_size = lstm_size
        self.input_length = input_length

        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_size,
                                   input_length=self.input_length,mask_zero=True,
                                   name="embedding_layer_decoder", trainable=False)
        self.lstm = LSTM(self.lstm_size, return_sequences=True, return_state=True, name="Encoder_LSTM")


    def call(self,input_sequence,states):
        '''
          This function takes a sequence input and the initial states of the encoder.
          Pass the input_sequence input to the Embedding layer, Pass the embedding layer ouput to decoder_lstm

          returns -- decoder_output,decoder_final_state_h,decoder_final_state_c
        '''
        state_h = states[0]
        state_c = states[1]
        embedd = self.embedding(input_sequence)
        lstm_output,final_state_h,final_state_c = self.lstm(embedd, initial_state=[state_h, state_c])

        return lstm_output, final_state_h, final_state_c

In [None]:
class Encoder_decoder(tf.keras.Model):

    def __init__(self,encoder_inputs_length,decoder_inputs_length, output_vocab_size):
        super().__init__()
        #Create encoder object
        self.encoder = Encoder(inp_vocab_size=vocab_size_ques+1,embedding_size =100,
                               lstm_size =512,input_length=encoder_inputs_length)
        #Create decoder object
        self.decoder = Decoder(vocab_size_ans+1,embedding_size=100,
                               lstm_size =512,input_length = decoder_inputs_length)
        #Intialize Dense layer(out_vocab_size) with activation='softmax'
        self.Dense = Dense(output_vocab_size, activation='softmax')

    def call(self, data):

        input, output = data[0], data[1]
        encoder_output, encoder_h, encoder_c = self.encoder(input)
        decode,_,_ = self.decoder(output, [encoder_h, encoder_c])
        decoder_ouputs = self.Dense(decode)

        return decoder_ouputs

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense
# Compile the model
model = Encoder_decoder(encoder_inputs_length=max_seq_length_ques-1, decoder_inputs_length=max_seq_length_ans-1, output_vocab_size=vocab_size_ans+1)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Create training and validation datasets using the custom data generators
train_dataset = MyDataset(encoder_input_train, decoder_input_train, decoder_output_train, tokenizer_ques, tokenizer_ans, max_seq_length_ques-1)
val_dataset = MyDataset(encoder_input_val, decoder_input_val, decoder_output_val, tokenizer_ques, tokenizer_ans, max_seq_length_ques-1)

train_dataloader = Dataloder(train_dataset, batch_size=32)
val_dataloader = Dataloder(val_dataset, batch_size=32)



In [None]:
batch_size = 32
# Create a dummy input tensor with the shape that your model expects
dummy_input = [np.zeros((batch_size, max_seq_length_ques-1)), np.zeros((batch_size, max_seq_length_ans-1))]
dummy_output = model(dummy_input)

# Print the shape of the model's output
print("Shape of model output:", dummy_output.shape)

Shape of model output: (32, 131, 28172)


In [None]:
print("Shapes of training data:")
print(f"Encoder input: {encoder_input_train.shape}")
print(f"Decoder input: {decoder_input_train.shape}")
print(f"Decoder output: {decoder_output_train.shape}")

Shapes of training data:
Encoder input: (72939, 129)
Decoder input: (72939, 131)
Decoder output: (72939, 131)


In [None]:
# Loop through the first 5 batches of training data
for batch_index in range(5):
    encoder_input_batch, decoder_output_batch = train_dataloader[batch_index]

    # Decode the tokenized sequences back to text for display
    decoded_encoder_input = tokenizer_ques.sequences_to_texts(encoder_input_batch[0])
    decoded_decoder_output = tokenizer_ans.sequences_to_texts(decoder_output_batch)

    # Print the decoded samples
    print(f"Batch {batch_index + 1} samples:")
    for i in range(min(5, len(decoded_encoder_input))):
        print("Sample", i + 1)
        print("Encoder Input:", decoded_encoder_input[i])
        print("Decoder Output:", decoded_decoder_output[i])
        print()

Batch 1 samples:
Sample 1
Encoder Input: 7 16 47 1 32 6 1 86 24 390 30 1 86 68 35 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Decoder Output: 6 16 120 38 14 23 93 5 304 1908 256 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Sample 2
Encoder Input: 7 105 23 1 51 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Decoder Output: 7 2 15 11 21 11 1876 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
from tensorflow.keras.layers import Dense

# Check the number of units in the output layer of your model
output_units = model.layers[-1].units  # Get the number of units in the last layer

# Ensure it matches the vocabulary size
if output_units != vocab_size_ans + 1:
    # If not, create a new output layer with the correct number of units
    new_output_layer = Dense(vocab_size_ans + 1, activation='softmax', name='output_layer')

    # Replace the old output layer with the new one
    model.layers[-1] = new_output_layer
    model.outputs = [new_output_layer.output]

# Recompile the model with the correct output layer
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
print("Shapes of training data:")
print(f"Encoder input: {encoder_input_train.shape}")
print(f"Decoder input: {decoder_input_train.shape}")
print(f"Decoder output: {decoder_output_train.shape}")

Shapes of training data:
Encoder input: (72939, 129)
Decoder input: (72939, 131)
Decoder output: (72939, 131)


In [None]:
# Train the model
history = model.fit(train_dataloader, epochs=10, validation_data=val_dataloader)

Epoch 1/10


ValueError: ignored