# NLP

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import time
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, GRU, Dropout
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Set the mixed precision policy
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4090, compute capability 8.9


### Load Data

In [2]:
# Load the data
with open('../datasets/cornell/movie_lines.tsv', encoding='utf-8', errors='ignore') as f:
    lines = [line.strip().split('\t') for line in f]
with open('../datasets/cornell/movie_conversations.tsv', encoding='utf-8', errors='ignore') as f:
    conv_lines = [line.strip().split('\t') for line in f]

# Convert the fourth field into a list
for line in conv_lines:
    line[3] = line[3][1:-1].replace("'", "").split()

In [3]:
# The sentences that we will be using to train our model.
lines[:5]

[['L1045', 'u0', 'm0', 'BIANCA', 'They do not!'],
 ['L1044', 'u2', 'm0', 'CAMERON', 'They do to!'],
 ['L985', 'u0', 'm0', 'BIANCA', 'I hope so.'],
 ['L984', 'u2', 'm0', 'CAMERON', 'She okay?'],
 ['L925', 'u0', 'm0', 'BIANCA', "Let's go."]]

In [4]:
# The sentences' ids, which will be processed to become our input and target data.
conv_lines[:5]

[['u0', 'u2', 'm0', ['L194', 'L195', 'L196', 'L197']],
 ['u0', 'u2', 'm0', ['L198', 'L199']],
 ['u0', 'u2', 'm0', ['L200', 'L201', 'L202', 'L203']],
 ['u0', 'u2', 'm0', ['L204', 'L205', 'L206']],
 ['u0', 'u2', 'm0', ['L207', 'L208']]]

In [5]:
# Create a dictionary to map each line's id with its text
id2line = {}
for line in lines:
    if len(line) == 5:
        id2line[line[0]] = line[4]

convs = [line[3] for line in conv_lines]

convs[:10]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366']]

In [6]:
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []

for conv in convs:
    for i in range(len(conv)-1):
        if conv[i] in id2line and conv[i+1] in id2line:
            questions.append(id2line[conv[i]])
            answers.append(id2line[conv[i+1]])


# Check if we have loaded the data correctly
limit = 0
for i in range(limit, limit+5):
    print(questions[i])
    print(answers[i])
    print()

# Compare lengths of questions and answers
print("Questions Length:", len(questions), "| Answers Length:", len(answers))

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well I thought we'd start with pronunciation if that's okay with you.

Well I thought we'd start with pronunciation if that's okay with you.
Not the hacking and gagging and spitting part.  Please.

Not the hacking and gagging and spitting part.  Please.
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

You're asking me out.  That's so cute. What's your name again?
Forget it.

No no it's my fault -- we didn't have a proper introduction ---
Cameron.

Questions Length: 215238 | Answers Length: 215238


In [7]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [8]:
# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
    
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))


# Take a look at some of the data to ensure that it has been cleaned well.
limit = 0
for i in range(limit, limit+5):
    print(clean_questions[i])
    print(clean_answers[i])
    print()

can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
well i thought we would start with pronunciation if that is okay with you

well i thought we would start with pronunciation if that is okay with you
not the hacking and gagging and spitting part  please

not the hacking and gagging and spitting part  please
okay then how about we try out some french cuisine  saturday  night

you are asking me out  that is so cute that is your name again
forget it

no no it is my fault  we did not have a proper introduction 
cameron



## Tokenizing

In [9]:
# Add a Start Token to each answer
start_token = '<start>'
clean_answers_with_start = [start_token + ' ' + answer for answer in clean_answers]

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_questions + clean_answers_with_start)

# Convert the text to sequences of integers
questions_seq = tokenizer.texts_to_sequences(clean_questions)
answers_seq_with_start = tokenizer.texts_to_sequences(clean_answers_with_start)

# Define a maximum sequence length
max_length = 50  # Adjust based on your dataset's characteristics

# Pad the sequences so they're all the same length, and limit their length
questions_seq = pad_sequences(questions_seq, maxlen=max_length, truncating='post', padding='post')
answers_seq_with_start = pad_sequences(answers_seq_with_start, maxlen=max_length, truncating='post', padding='post')

# Model

### Hyper-parameters

In [14]:
# Define the parameters of your model
batch_size = 64
epochs = 200
latent_dim = 512
learning_rate = 0.005
num_samples = 10000
dropout_rate = 0.2
l2_lambda = 0.0001
num_tokens = len(tokenizer.word_index) + 1  # number of unique tokens in your text data

### Model

In [15]:
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_tokens, latent_dim)(encoder_inputs)
encoder_embedding_dropout = Dropout(dropout_rate)(encoder_embedding)  # Dropout after embedding
encoder_lstm = LSTM(latent_dim, return_state=True, kernel_regularizer=l2(l2_lambda), recurrent_regularizer=l2(l2_lambda))
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding_dropout)  # Use dropout output for LSTM
encoder_states = [state_h, state_c]

# Define the decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_tokens, latent_dim)(decoder_inputs)
decoder_embedding_dropout = Dropout(dropout_rate)(decoder_embedding)  # Dropout after embedding
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, kernel_regularizer=l2(l2_lambda), recurrent_regularizer=l2(l2_lambda))
decoder_outputs, _, _ = decoder_lstm(decoder_embedding_dropout, initial_state=encoder_states)  # Use dropout output for LSTM
decoder_dense = Dense(num_tokens, activation='softmax', kernel_regularizer=l2(l2_lambda))
decoder_outputs = decoder_dense(decoder_outputs)


# Define the model that turns `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

decoder_target_data = np.zeros_like(answers_seq_with_start)
decoder_target_data[:, :-1] = answers_seq_with_start[:, 1:]

# Add an extra dimension to match the expected shape for sparse_categorical_crossentropy
decoder_target_data = np.expand_dims(decoder_target_data, -1)

# Define the optimizer
optimizer = Adam(learning_rate=learning_rate)

# Compile & train the model
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 512)    33793024    ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 512)    33793024    ['input_4[0][0]']                
                                                                                            

### Callbacks

In [16]:
batches_per_epoch = len(questions_seq) // batch_size
save_freq = 4 * batches_per_epoch  # Set to save the model every 10 epochs

model_checkpoint = ModelCheckpoint(
    filepath='../model/nlp/model_{epoch:02d}.h5',  # Saves the model with the epoch number in the filename
    save_freq=save_freq,              # Sets frequency based on the calculated number of batches
    save_weights_only=False,          # Saves the whole model
    verbose=0                         # Prints out messages when saving the model
)

early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=0, restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001, verbose=0)


callbacks_list = [model_checkpoint, early_stopping, reduce_lr]  # Ensure this is a simple list of callbacks

### Train

In [17]:
model.fit([questions_seq, answers_seq_with_start], decoder_target_data, epochs=epochs, batch_size=batch_size, validation_split=0.2, callbacks=[callbacks_list])

Epoch 1/200
Epoch 2/200
Epoch 3/200

KeyboardInterrupt: 

### Decoder Model

In [15]:
# Define the inference encoder model
enc_model = Model(encoder_inputs, encoder_states)

# Define the inference decoder model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_inference = Embedding(num_tokens, latent_dim)(decoder_inputs)
decoder_lstm_inference = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs_inference, state_h_dec, state_c_dec = decoder_lstm_inference(
    decoder_embedding_inference, initial_state=decoder_states_inputs)
decoder_states = [state_h_dec, state_c_dec]
decoder_outputs_inference = Dense(num_tokens, activation='softmax')(decoder_outputs_inference)

dec_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs_inference] + decoder_states)


# Set the weights from the trained model to the inference model
dec_model.layers[4].set_weights(model.layers[7].get_weights())  # Transfer weights for LSTM
dec_model.layers[5].set_weights(model.layers[8].get_weights())  # Transfer weights for Dense


dec_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 512)    33793024    ['input_2[0][0]']                
                                                                                                  
 input_3 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 512)]        0           []                               
                                                                                            

## Chat

In [27]:
def chat_with_model(user_input, enc_model, dec_model, tokenizer, max_len):
    # Tokenize the user input
    user_input_seq = tokenizer.texts_to_sequences([user_input])
    user_input_seq = pad_sequences(user_input_seq, maxlen=max_len)

    # Use the encoder model to get the states
    states_value = enc_model.predict(user_input_seq)

    # Generate an empty target sequence
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index.get('start', 1)  # Assuming 'start' token is correctly indexed

    # Sampling loop for a batch of sequences
    decoded_sentence = ''
    while True:
        output_tokens, h, c = dec_model.predict([target_seq] + states_value)
        print("Output Tokens (probabilities):", output_tokens[0, -1])  # Detailed probabilities
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = tokenizer.index_word.get(sampled_token_index, '<UNK>')  # Handle unknown tokens
        print("Sampled Token Index:", sampled_token_index, "Sampled Char:", sampled_char)

        if sampled_char == 'end' or len(decoded_sentence) > max_len:
            break
        if sampled_char != '<UNK>':
            decoded_sentence += ' ' + sampled_char

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip()

# Usage example:
user_input = "hello"
response = chat_with_model(user_input, enc_model, dec_model, tokenizer, max_length)
print("Chatbot:", response)


Output Tokens (probabilities): [9.8549379e-03 4.1672221e-04 2.3784637e-03 ... 9.2610840e-08 9.2345012e-08
 9.2097345e-08]
Sampled Token Index: 0 Sampled Char: <UNK>
Output Tokens (probabilities): [5.0447635e-02 3.5301011e-04 1.3276372e-02 ... 7.4717697e-08 7.4503376e-08
 7.4303635e-08]
Sampled Token Index: 0 Sampled Char: <UNK>
Output Tokens (probabilities): [7.0198983e-02 3.3630358e-04 1.9506479e-02 ... 6.6901976e-08 6.6710136e-08
 6.6531349e-08]
Sampled Token Index: 0 Sampled Char: <UNK>
Output Tokens (probabilities): [9.6011415e-02 2.8783325e-04 1.9973576e-02 ... 6.1549869e-08 6.1373385e-08
 6.1208958e-08]
Sampled Token Index: 0 Sampled Char: <UNK>
Output Tokens (probabilities): [1.6345358e-01 2.2530931e-04 2.1299282e-02 ... 5.2255199e-08 5.2105413e-08
 5.1965767e-08]
Sampled Token Index: 0 Sampled Char: <UNK>
Output Tokens (probabilities): [2.7073160e-01 2.0124503e-04 2.0984907e-02 ... 4.3672358e-08 4.3547132e-08
 4.3430465e-08]
Sampled Token Index: 0 Sampled Char: <UNK>
Output Tok

KeyboardInterrupt: 

In [23]:
# Check if 'hello' is in the tokenizer's vocabulary
print("'hello' in tokenizer:", 'hello' in tokenizer.word_index)

# Print the index of 'hello' if it exists
if 'hello' in tokenizer.word_index:
    print("Index of 'hello':", tokenizer.word_index['hello'])

# user_input = "you are asking me out"
# response = chat_with_model(user_input, enc_model, dec_model, tokenizer, max_length)
# print("Chatbot:", response)

'hello' in tokenizer: True
Index of 'hello': 343


## Save Model

In [15]:
model.save('model/chatbot')



INFO:tensorflow:Assets written to: model/chatbot\assets


INFO:tensorflow:Assets written to: model/chatbot\assets


## Load Model

In [14]:

# Specify the path to the saved model file
model_path = 'model/chatbot'

# Load the model
model = tf.keras.models.load_model(model_path)
