# NLP

In [1]:
import pandas as pd
import numpy as np
import tensorflow.compat.v1 as tf
import re
import time
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

### Load Data

In [2]:
# Load the data
with open('datasets/cornell/movie_lines.tsv', encoding='utf-8', errors='ignore') as f:
    lines = [line.strip().split('\t') for line in f]
with open('datasets/cornell/movie_conversations.tsv', encoding='utf-8', errors='ignore') as f:
    conv_lines = [line.strip().split('\t') for line in f]

# Convert the fourth field into a list
for line in conv_lines:
    line[3] = line[3][1:-1].replace("'", "").split()

In [3]:
# The sentences that we will be using to train our model.
lines[:5]

[['L1045', 'u0', 'm0', 'BIANCA', 'They do not!'],
 ['L1044', 'u2', 'm0', 'CAMERON', 'They do to!'],
 ['L985', 'u0', 'm0', 'BIANCA', 'I hope so.'],
 ['L984', 'u2', 'm0', 'CAMERON', 'She okay?'],
 ['L925', 'u0', 'm0', 'BIANCA', "Let's go."]]

In [4]:
# The sentences' ids, which will be processed to become our input and target data.
conv_lines[:5]

[['u0', 'u2', 'm0', ['L194', 'L195', 'L196', 'L197']],
 ['u0', 'u2', 'm0', ['L198', 'L199']],
 ['u0', 'u2', 'm0', ['L200', 'L201', 'L202', 'L203']],
 ['u0', 'u2', 'm0', ['L204', 'L205', 'L206']],
 ['u0', 'u2', 'm0', ['L207', 'L208']]]

In [5]:
# Create a dictionary to map each line's id with its text
id2line = {}
for line in lines:
    if len(line) == 5:
        id2line[line[0]] = line[4]

convs = [line[3] for line in conv_lines]

convs[:10]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366']]

In [6]:
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []

for conv in convs:
    for i in range(len(conv)-1):
        if conv[i] in id2line and conv[i+1] in id2line:
            questions.append(id2line[conv[i]])
            answers.append(id2line[conv[i+1]])


# Check if we have loaded the data correctly
limit = 0
for i in range(limit, limit+5):
    print(questions[i])
    print(answers[i])
    print()

# Compare lengths of questions and answers
print("Questions Length:", len(questions), "| Answers Length:", len(answers))

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well I thought we'd start with pronunciation if that's okay with you.

Well I thought we'd start with pronunciation if that's okay with you.
Not the hacking and gagging and spitting part.  Please.

Not the hacking and gagging and spitting part.  Please.
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

You're asking me out.  That's so cute. What's your name again?
Forget it.

No no it's my fault -- we didn't have a proper introduction ---
Cameron.

Questions Length: 215238 | Answers Length: 215238


In [7]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [8]:
# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
    
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))


# Take a look at some of the data to ensure that it has been cleaned well.
limit = 0
for i in range(limit, limit+5):
    print(clean_questions[i])
    print(clean_answers[i])
    print()

can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
well i thought we would start with pronunciation if that is okay with you

well i thought we would start with pronunciation if that is okay with you
not the hacking and gagging and spitting part  please

not the hacking and gagging and spitting part  please
okay then how about we try out some french cuisine  saturday  night

you are asking me out  that is so cute that is your name again
forget it

no no it is my fault  we did not have a proper introduction 
cameron



## Tokenizing

In [9]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_questions + clean_answers)

# Convert the text to sequences of integers
questions_seq = tokenizer.texts_to_sequences(clean_questions)
answers_seq = tokenizer.texts_to_sequences(clean_answers)

# Pad the sequences so they're all the same length
questions_seq = pad_sequences(questions_seq)
answers_seq = pad_sequences(answers_seq)

# Split the data into a training set and a validation set
questions_train, questions_val, answers_train, answers_val = train_test_split(
    questions_seq, answers_seq, test_size=0.2)

# Model

### Hyper-parameters

In [10]:
# Define the parameters of your model
batch_size = 64
epochs = 100
latent_dim = 256
num_samples = 10000
num_tokens = len(tokenizer.word_index) + 1  # number of unique tokens in your text data

In [11]:
# Define the encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_tokens, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Define the decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_tokens, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & train the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

2024-04-08 12:30:53.968948: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-08 12:30:53.969439: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-08 12:30:53.970085: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [None]:
# Shift the answers to the right by one timestep
answers_output = np.zeros_like(answers_train)
answers_output[:, 1:] = answers_train[:, :-1]
answers_output[:, 0] = tokenizer.word_index['start']

# Add an extra dimension to the end of the array
answers_output = np.expand_dims(answers_output, -1)

In [14]:
model.fit([questions_train, answers_train], answers_output, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Epoch 1/100


2024-04-08 12:26:35.709137: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-04-08 12:26:35.819371: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-08 12:26:35.819890: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-08 12:26:35.820368: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG IN

ValueError: in user code:

    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/engine/training.py", line 1051, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/engine/training.py", line 1109, in compute_loss
        return self.compiled_loss(
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/losses.py", line 1984, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/Users/ryanbussert/Jupyter/data-playground/.conda/lib/python3.11/site-packages/keras/backend.py", line 5559, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 498) and (None, 498, 66002) are incompatible


## Chat

In [None]:
def chat_with_model(user_input, enc_model, dec_model, tokenizer, max_len):
    # Tokenize the user input
    user_input_seq = tokenizer.texts_to_sequences([user_input])
    user_input_seq = pad_sequences(user_input_seq, maxlen=max_len)

    # Use the encoder model to get the states
    states_value = enc_model.predict(user_input_seq)

    # Generate an empty target sequence
    target_seq = np.zeros((1, 1))

    # Populate the first character of the target sequence with the start character
    target_seq[0, 0] = tokenizer.word_index['start']

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = dec_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_char

        # Exit condition: either hit max length or find stop character
        if (sampled_char == 'end' or len(decoded_sentence) > max_len):
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [None]:
user_input = "Hello"
response = chat_with_model(user_input, enc_model, dec_model, tokenizer, max_len)
print("Chatbot:", response)