In [200]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string
import re

In [201]:
# Load the question/answer conversation pairs. The path is relative, so the CSV
# must sit in the notebook's working directory.
dataset = pd.read_csv('Conversation.csv')

In [202]:
# Show the loaded frame to inspect its columns and a sample of rows.
dataset

Unnamed: 0.1,Unnamed: 0,question,answer
0,0,"hi, how are you doing?",i'm fine. how about yourself?
1,1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,3,no problem. so how have you been?,i've been great. what about you?
4,4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...,...
3720,3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,3721,are you right-handed?,yes. all my life.
3722,3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [203]:
# 'Unnamed: 0' holds row numbers rather than conversation text (presumably an
# index column saved with the CSV). errors='ignore' keeps this cell re-runnable:
# `dataset` is rebound here, so a second run would otherwise raise KeyError for
# the column that is already gone.
dataset = dataset.drop(columns=['Unnamed: 0'], errors='ignore')

# Parallel Series: answers[i] is the reply to questions[i].
questions = dataset['question']
answers = dataset['answer']

In [204]:
# Fetch the NLTK resources used by the preprocessing functions below.
# Each call returns a status flag; the last one is what the cell displays.
nltk.download('punkt')      # tokenizer models — presumably what nltk.word_tokenize loads; confirm for the installed NLTK version
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer
nltk.download('stopwords')  # word lists read by stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bisto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bisto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bisto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [205]:
# Individual characters that count as punctuation when filtering tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# Filled on first use so the stop-word corpus is read and the lemmatizer is
# built once, instead of once per sentence.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def _is_punctuation(token):
    """Return True when every character of ``token`` is an ASCII punctuation mark."""
    return all(char in _PUNCTUATION_CHARS for char in token)


def preprocess(data):
    """Turn raw text into lowercase, lemmatized tokens without stop words or punctuation.

    Args:
        data: A string, or any non-string iterable of strings (list, tuple,
            pandas Series, ...). Iterables are joined with single spaces
            before tokenizing.

    Returns:
        list[str]: The surviving tokens in their original order. Contraction
        fragments that contain letters (e.g. ``"'ll"``) are kept.

    Example:
        ``preprocess("This is a sample text, with punctuation!")`` gives
        ``['sample', 'text', 'punctuation']``.
    """
    # Any non-string iterable is collapsed into one string; a plain str (or
    # bytes) is passed through untouched.
    if not isinstance(data, (str, bytes)) and hasattr(data, '__iter__'):
        data = ' '.join(data)

    stop_words, lemmatizer = _preprocess_resources()

    lowered = (word.lower() for word in nltk.word_tokenize(data))

    # A token is punctuation when *all* of its characters are punctuation.
    # Testing `word in string.punctuation` would be a substring check against
    # one long string, which lets multi-character tokens such as "..." or
    # "``" slip through.
    return [
        lemmatizer.lemmatize(word)
        for word in lowered
        if word not in stop_words and not _is_punctuation(word)
    ]

In [206]:
def preprocess_answers(data):
    """Tokenize, lowercase and lemmatize text, keeping stop words and punctuation.

    Args:
        data: A string, or a non-string iterable of strings that is first
            joined with single spaces.

    Returns:
        list[str]: One lowercase, lemmatized token per tokenizer output,
        punctuation tokens included.
    """
    is_text = isinstance(data, (str, bytes))
    if not is_text and hasattr(data, '__iter__'):
        # Lists, tuples and other iterables become one space-separated string.
        data = ' '.join(data)

    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(raw.lower()) for raw in nltk.word_tokenize(data)]

In [207]:
# Quick check of preprocess(): stop words and punctuation tokens should be
# missing from the printed token list.
data = "This is a sample text, with punctuation! They'll see how it preprocesses this."
print(preprocess(data))

['sample', 'text', 'punctuation', "'ll", 'see', 'preprocesses']


In [208]:
# Same sentence through preprocess_answers(): stop words and punctuation tokens
# are kept; only lowercasing and lemmatization are applied.
data = "This is a sample text, with punctuation! They'll see how it preprocesses this."
print(preprocess_answers(data))

['this', 'is', 'a', 'sample', 'text', ',', 'with', 'punctuation', '!', 'they', "'ll", 'see', 'how', 'it', 'preprocesses', 'this', '.']


In [209]:
# Clean every question and answer; each entry becomes a list of tokens.
# NOTE(review): answers also go through preprocess(), so preprocess_answers()
# defined above is never applied to them — confirm that stripping stop words
# and punctuation from the training targets is intended.
processed_questions = list(map(preprocess, questions))
processed_answers = list(map(preprocess, answers))

In [210]:
# Learn the question vocabulary from the token lists, then encode each token
# list as a list of integer ids.
# NOTE: no oov_token is configured; with the Keras Tokenizer defaults, words not
# seen during fit_on_texts are skipped when new text is encoded later.
question_tokenizer = Tokenizer()
question_tokenizer.fit_on_texts(processed_questions)
question_sequences = question_tokenizer.texts_to_sequences(processed_questions)

In [211]:
# Separate tokenizer for the answer side, so answer ids are independent of the
# question ids. Fit on the answer token lists, then encode them as integer ids.
answer_tokenizer = Tokenizer()
answer_tokenizer.fit_on_texts(processed_answers)
answer_sequences = answer_tokenizer.texts_to_sequences(processed_answers)

In [212]:
# Word -> integer-id lookups learned by each tokenizer.
question_word_index = question_tokenizer.word_index
answer_word_index = answer_tokenizer.word_index

# One extra slot on top of the word count: Keras Tokenizer ids start at 1,
# leaving id 0 free for the padding value.
question_vocab_size = 1 + len(question_word_index)
answer_vocab_size = 1 + len(answer_word_index)

In [213]:
# Length of the longest encoded sequence on each side; used as the padding width.
max_question_len = max(map(len, question_sequences))
max_answer_len = max(map(len, answer_sequences))

In [214]:
# Right-pad (padding='post') every sequence to its side's maximum length so
# each side forms a rectangular integer array.
questions_pad = pad_sequences(
    question_sequences,
    maxlen=max_question_len,
    padding='post',
)
answers_pad = pad_sequences(
    answer_sequences,
    maxlen=max_answer_len,
    padding='post',
)

In [215]:
# Report the vocabulary sizes and padding widths that size the model.
for label, value in (
    ("Question Vocabulary Size:", question_vocab_size),
    ("Answer Vocabulary Size:", answer_vocab_size),
    ("Max Question Length:", max_question_len),
    ("Max Answer Length:", max_answer_len),
):
    print(label, value)

Question Vocabulary Size: 2009
Answer Vocabulary Size: 2064
Max Question Length: 12
Max Answer Length: 12


In [216]:
# Reshape is imported here but not referenced by any cell visible in this notebook.
from tensorflow.keras.layers import Reshape

# Per-position word predictor: question ids -> embeddings -> LSTM that emits an
# output at every timestep (return_sequences=True) -> softmax over the answer
# vocabulary for each position.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=question_vocab_size,
        output_dim=128,
        input_length=max_question_len,
    )
)
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.LSTM(128, return_sequences=True))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(answer_vocab_size, activation='softmax'))

# Targets are integer word ids, hence the sparse variant of cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()



In [217]:
# Print both padded array shapes, one per line, to confirm they line up before training.
print(questions_pad.shape, answers_pad.shape, sep='\n')

(3725, 12)
(3725, 12)


In [218]:
# Train on the padded question/answer arrays; `history` holds what fit() returns.
history = model.fit(
    x=questions_pad,
    y=answers_pad,
    batch_size=32,
    epochs=50,
)

Epoch 1/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.6871 - loss: 4.6258
Epoch 2/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7260 - loss: 2.2799
Epoch 3/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.7252 - loss: 2.1644
Epoch 4/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7247 - loss: 2.0826
Epoch 5/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7279 - loss: 2.0115
Epoch 6/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7254 - loss: 1.9899
Epoch 7/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7255 - loss: 1.9727
Epoch 8/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7304 - loss: 1.9157
Epoch 9/50
[1m117/117[0m [32m

In [219]:
def generate_response(input_text):
    """Produce the model's reply to ``input_text`` as a space-separated string.

    The prompt is cleaned with ``preprocess``, encoded with the question
    tokenizer and right-padded to ``max_question_len``. For every output
    position the highest-scoring answer id is looked up in the answer
    tokenizer; ids that have no word (such as 0) are skipped.

    Args:
        input_text: The user's prompt.

    Returns:
        str: The predicted words joined by single spaces; may be empty.
    """
    # Encode the prompt the same way the training questions were encoded.
    prompt_tokens = preprocess(input_text)
    encoded = question_tokenizer.texts_to_sequences([prompt_tokens])
    encoded = pad_sequences(encoded, maxlen=max_question_len, padding='post')

    # Batch of one: take the only prediction, one score vector per position.
    step_scores = model.predict(encoded)[0]

    # Best id per position -> word, with '' standing in for unknown ids.
    words = (
        answer_tokenizer.index_word.get(np.argmax(scores), '')
        for scores in step_scores
    )
    return ' '.join(word for word in words if word)

In [220]:
# Smoke test with a prompt that appears verbatim among the training questions.
input_text = "are you right-handed?"
response = generate_response(input_text)
print("Generated Response:", response)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
Generated Response: yes life
