In [143]:
# Install packages
!pip install opendatasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [144]:
import pandas as pd
import numpy as np
import opendatasets as od
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tf.__version__

'2.12.0'

In [145]:
# Downloads

# for nltk
nltk.download('punkt')

# Dataset
# {"username":"noahagonzo","key":"b7e4d1aedc1148c648f8fcef1ab58905"}
od.download("https://www.kaggle.com/datasets/tovarischsukhov/southparklines")

# stopwords
nltk.download('stopwords')


Skipping, found downloaded files in "./southparklines" (use force=True to force download)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [146]:
# Import training data
# TODO: Remove this to get all data ----------------------------------------------------------------------------------------------------------------
df = pd.read_csv('southparklines/All-seasons.csv', header=0, encoding='utf-8')

print(df.shape)
print(df.head())

(70896, 4)
  Season Episode Character                                               Line
0     10       1      Stan         You guys, you guys! Chef is going away. \n
1     10       1      Kyle                        Going away? For how long?\n
2     10       1      Stan                                         Forever.\n
3     10       1      Chef                                  I'm sorry boys.\n
4     10       1      Stan  Chef said he's been bored, so he joining a gro...


In [147]:
# Define a function for preprocessing text to remove special characters and alter format of expressive words
def preprocess_text(text):
  text = text.lower()

  # Modify text
  text = text.replace("\n","")
  punc = [".", "!", "?", ","] # Remove punctuation that messes up regex
  for p in punc:
    text = re.sub("[" + p + "]", " " + p, text)
  text = re.sub(r"o[h]{2,}", "oh", text)
  text = re.sub(r"a[h]{2,}", "oh", text)

  # Tokenize to remove stopwords and punctuation
  tokens = word_tokenize(text)
  stop_words = set(stopwords.words('english')) # TODO: try removing stopwords and keeping punctionation
  tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation] 

  return text


In [148]:
# Preprocess text
text = []

for line in df.Line:
  text.append(preprocess_text(line))


In [149]:
# Create a dictionary for frequencies
vocab = {}
for line in text:
  for word in line.split():
    if word not in vocab:
      vocab[word] = 1
    else:
      vocab[word] += 1

print("Size of vocab:", len(vocab))
print(vocab)

Size of vocab: 33607


In [150]:
# MAYBE: SEt a threshold so only words that are used more than 3 times are included in the vocab


In [151]:
# Create tokenizer
tokenizer = Tokenizer(num_words=len(vocab), oov_token="<OOV>") #num_word = max number of words to keep, based on word frequency
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index

In [153]:
# Create sequences
maxlen=300
sequences = tokenizer.texts_to_sequences(text)
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding="post")

In [154]:
# Create training data
x_train = padded_sequences
y_train = padded_sequences

In [155]:
# Build model
model = models.Sequential()
model.add(layers.Input(shape=(maxlen,)))
model.add(layers.Dense(maxlen, input_dim=len(vocab), kernel_initializer='normal', activation='relu'))
model.add(layers.Dense(maxlen, input_dim=len(vocab), kernel_initializer='normal', activation='relu'))

In [156]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [157]:
# Train model
history = model.fit(x_train, y_train,batch_size=100,epochs=10,verbose=1, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [159]:
# Define function to predict answer
def predict_answer(model, tokenizer, question):
    # Preprocess question
    question = preprocess_text(question)
    # Convert question to sequence
    sequence = tokenizer.texts_to_sequences([question])
    # Pad sequence
    padded_sequence = pad_sequences(sequence, maxlen=maxlen, padding="post", truncating="post")
    # Predict answer
    pred = model.predict(padded_sequence)[0]
    # Get index of highest probability
    idx = np.argmax(pred)
    # Get answer
    answer = tokenizer.index_word[idx]
    return answer

# Start chatbot
while True:
    question = input('You: ')
    answer = predict_answer(model, tokenizer, question)
    print('Chatbot:', answer)

You: Hi
Chatbot: it
You: no
Chatbot: it
You: Cartman
Chatbot: it
You: Fat
Chatbot: it
You: mecha
Chatbot: it


KeyboardInterrupt: ignored