<a href="https://colab.research.google.com/github/TrevinWacker/NLP-practice/blob/main/nlp_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter

from sqlalchemy import create_engine

In [None]:
# Download spaCy's English model so that spacy.load('en') can be used later on
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


Cornell Movie Dialogue

---

In [None]:
# Connection settings for the Cornell Movie Dialogs Postgres database.
# Every value can be overridden through an environment variable so that
# credentials do not have to be edited into the notebook; the fallbacks are
# the values this notebook has always used.
import os
import warnings

postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'cornell_movie_dialogs')

# NOTE: this silences every warning for the rest of the session
warnings.filterwarnings("ignore")

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Pull the whole dialogs table into a DataFrame; release the engine's
# connections even if the query fails
try:
  movies_df = pd.read_sql_query('SELECT * FROM dialogs', con=engine)
finally:
  engine.dispose()

In [None]:
# Summarize the loaded table: row count, columns, dtypes and memory usage
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304446 entries, 0 to 304445
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   index    304446 non-null  int64 
 1   dialogs  304446 non-null  object
dtypes: int64(1), object(1)
memory usage: 4.6+ MB


In [None]:
# Limiting due to memory issues
# movies_df = movies_df.loc[:200000]

In [None]:
# Build the spaCy pipeline used to read the dialog text. The parser and NER
# components are disabled and a sentencizer is added to the pipeline.
nlp = spacy.load('en', disable=['parser', 'ner'])
sentence_splitter = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentence_splitter)

# Raise the character limit above spaCy's default so the full corpus fits
nlp.max_length = 20000000

# Join every entry of the dialogs column into one space-separated string and
# run it through the pipeline as a single document
quotes = nlp(
    " ".join(movies_df.dialogs)
)

In [None]:
# Initial analysis: print the type and size of the parsed document, a preview
# of its first hundred tokens, and the type of an individual token
analysis_lines = [
    ("'quotes' is a {} object.", type(quotes)),
    ("It is {} tokens long", len(quotes)),
    ("The first one hundred tokens are '{}'", quotes[:100]),
    ("The type of each token is {}", type(quotes[0])),
]

for template, value in analysis_lines:
  print(template.format(value))

'quotes' is a <class 'spacy.tokens.doc.Doc'> object.
It is 2784977 tokens long
The first one hundred tokens are 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. Well, I thought we'd start with pronunciation, if that's okay with you. Not the hacking and gagging and spitting part.  Please. Okay... then how 'bout we try out some French cuisine.  Saturday?  Night? You're asking me out.  That's so cute. What's your name again? Forget it. No, no, it'
The type of each token is <class 'spacy.tokens.token.Token'>


In [None]:
# Collect the text of every sentence found in the parsed corpus to act as
# training data, skipping sentences whose text is one character or shorter
quotes_sents = []
for sent in quotes.sents:
  sentence_text = sent.text
  if len(sentence_text) > 1:
    quotes_sents.append(sentence_text)

# Preview the first five sentences
quotes_sents[:5]

['Can we make this quick?',
 ' Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.',
 ' Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.']

# Develop a chatbot using this corpus. In doing this, you're free to choose a chatbot development library like ChatterBot or write your own code from scratch.

---

I'm going to do both for the sake of practice, starting with the self-made one.

In [None]:
# Starting with greetings

greetings_input = ["hello", "hi", "greetings", "sup", "yo dawg", "oh hi", "hola"]
greetings_output = ["hello to you", "oh hi", "hi, thanks for chatting", "Bonjour! (Practicing my French)", "greetings"]

def greeting(user_input):
  """Return a random canned greeting if ``user_input`` contains a greeting.

  Matching is case-insensitive, ignores punctuation around words (so
  "Hello!" matches "hello") and works on whole words/phrases, which lets
  multi-word entries such as "yo dawg" match while "this" does not match "hi".

  Args:
    user_input: The raw text typed by the user.

  Returns:
    A string picked at random from ``greetings_output`` when a greeting from
    ``greetings_input`` is present, otherwise ``None``.
  """
  # Replace punctuation (except apostrophes and hyphens) with spaces, then
  # collapse the text to single-space-separated lowercase words
  words = re.sub(r"[^\w\s'-]", " ", user_input.lower()).split()
  # Pad with spaces so that only whole words/phrases can match
  padded_text = " {} ".format(" ".join(words))

  for phrase in greetings_input:
    if " {} ".format(phrase) in padded_text:
      return random.choice(greetings_output)
  return None

In [None]:
def user_input(user_text):
  """Return the corpus sentence that is most similar to ``user_text``.

  The user's text is split into sentences with the spaCy pipeline ``nlp``,
  vectorized with TF-IDF together with the sentences in ``quotes_sents``, and
  compared to every corpus sentence by cosine similarity. The corpus sentence
  with the highest similarity to any of the user's sentences is returned.

  The global ``quotes_sents`` list is never modified, so user input cannot
  leak into the corpus and come back as a later response.

  Args:
    user_text: The text typed by the user.

  Returns:
    The best-matching sentence from ``quotes_sents``, or a fallback reply
    when the input is empty, the corpus is empty, or nothing in the corpus
    shares any vocabulary with the input.
  """
  fallback = "I am sorry, I don't understand."

  # Split the user's text into sentences with the spaCy pipeline
  user_sents = [sent.text for sent in nlp(user_text).sents]
  n_quotes = len(quotes_sents)
  if not user_sents or n_quotes == 0:
    return fallback

  # Vectorize the corpus and the user's sentences together, using a new list
  # so the shared corpus is left untouched
  vectorizer = TfidfVectorizer()
  tfidf = vectorizer.fit_transform(quotes_sents + user_sents)

  # Rows: user sentences; columns: corpus sentences
  similarities = cosine_similarity(tfidf[n_quotes:], tfidf[:n_quotes])

  # For every corpus sentence keep its best score against any user sentence,
  # then pick the corpus sentence with the highest score overall
  best_per_quote = similarities.max(axis=0)
  indx = int(np.argmax(best_per_quote))

  # A score of zero means no overlap at all; argmax would otherwise silently
  # return the first sentence of the corpus
  if best_per_quote[indx] <= 0:
    return fallback
  return quotes_sents[indx]

In [None]:
# Chat loop for the self-made TF-IDF bot: greetings get a canned reply,
# a goodbye phrase ends the conversation, anything else is answered with the
# most similar corpus sentence.
goodbye_input = ["bye", "i'm leaving", "see ya", "a demain", "i have to go", "goodbye"]
goodbye_output = ["ok, goodbye", "goodbye, come back soon", "goodbyeeeeeeee"]

print("Chatbot: Bonjour!  Thanks for chatting with me.")

while True:

  # Lowercase and trim the text so matching is not defeated by case or
  # stray surrounding whitespace
  user_text = input("User: ").lower().strip()

  if user_text in goodbye_input:
    print("Chatbot: {}".format(random.choice(goodbye_output)))
    break

  # Call greeting() once: it picks its reply at random, so the value that is
  # tested must be the same value that is printed
  greeting_reply = greeting(user_text)
  if greeting_reply is not None:
    reply = greeting_reply
  else:
    reply = user_input(user_text)

  print("Chatbot: {}".format(reply))

Chatbot: Bonjour!  Thanks for chatting with me.
User: Hello
Chatbot: hello to you
User: Hi
Chatbot: hi, thanks for chatting
User: Bonjour
Chatbot: Bonjour?
User: Yes, it's French
Chatbot: French?
User: French is the language of France, are you familiar
Chatbot: Yeah, French guys from France.
User: Yes!  French men tend to be very attractive
Chatbot: Very attractive.
User: I appreciate the agreement!  How are you today?
Chatbot: How are you today?
User: I'm good, but I'm hoping my chatbot improves its performance
Chatbot: But I'm hoping.
User: What are you hoping for?
Chatbot: What are you, hoping?
User: Yes I am.  I told you.  Please listen actively to me
Chatbot: Please listen to me.
User: I am!  You're getting kind of rude with me
Chatbot: You're rude . . .
User: Wow!  I'm leaving
Chatbot: I'm leaving!
User: You can't, you're on a computer, there's nowhere for you to go!
Chatbot: There's nowhere else for it to go.
User: Exactly
Chatbot: Exactly.
User: Anyway, I need to try to make a 

--- 
The chatbot does OK.  The French discussion is somewhat coherent, and there is some direct matching for simple user inputs ("how are you today", "I'm leaving", etc.).  You can tell that the emphasis is on matching the user's words rather than on giving an appropriate reaction.

Chatterbot model

---

In [None]:
!pip install chatterbot
!pip install chatterbot corpus

Collecting corpus
[?25l  Downloading https://files.pythonhosted.org/packages/f1/b9/120d9e0ae8702a6929946b494b723a4de6c9bf3d79e8e07e239a81be4e7c/Corpus-0.4.2.tar.gz (88kB)
[K     |████████████████████████████████| 92kB 3.5MB/s 
Building wheels for collected packages: corpus
  Building wheel for corpus (setup.py) ... [?25l[?25hdone
  Created wheel for corpus: filename=Corpus-0.4.2-cp36-none-any.whl size=88799 sha256=b8fc6f5cba0b34bfdd94dde090d094cd7c3b210f56a8dff2f424badc95147627
  Stored in directory: /root/.cache/pip/wheels/9d/20/6d/214e9c84ce43f62538d4c2f6e23d412bf9a52dd0f12bc716c9
Successfully built corpus
Installing collected packages: corpus
Successfully installed corpus-0.4.2


In [None]:
from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer

# Use dialogs as vocabulary
chatbot = ChatBot('quotes')
# Drop the bot's storage before training -- presumably to clear data left by
# a previous run; confirm against the ChatterBot storage adapter docs
chatbot.storage.drop()

# Keep the trainer object itself in `listtrainer`; train() is called for its
# side effect of teaching the bot the movie sentences
listtrainer = ListTrainer(chatbot)
listtrainer.train(quotes_sents)

List Trainer: [####################] 100%


In [None]:
# Try the ChatterBot bot: greetings get a canned reply, a goodbye phrase ends
# the conversation, anything else is answered by the trained ChatterBot.
goodbye_input = ["bye", "i'm leaving", "see ya", "a demain", "i have to go", "goodbye"]
goodbye_output = ["ok, goodbye", "goodbye, come back soon", "goodbyeeeeeeee"]

print("Chatbot: Bonjour!  Thanks for chatting with me.")

while True:

  # Lowercase and trim the text so matching is not defeated by case or
  # stray surrounding whitespace
  user_text = input("User: ").lower().strip()

  if user_text in goodbye_input:
    print("Chatbot: {}".format(random.choice(goodbye_output)))
    break

  # Call greeting() once: it picks its reply at random, so the value that is
  # tested must be the same value that is printed
  greeting_reply = greeting(user_text)
  if greeting_reply is not None:
    reply = greeting_reply
  else:
    reply = chatbot.get_response(user_text)

  print("Chatbot: {}".format(reply))

Chatbot: Bonjour!  Thanks for chatting with me.
User: bonjour
Chatbot: Monsieur?
User: Oui, je suis un homme
Chatbot: He understands.
User: Are you the he?
Chatbot: As a gentleman You're sweet.
User: Why thank you!  I like to think that I'm pretty nice
Chatbot: Eber's Deep Conditioner every two days.
User: I don't usually wash my hair every time
Chatbot: No.
User: No what?
Chatbot: No, I won't go with you Why not?
User: I feel like you're sentence is running on a little too long
Chatbot: Eber's Deep Conditioner every two days.
User: I wonder why you said that
Chatbot: Half is fair.
User: This isn't quite as riveting a conversation as I had with my other chatbot.
Chatbot: They convinced her to let them buy her a meal in the city.
User: I didn't know chatbots could eat!
Chatbot: Joey Dorsey?
User: That's your name?
Chatbot: Forget it.
User: Ok
Chatbot: Not the hacking and gagging and spitting part.
User: Ask me a question
Chatbot: Where were you planning to stay during the two weeks that

---

This chatbot had a larger variety of responses than the self-made one, though they weren't necessarily coherent.  One response was also repeated verbatim for two very different user inputs.