In [1]:
import re
import json

In [2]:
# Importing the dataset
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')

In [3]:
# Creating a dictionary that maps each line and its id
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [4]:
# Creating a list of all of the conversations
conversations_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))

In [5]:
# Getting separately the questions and the answers
questions = []
answers = []
for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])

In [6]:
# Doing a first cleaning of the texts
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    return text

In [7]:
# Cleaning the questions
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))

In [8]:
# Cleaning the answers
clean_answers = []
for answer in answers:
    clean_answers.append(clean_text(answer))

In [9]:
# Filtering out the questions and answers that are too short or too long
short_questions = []
short_answers = []
i = 0
for question in clean_questions:
    if 2 <= len(question.split()) <= 25:
        short_questions.append(question)
        short_answers.append(clean_answers[i])
    i += 1
clean_questions = []
clean_answers = []
i = 0
for answer in short_answers:
    if 2 <= len(answer.split()) <= 25:
        clean_answers.append(answer)
        clean_questions.append(short_questions[i])
    i += 1

In [10]:
clean_answers

['well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 "okay then how 'bout we try out some french cuisine  saturday  night",
 'forget it',
 'seems like she could get a date easy enough',
 'that is a shame',
 'let me see what i can do',
 'right  see  you are ready for the quiz',
 'forget french',
 "well there's someone i think might be ",
 'i counted on you to help my cause you and that thug are obviously failing are not we ever going on our date',
 'you are sweet',
 "eber's deep conditioner every two days and i never ever use a blowdryer without the diffuser attachment",
 'i really really really wanna go but i ca not  not unless my sister goes',
 "i am workin' on it but she does not seem to be goin' for him",
 'lesbian  no i found a picture of jared leto in one of her drawers so i am pretty sure she is not harboring samesex tendencies',
 'so that is the kind of guy she likes pretty ones',
 'who knows  al

In [11]:
clean_questions

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
 'well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 'you are asking me out  that is so cute what is your name again',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i ca not date until she does',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'gosh if only we could find kat a boyfriend',
 "c'esc ma tete this is my head",
 "that is because it's such a nice one",
 'how is our little find the wench a date plan progressing',
 'you got something on your mind',
 'you have my word  as a gentleman',
 'how do you get your hair to look like that',
 'sure have',
 'i really really really wanna go but i ca not  not unless my sister goes',
 'she is n

In [12]:
len(clean_answers)

155117

In [13]:
len(clean_questions)

155117

In [26]:
intents = {
    "intents": []
}
for i in range(len(clean_answers)):
    intent = {
        "tag": str(i),
        "patterns": [],
        "responses": [],
        "context_set": ""
    }
    intent["patterns"].append(clean_questions[i])
    intent["responses"].append(clean_answers[i])
    intents["intents"].append(intent)

In [27]:
intents["intents"].append({"tag": "greeting",
         "patterns": ["Hi", "How are you", "Is anyone there?", "Hello", "Good day", "Whats up"],
         "responses": ["Hello!", "Good to see you again!", "Hi there, how can I help?"],
         "context_set": ""
        })

In [28]:
intents["intents"].append(
        {"tag": "goodbye",
         "patterns": ["cya", "See you later", "Goodbye", "I am Leaving", "Have a Good day"],
         "responses": ["Sad to see you go :(", "Talk to you later", "Goodbye!"],
         "context_set": ""
        })

In [29]:
intents["intents"].append(
        {"tag": "age",
         "patterns": ["how old", "how old is tim", "what is your age", "how old are you", "age?"],
         "responses": ["I am 18 years old!", "18 years young!"],
         "context_set": ""
        })

In [30]:
intents["intents"].append(
        {"tag": "name",
         "patterns": ["what is your name", "what should I call you", "whats your name?"],
         "responses": ["You can call me ChatBot.", "I'm a ChatBot!", "I'm ChatBot aka ChatBot from Scratch."],
         "context_set": ""
        })

In [31]:
intents

{'intents': [{'tag': '0',
   'patterns': ['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again'],
   'responses': ['well i thought we would start with pronunciation if that is okay with you'],
   'context_set': ''},
  {'tag': '1',
   'patterns': ['well i thought we would start with pronunciation if that is okay with you'],
   'responses': ['not the hacking and gagging and spitting part  please'],
   'context_set': ''},
  {'tag': '2',
   'patterns': ['not the hacking and gagging and spitting part  please'],
   'responses': ["okay then how 'bout we try out some french cuisine  saturday  night"],
   'context_set': ''},
  {'tag': '3',
   'patterns': ['you are asking me out  that is so cute what is your name again'],
   'responses': ['forget it'],
   'context_set': ''},
  {'tag': '4',
   'patterns': ['the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i ca not date until s

In [32]:
with open("intents.json", "w") as outfile:  
    json.dump(intents, outfile)