### Dependencies

In [1]:
import pandas as pd
import numpy as np
import string
import re

### Load data 

In [2]:
# Relative path to the Excel workbook that pd.read_excel loads into `customer_sentences`.
preproc_data = "../botData/preproc/sentences_output.xlsx"

In [3]:
# Read the workbook into a DataFrame and preview its first ten rows.
customer_sentences = pd.read_excel(io=preproc_data)
customer_sentences.head(n=10)

Unnamed: 0.1,Unnamed: 0,english,_czech,_intent
0,0.0,Can I have the menu please?,Můžete mi přinést jídelní lístek?\t\t,ask_menu
1,1.0,What is the soup of the day?,Jaká je polévka dne?\t\t,ask_info
2,2.0,What are today's specials?,Jaké jsou speciality dne?\t\t,ask_info
3,5.0,Do you have any vegetarian meatless dishes?,Máte nějaká vegetariánská / bezmasá jídla?\t\t,ask_info
4,6.0,I'll have,Dám si,order_food
5,9.0,Check please!,Zaplatíme!\t\t,pay_order
6,10.0,Do you take credit cards?,Berete kreditní karty?\t\t,pay_order
7,11.0,Is service included?,Je obsluha v ceně?\t\t,ask_info
8,12.0,I'm on a diet,Mám dietu. / Držím dietu.\t\t,fallback
9,13.0,I have an allergy to,Mám alergii na,inform_allergy


<p> As can be seen above, the '_czech' column needs some cleaning: we remove the remaining control characters (tabs) and punctuation, and lowercase all the sentences. Accents are stripped later by the vectorizer.</p>

### Additional Preparation steps Helpers

#### Lowercase

In [4]:
def lowercase(sentence):
    """Return a copy of `sentence` with every character converted to lowercase."""
    lowered = sentence.lower()
    return lowered

#### Remove string control characters

In [5]:
def remove_tags(sentence):
    """Return `sentence` with all newline, carriage-return and tab characters deleted.

    The characters are removed outright (not replaced by a space), so text on
    either side of a control character ends up joined together.
    """
    return re.sub(r'[\n\r\t]', "", sentence)

#### Remove punctuation

In [6]:
def remove_punctuation(sentence):
    """Return `sentence` with every character of `string.punctuation` deleted.

    Only ASCII punctuation is affected; whitespace and letters (including
    accented ones) are left as they are.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return sentence.translate(deletion_table)

#### Steps Applied
<p> Only the Czech column is relevant: it is the one we will use to learn the vector representation. Therefore, the steps are applied only to this dataframe column: </p>

In [7]:
# Run the three cleaning helpers over the `_czech` column, in order:
# lowercase -> drop control characters -> drop punctuation.
customer_sentences["_czech"] = (
    customer_sentences["_czech"]
    .apply(lowercase)
    .apply(remove_tags)
    .apply(remove_punctuation)
)

<p> Observe the obtained output: </p>

In [8]:
# Preview the first ten rows again to check the cleaned `_czech` column.
customer_sentences.head(n=10)

Unnamed: 0.1,Unnamed: 0,english,_czech,_intent
0,0.0,Can I have the menu please?,můžete mi přinést jídelní lístek,ask_menu
1,1.0,What is the soup of the day?,jaká je polévka dne,ask_info
2,2.0,What are today's specials?,jaké jsou speciality dne,ask_info
3,5.0,Do you have any vegetarian meatless dishes?,máte nějaká vegetariánská bezmasá jídla,ask_info
4,6.0,I'll have,dám si,order_food
5,9.0,Check please!,zaplatíme,pay_order
6,10.0,Do you take credit cards?,berete kreditní karty,pay_order
7,11.0,Is service included?,je obsluha v ceně,ask_info
8,12.0,I'm on a diet,mám dietu držím dietu,fallback
9,13.0,I have an allergy to,mám alergii na,inform_allergy


#### Vectorizing Czech. Baseline Experiment
<p> Because Czech grammar is so different and difficult (at least for me), I will ignore the logical structure of the text and represent each sentence as a TF-IDF vector over its words. </p>

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Build a TF-IDF vectorizer that strips accents to ASCII, then fit it on the
# Czech sentences concatenated with the intent labels.
vectorizer = TfidfVectorizer(strip_accents='ascii')
vectorizer.fit(
    np.concatenate([customer_sentences["_czech"], customer_sentences["_intent"]])
)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
# TF-IDF matrix with one row per sentence the customer may say.
customer_sentences_vector = vectorizer.transform(customer_sentences["_czech"])

In [12]:
# Convert the TF-IDF matrix to a dense array to inspect its values and shape.
arr_csvect = customer_sentences_vector.toarray()
print(f"{arr_csvect} shape: {arr_csvect.shape}")

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] shape: (73, 186)


####  Test the Ranking of Intents 


In [13]:
print("You can start chatting  now.")
while True:
    # Read user input; surrounding whitespace is ignored so " stop " also ends the chat
    input_question = input().strip()
    if input_question == "stop":
        print("ahoj")
        break

    # Clean the question with the same helpers that were applied to the `_czech`
    # column, so the query is tokenized like the sentences it is compared against
    cleaned_question = remove_punctuation(remove_tags(lowercase(input_question)))
    input_question_vector = vectorizer.transform([cleaned_question])

    # Cosine similarity of the question against every known customer sentence
    similarities = cosine_similarity(input_question_vector, customer_sentences_vector)

    # If every similarity is zero, no word of the question is in the vocabulary.
    # np.argmax would then return row 0 and the bot would answer with that row's
    # intent regardless of the question, so report the "fallback" intent instead.
    if similarities.max() <= 0:
        print("BOT: fallback")
        continue

    # Row position of the most similar known sentence
    closest = int(np.argmax(similarities))

    # Print the intent attached to the closest sentence
    print("BOT: " + str(customer_sentences._intent.iloc[closest]))

You can start chatting  now.
zaplatit
BOT: pay_order
stop
ahoj
