In [15]:
import json
import operator
import os
import pickle
import random
import subprocess
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from yargy import Parser, rule, and_
from yargy.predicates import gram, is_capitalized, dictionary

sys.path.insert(0, '..')  # must run before the project-local import below
from Common import preprocessing
# Intent label -> integer class id used as the training target.
classes_map = {
    'DOC': 0,
    'ENTER': 1,
    'ORG': 2,
    'PRIV': 3,
    'RANG': 4,
    'HOST': 5,
}
# Question-vs-greeting label -> integer class id.
classes_map_greet = {
    'QUE': 0,
    'GREET': 1,
}

# Reverse lookup (class id -> intent label), built from classes_map.
idx_to_intent = {class_id: label for label, class_id in classes_map.items()}

In [2]:
# Dialogue state flag: whether a greeting has already taken place.
WAS_GREETING = False #whether a greeting has already happened
#KNOW_NAME = False #whether we know the user's name

In [3]:
# Intent-labelled questions; the `question` and `class` columns are used below.
df = pd.read_csv('..//Data//translation.csv', delimiter=';', engine='python',encoding='utf8')
# Chit-chat examples. NOTE(review): presumably has the same `question`/`class`
# columns with 'GREET' labels -- confirm against the data file.
gr = pd.read_csv('..//Data//chatter.txt', delimiter=';', engine='python',encoding='utf8')

# Training set for the question-vs-greeting classifier: the first 40 intent
# questions relabelled as 'QUE', followed by the chit-chat examples.
# `.assign` returns a new frame (no SettingWithCopyWarning from writing into a
# slice), and `pd.concat` replaces `DataFrame.append`, which was removed in
# pandas 2.0. Row order and index labels match the original append.
que_sample = df[['question', 'class']].head(40).assign(**{'class': 'QUE'})
df_ = pd.concat([que_sample, gr])

In [4]:
# Copy the raw question column into a fresh array and run it through the
# project's English preprocessing; the result feeds the intent vectorizer.
raw_questions = np.array(df['question'])
questions = preprocessing.preprocess_eng_list(raw_questions)

In [17]:
# Unigram TF-IDF features for the intent classifier; min_df=3 drops terms that
# occur in fewer than 3 questions.
vectorizer = TfidfVectorizer(min_df=3,ngram_range=(1,1))

X = vectorizer.fit_transform(questions)
# Persist the fitted vectorizer. The `with` block guarantees the file is
# flushed and closed (the bare `open(...)` previously leaked the handle).
with open("vectorizer", 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
classes = np.array(df['class'])
# Encode the string intent labels as integer targets.
y = [classes_map[label] for label in classes]
print("Размерность:",X.shape)

Размерность: (1342, 888)


In [19]:
# Texts for the question-vs-greeting classifier, preprocessed with the
# greeting-specific pipeline.
greetings = np.array(df_.question)
greetings = preprocessing.preprocess_eng_greetings_list(greetings)

# Unigram TF-IDF features; min_df=3 drops terms seen in fewer than 3 texts.
vectorizer_greet = TfidfVectorizer(min_df=3,ngram_range=(1,1))
X_greet = vectorizer_greet.fit_transform(greetings)
# Persist the fitted vectorizer. The `with` block guarantees the file is
# flushed and closed (the bare `open(...)` previously leaked the handle).
with open("vectorizer_greet", 'wb') as vectorizer_greet_file:
    pickle.dump(vectorizer_greet, vectorizer_greet_file)
# NOTE: this rebinds `classes`, which the intent cell also assigns.
classes = np.array(df_['class'])
# Encode 'QUE' / 'GREET' labels as integer targets.
y_greet = [classes_map_greet[label] for label in classes]
print("Размерность:",X_greet.shape)

Размерность: (88, 102)


In [16]:
# Intent classifier: one-vs-rest logistic regression over the TF-IDF features.
log_reg = OneVsRestClassifier(
    LogisticRegression(random_state=0, C=10, solver='lbfgs')
)
log_reg = log_reg.fit(X, y)
# Saving the fitted model is currently disabled:
#pickle.dump(log_reg, open("log_reg", 'wb'))

# Question-vs-greeting classifier, same model configuration.
log_reg_greet = OneVsRestClassifier(
    LogisticRegression(random_state=0, C=10, solver='lbfgs')
)
log_reg_greet = log_reg_greet.fit(X_greet, y_greet)
# Saving the fitted model is currently disabled:
#pickle.dump(log_reg_greet, open("log_reg_greet", 'wb'))

In [8]:
def get_subintent(preprocessed,intent):
    """Pick the best-matching sub-intent of `intent` and return its response.

    Loads "../knowledge base/en/<intent>.json", which maps each sub-intent name
    to an entry with a 'keywords' list and a 'response' list. Each sub-intent
    is scored as (number of tokens matched by its keywords) / (number of
    keywords).

    Args:
        preprocessed: text (str) to search for keywords.
        intent: intent name, used as the knowledge-base file name.

    Returns:
        The first response of the highest-scoring sub-intent, or the
        `fallback` reply when no keyword matched at all.

    Raises:
        OSError: if the knowledge-base file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open("../knowledge base/en/{0}.json".format(intent), encoding="utf8") as f:
        data = json.load(f)
    probas = {}
    for subintent, entry in data.items():
        keywords = entry['keywords']
        count = 0
        for keyword in keywords:
            # One single-word dictionary rule per keyword, so a token matched
            # by several keywords is counted once for each of them.
            parser = Parser(rule(dictionary({keyword})))
            for match in parser.findall(preprocessed):
                count += len(match.tokens)
        # An empty keyword list scores 0 instead of raising ZeroDivisionError.
        probas[subintent] = count/len(keywords) if keywords else 0.0
    print("Вероятности субинтентов",probas)
    # Explicit per-score check (the old `any(...) > 0.0` compared a bool to a float).
    if any(score > 0 for score in probas.values()):
        best = max(probas, key=probas.get)
        return data[best]['response'][0]
    return fallback(preprocessed)

In [9]:
def chit_chat(preprocessed):
    """Answer small talk using the CHITCHAT knowledge base.

    Loads "../knowledge base/en/CHITCHAT.json", which maps each sub-intent name
    to an entry with a 'keywords' list and a 'response' list. Each sub-intent
    is scored as (number of tokens matched by its keywords) / (number of
    keywords).

    Args:
        preprocessed: the user's text; coerced to str before matching.

    Returns:
        The first response of the highest-scoring sub-intent, or the
        `fallback` reply when no keyword matched at all.

    Raises:
        OSError: if the knowledge-base file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    preprocessed = str(preprocessed)
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open("../knowledge base/en/CHITCHAT.json", encoding="utf8") as f:
        data = json.load(f)
    probas = {}
    for subintent, entry in data.items():
        keywords = entry['keywords']
        count = 0
        for keyword in keywords:
            # One single-word dictionary rule per keyword, so a token matched
            # by several keywords is counted once for each of them.
            parser = Parser(rule(dictionary({keyword})))
            for match in parser.findall(preprocessed):
                count += len(match.tokens)
        # An empty keyword list scores 0 instead of raising ZeroDivisionError.
        probas[subintent] = count/len(keywords) if keywords else 0.0
    print("Вероятности subgreeting",probas)
    # Explicit per-score check (the old `any(...) > 0.0` compared a bool to a float).
    if any(score > 0 for score in probas.values()):
        best = max(probas, key=probas.get)
        return data[best]['response'][0]
    return fallback(preprocessed)

In [10]:
# Decision thresholds, named so they can be tuned in one place.
# Chit-chat is chosen when P(question) < P(greeting) + margin.
_GREET_MARGIN_BEFORE_GREETING = 0.1   # no greeting yet: lean towards chit-chat
_GREET_MARGIN_AFTER_GREETING = -0.7   # already greeted: only when very confident
_MIN_INTENT_PROBA = 0.43              # below this the intent prediction is rejected
# PRIV predictions are answered from the ORG knowledge base
# (the intents are merged because the training sample is incorrect).
_PRIV_IDX = 3
_ORG_IDX = 2


def _reply(answer):
    """Print the answer and speak it through the RHVoice-test command."""
    print("== Ответ: ",answer)
    try:
        # Feed the text on stdin without a shell, so quotes or shell
        # metacharacters in the answer cannot break or inject into a command
        # line. The trailing newline mirrors what `echo` used to send.
        subprocess.run(["RHVoice-test"], input=(answer + "\n").encode("utf-8"), check=False)
    except OSError as err:
        # Speech is best-effort: a missing binary must not break the dialogue.
        print("RHVoice-test is unavailable:", err, file=sys.stderr)


def get_answer(raw_text,WAS_GREETING):
    """Classify the user's text and print/speak an answer.

    First the greeting classifier decides between a real question and small
    talk (its class ids come from classes_map_greet: 0 = 'QUE', 1 = 'GREET').
    Small talk is answered by `chit_chat`. Otherwise the intent classifier
    picks an intent and `get_subintent` picks the answer; a low-confidence
    intent prediction gets the `fallback` reply.

    Args:
        raw_text: the user's utterance (str).
        WAS_GREETING: whether a greeting already took place. When False the
            greeting threshold is looser. The flag is only read here: the old
            local rebinding never reached the caller, so the caller must track
            the dialogue state itself.

    Returns:
        None. The answer is printed and sent to RHVoice-test.
    """
    preprocessed = preprocessing.preprocess_eng_greetings_list([raw_text])
    print("== Продобработанный текст:",preprocessed)

    # Stage 1: is this a real question or small talk ("how are you")?
    v_ = vectorizer_greet.transform(preprocessed)
    probas = log_reg_greet.predict_proba(v_)
    print("== Вероятности greeting или нет",probas[0])

    question_p, greeting_p = probas[0][0], probas[0][1]
    if WAS_GREETING:
        margin = _GREET_MARGIN_AFTER_GREETING
    else:
        margin = _GREET_MARGIN_BEFORE_GREETING
    if question_p < greeting_p + margin:
        # chit_chat receives the raw, unpreprocessed text.
        _reply(chit_chat(raw_text))
        return

    # Stage 2: a real question -- classify its intent.
    v = vectorizer.transform(preprocessed)
    probas = log_reg.predict_proba(v)
    print("== Вероятности интентов",probas[0])

    best_idx = int(np.argmax(probas[0]))
    if probas[0][best_idx] < _MIN_INTENT_PROBA:
        _reply(fallback(preprocessed))
        return

    if best_idx == _PRIV_IDX:
        best_idx = _ORG_IDX
    # Always pass the intent NAME: it is used as the knowledge-base file name.
    # The merged branch used to pass the integer 2 and looked for "2.json".
    intent = idx_to_intent[best_idx]
    # NOTE(review): str(preprocessed) is the repr of the one-element list, so
    # brackets and quotes are part of the text handed to the keyword matcher.
    _reply(get_subintent(str(preprocessed),intent))


In [11]:
def fallback(text):
    """Return the canned reply used when the bot cannot answer.

    `text` is accepted for interface symmetry but is not used.
    """
    # TODO: read the reply from a file
    reply = "I dont understand, sorry. Can you reask in different way please"
    return reply

In [12]:
def tests():
    """Smoke test: run one randomly chosen training question through get_answer."""
    list_ = np.array(df.question)
    if len(list_) == 0:
        print("No questions available to test with")
        return
    # randrange excludes the upper bound; randint(0, len(list_)) could return
    # len(list_) and raise IndexError.
    i = random.randrange(len(list_))
    print(list_[i])
    # get_answer requires the greeting flag; use the notebook-level state.
    get_answer(list_[i], WAS_GREETING)

In [14]:
# Demo: mark the greeting as already done, so get_answer applies the stricter
# chit-chat threshold, then ask a sample question.
WAS_GREETING = True
get_answer("What documents do i need?",WAS_GREETING)

== Продобработанный текст: ['What documents do i need']
== Вероятности greeting или нет [0.46734172 0.53265828]
== Вероятности интентов [0.86899793 0.05235625 0.03907908 0.00107282 0.00276139 0.03573254]
Вероятности субинтентов {'ONLINE': 0.0, 'WHAT': 0.25, 'WHERE': 0.0, 'DATES': 0.1, 'HOW': 0.0, 'ISSUE': 0.0}
== Ответ:  To apply you need following documents: passport, diploma, 2 photos, medical ref. and exam certificate.


In [40]:
# TODO: combine all the functions into a single class
# (after the greeting, ask for the user's name etc. and remember it)
# TODO: if one message contains several questions, split them