In [16]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import json
import operator
import numpy as np
import sys
import os
import random 
from yargy import Parser, rule, and_
from yargy.predicates import gram, is_capitalized, dictionary
sys.path.insert(0, '..')
from Common import preprocessing
classes_map = {'DOC':0, 'ENTER':1, 'ORG':2, 'PRIV':3, 'RANG':4, 'HOST':5}

idx_to_intent = {0:'DOC', 1:'ENTER', 2:'ORG', 3:'PRIV', 4:'RANG', 5:'HOST'}

In [17]:
df = pd.read_csv('..//Data//translation.csv', delimiter=';', engine='python',encoding='utf8')

In [18]:
questions = np.array(df.question)
questions = preprocessing.preprocess_eng_list(questions)

In [19]:
vectorizer = TfidfVectorizer(min_df=3,ngram_range=(1,1))
X = vectorizer.fit_transform(questions)
print("Размерность:",X.shape)

Размерность: (1342, 880)


In [20]:
classes = np.array(df['class'])
y = list(map(lambda x: classes_map[x],classes))

In [21]:
log_reg = OneVsRestClassifier(LogisticRegression(random_state=0,C=10,solver='lbfgs',)).fit(X, y)

In [22]:
for i,j in sorted(zip(log_reg.estimators_[0].coef_[0],vectorizer.vocabulary_)):
    print(i,j)

-5.177020947316017 evidence
-3.4815542996636113 case
-3.053888592111299 education
-2.962406322078189 increase
-2.9534766041631757 real
-2.838646031057161 re
-2.700288287205313 icon_smile
-2.698966520878192 count
-2.6658481298580505 title
-2.6319238735481862 score
-2.614394778961255 series
-2.5420944866715365 more
-2.437262334552356 another
-2.4335248188592637 relative
-2.391492895369421 residence
-2.3558015249422724 creative
-2.3553739742461897 available
-2.25373662165844 it
-2.224458312701187 give
-2.2010545700675035 residents
-2.1833125629374335 loss
-2.177377859231175 republic
-2.077920313542177 certify
-2.063548542501912 did
-2.059235597625979 just
-2.0574273941683194 sociology
-1.9820063211599932 moment
-1.9496079358593188 very
-1.9413206647865668 plus
-1.936952108386646 gave
-1.935905267626167 area
-1.9235128891272788 departments
-1.9233075133580526 so
-1.9195386070130853 employment
-1.8974519948609763 per
-1.879233347090946 begin
-1.8740529806722068 rest
-1.8714164521136858 cons

In [34]:
def get_subintent(preprocessed,intent):
    data = None
    with open("../knowledge base/en/{0}.json".format(intent)) as f:
        data = f.read()
    data = json.loads(data)
    probas = {}
    for subintent in data:
        count = 0
        for keyword in data[subintent]['keywords']:
            RULE = rule(dictionary({keyword}))
            parser = Parser(RULE)
            for match in parser.findall(preprocessed):
                count = count + len(match.tokens)
        probas[subintent] = count/len(data[subintent]['keywords'])
    print("Вероятности субинтентов",probas)
    if any(list(probas.values())) > 0.0:
        subintent = max(probas.items(), key=operator.itemgetter(1))[0]
        return data[subintent]['response'][0]
    else:
        return fallback(preprocessed)
    
    

In [123]:
def get_answer(raw_text):
    preprocessed = preprocessing.preprocess_eng_list([raw_text])
    print("== Продобработанный текст:",preprocessed)
    v = vectorizer.transform(preprocessed)
    probas = log_reg.predict_proba(v)
    print("== Вероятности интентов",probas[0])
    
    if max(probas[0])<0.43:
        answer = fallback(preprocessed)
        print("== Ответ: ",answer)
        #os.system("echo "" " + answer + " "" | RHVoice-test")
    else:
        if list(probas[0]).index(max(probas[0])) == 3:
            intent = 2
        else:
            intent = idx_to_intent[np.argmax(probas[0])]
        answer = get_subintent(str(preprocessed),intent)
        print("== Ответ: ",answer)
        #os.system("echo "" " + answer + " "" | RHVoice-test")


In [124]:
def fallback(text):
    return "I dont understand, sorry. Can you reask in different way please"

In [125]:
def tests():
    list_ = np.array(df.question)
    i = random.randint(0,len(list_))
    print(list_[i])
    get_answer(list_[i])

In [157]:
tests()

Please tell me when will be known the date of the creative competition for admission to the faculty of design?
== Продобработанный текст: ['tell me when known date of creative competition for admission to faculty of design']
== Вероятности интентов [0.05110131 0.02391418 0.07237396 0.02157696 0.82137429 0.00965929]
Вероятности субинтентов {'DATES': 0.14285714285714285, 'WHERE': 0.25, 'SCORE': 0.0, 'ISSUE': 0.0}
== Ответ:  You can find all needed information about results and rating on our website


In [147]:
get_answer("Hello! My situation is this: son in the summer of 2012, was expelled from the University after 1 year. He received a summons in the autumn, came to the draft Board, then the second. In parallel we are preparing to enroll in a new University in the summer of 2013. If he does have the right to take the army? And if you take away the first year - he could well recover next year?")

== Продобработанный текст: ['My situation this son in summer of was expelled from University after year He received summons in autumn came to draft Board then second In parallel we are preparing to enroll in new University in summer of If he does have right to take army And if you take away first year he could well recover next year']
== Вероятности интентов [0.00346106 0.72812879 0.1831188  0.01104572 0.06540189 0.00884373]
Вероятности субинтентов {'PRIV': 0.125, 'COST': 0.0, 'REENTER': 0.25, 'BUDGET': 0.0, 'DATES': 0.0, 'EXAMS': 0.0, 'ISSUE': 0.0}
== Ответ:  We are always welcome people, who wants to continue study. Go to our website pstu.ru to check all the info


In [None]:
#цикл while с тремя состояниями - привет, вопросы, пока
#если порог вероятности ниже трешхолда скидываем на рандомную фразу
#(после приветствия спрашивать имя и тд и запоминать)
#если несколько вопросов в одном - разделять