In [2]:
import pymorphy2
import re
import psycopg2
import sklearn
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import BallTree
from sklearn.base import BaseEstimator
import json

In [3]:
def softmax(x):
  """Turn distances into a probability distribution favouring small values.

  Computes ``exp(-x)`` normalised along the first axis, so for a 1-D input
  the result sums to 1 and the smallest entry gets the highest probability.

  Args:
    x: array-like of numbers (distances).

  Returns:
    numpy.ndarray of the same shape as ``x`` holding the probabilities.
  """
  x = np.asarray(x, dtype=float)
  if x.size == 0:
    # Nothing to normalise; avoid calling min() on an empty array.
    return np.exp(-x)
  # Shifting by the minimum does not change the normalised result, but it
  # keeps the largest exponent at exp(0) == 1, so the denominator can never
  # underflow to zero (which previously produced NaN for large distances).
  shifted = x - x.min(axis=0)
  proba = np.exp(-shifted)
  return proba / proba.sum(axis=0)

In [4]:
class NeighborSampler(BaseEstimator):
  """Nearest-neighbour predictor that samples one of the k closest points.

  For every query the ``k`` nearest training points are looked up in a
  ``BallTree``; one of them is then drawn at random with probabilities
  ``softmax(distance * temperature)``, so closer points are more likely.

  Args:
    k: number of nearest neighbours to sample from.
    temperature: multiplier applied to the distances before the softmax;
      larger values concentrate the probability on the closest neighbour.
  """

  def __init__(self, k=5, temperature=10.0):
    self.k = k
    self.temperature = temperature

  def fit(self, X, y):
    """Index the training points ``X`` and remember their labels ``y``.

    Returns:
      self, following the scikit-learn estimator convention.
    """
    self.tree_ = BallTree(X)
    self.y_ = np.array(y)
    return self

  def predict(self, X, random_state=None):
    """Sample a label (and its neighbour distance) for every row of ``X``.

    Args:
      X: query points, in the same feature space as the data given to fit.
      random_state: ``None`` to use the global ``np.random`` state, an int
        seed, or an object exposing ``choice`` (e.g. ``RandomState``).

    Returns:
      A tuple ``(labels, distances)``: ``labels`` is an array with one
      sampled label per query row, ``distances`` is a list with the distance
      from each query to the neighbour whose label was chosen.
    """
    if random_state is None:
      rng = np.random
    elif hasattr(random_state, 'choice'):
      rng = random_state
    else:
      rng = np.random.RandomState(random_state)

    # BallTree.query raises when k exceeds the number of indexed points.
    k = min(self.k, len(self.y_))
    distances, indices = self.tree_.query(X, return_distance=True, k=k)

    chosen = []
    chosen_dist = []
    for distance, index in zip(distances, indices):
      proba = softmax(distance * self.temperature)
      # Draw a single position so the reported distance belongs to the
      # neighbour that was actually selected.
      pos = rng.choice(len(index), p=proba)
      chosen.append(index[pos])
      chosen_dist.append(distance[pos])
    return self.y_[chosen], chosen_dist

In [8]:
class Bot(object):
    """Retrieval chatbot: maps a user phrase to the answer of a similar question.

    Questions are lemmatised with pymorphy2, vectorised with TF-IDF, reduced
    with TruncatedSVD and matched with ``NeighborSampler``.

    Args:
        answers_path: JSON file whose ``items`` entries have ``id`` and
            ``answer`` keys.
        questions_path: JSON file whose ``items`` entries have ``question``
            and ``answerid`` keys.
    """

    # Upper bound for the dimensionality of the SVD-reduced question space.
    MAX_SVD_COMPONENTS = 200

    def __init__(self, answers_path='chatbot/answer2.json',
                 questions_path='chatbot/question2.json'):
        self.morph = pymorphy2.MorphAnalyzer(lang='ru')
        self.answers_path = answers_path
        self.questions_path = questions_path

    def _normalize(self, text):
        """Lemmatise every word of ``text`` and join the lemmas with spaces."""
        # The same tokenisation is used for training questions and for user
        # input, so punctuation attached to a word cannot block lemmatisation
        # on one side only.
        tokens = re.findall(r'\w+', text)
        return " ".join(self.morph.parse(token)[0].normal_form for token in tokens)

    def getData(self):
        """Load answers and questions from the JSON files.

        Rows with an empty question, a non-positive ``answerid`` or no word
        characters are skipped.

        Returns:
            A tuple ``(count, answers, questions, answer_ids)`` where
            ``count`` is the number of kept questions, ``answers`` maps
            answer id to answer text (also stored as ``self.answer``),
            ``questions`` is the list of lemmatised questions and
            ``answer_ids`` the matching list of answer ids.
        """
        # JSON is read as UTF-8 explicitly so the platform default encoding
        # cannot corrupt the text.
        with open(self.answers_path, 'r', encoding='utf-8') as f:
            answers = json.load(f)['items']
        self.answer = {item['id']: item['answer'] for item in answers}

        with open(self.questions_path, 'r', encoding='utf-8') as f:
            rows = json.load(f)['items']

        questions = []
        answer_id = []
        for row in rows:
            # Skip unusable rows outright; previously a row with a
            # non-positive answerid re-used the previous row's question.
            if not (row['question'] and row['answerid'] > 0):
                continue
            phrase = self._normalize(row['question'])
            if phrase:
                questions.append(phrase)
                answer_id.append(row['answerid'])
        transform = len(questions)
        return transform, self.answer, questions, answer_id

    def buildModel(self):
        """Fit the TF-IDF -> SVD -> NeighborSampler pipeline on the questions.

        The fitted pipeline is stored as ``self.pipe_q``.

        Raises:
            ValueError: if no usable questions were loaded.
        """
        transform, answer, questions, answer_id = self.getData()
        if not questions:
            raise ValueError("no usable questions found in %r" % (self.questions_path,))

        vectorizer_q = TfidfVectorizer()
        matrix_big_q = vectorizer_q.fit_transform(questions)

        # TruncatedSVD cannot produce more components than there are
        # features, so cap by the vocabulary size as well as by the number
        # of questions and the global maximum.
        n_components = max(1, min(self.MAX_SVD_COMPONENTS, transform, matrix_big_q.shape[1]))
        svd_q = TruncatedSVD(n_components=n_components)
        # fit + transform (rather than fit_transform) so the training vectors
        # go through exactly the projection later applied to queries.
        svd_q.fit(matrix_big_q)
        matrix_small_q = svd_q.transform(matrix_big_q)

        ns_q = NeighborSampler()
        ns_q.fit(matrix_small_q, answer_id)
        self.pipe_q = make_pipeline(vectorizer_q, svd_q, ns_q)

    def send(self, command):
        """Answer a user phrase.

        Args:
            command: raw user text.

        Returns:
            A tuple ``(answer_text, distance)`` where ``distance`` is the
            neighbour distance reported by the sampler for this query.
        """
        phrase = self._normalize(command)
        labels, distances = self.pipe_q.predict([phrase])
        # Index the single prediction explicitly; converting a 1-element
        # array with int() is deprecated in NumPy.
        reply_id = int(labels[0])
        return self.answer[reply_id], distances[0]

In [9]:
# Create the chatbot instance; the constructor builds the Russian pymorphy2 analyzer.
bot = Bot()


In [10]:
# Load the question/answer JSON files and fit the TF-IDF -> SVD -> NeighborSampler pipeline.
bot.buildModel()

In [11]:
# Ask a sample question; the result is a tuple of the answer text and a neighbour distance.
# The neighbour is sampled at random, so the output may differ between runs.
bot.send('Привет')

('Привет. Я бот Roboto и с радостью отвечу на ваши вопросы', 0.0)