# Курсовая работа "Введение в обработку естественного языка"

## Часть 2. Чат-бот

In [35]:
import logging
import string
import annoy
import pickle
import numpy as np
import warnings
import re
import dialogflow
import os

from functools import lru_cache
from gensim.models import FastText
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from google.api_core.exceptions import InvalidArgument

In [2]:
# Suppress every DeprecationWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

##### Часть 2.1. Загрузка моделей

In [3]:
# Directory with the saved models. File names are appended to it directly
# inside f-strings, so the trailing path separator is required.
PATH_MODEL = 'D:\\GB_DB\\models\\'
# Embedding size (length of the vectors built by embed_txt and the dimension
# passed to the Annoy indexes).
SIZE_EMB = 200

In [4]:
# Morphological analyzer used by parse_morpher to obtain normal forms.
morpher = MorphAnalyzer()
# Russian stop words; tokens found here are dropped in preprocess_txt.
sw = set(get_stop_words("ru"))
# Punctuation characters removed from the input in preprocess_txt.
exclude = set(string.punctuation)

In [5]:
# Load the token -> IDF weight mapping that messageHandler passes to
# embed_txt for conversational queries.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f'{PATH_MODEL}idfs.pkl', 'rb') as f:
    idfs = pickle.load(f)

In [6]:
# Load the default weight used by embed_txt for tokens missing from `idfs`
# (conversational queries).
with open(f'{PATH_MODEL}midf.pkl', 'rb') as f:
    midf = pickle.load(f)

In [7]:
# Load the saved FastText model; it supplies the word vectors for embed_txt.
modelFT = FastText.load(f'{PATH_MODEL}modelFT')

In [8]:
# Annoy index (SIZE_EMB dimensions, 'angular' metric) searched by
# messageHandler for conversational queries; its contents are read from disk.
ft_index = annoy.AnnoyIndex(SIZE_EMB, 'angular')
ft_index.load(f'{PATH_MODEL}ft_index')

True

In [9]:
# Load the mapping from an `ft_index` item id to the reply text that
# messageHandler sends back to the user.
with open(f'{PATH_MODEL}index_map.pkl', 'rb') as f:
    index_map = pickle.load(f)

In [10]:
# Load the saved vectorizer that messageHandler applies (via .transform) to
# the preprocessed query before classification. The object comes entirely
# from the pickle, so no placeholder instance is created beforehand.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f'{PATH_MODEL}vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

In [11]:
# Load the saved classifier that messageHandler uses (via .predict) to decide
# whether a query is a product query. The object comes entirely from the
# pickle, so no placeholder instance is created beforehand.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f'{PATH_MODEL}lr.pkl', 'rb') as f:
    lr = pickle.load(f)

In [12]:
# Load the token -> IDF weight mapping that messageHandler passes to
# embed_txt for product queries.
with open(f'{PATH_MODEL}idfs_p.pkl', 'rb') as f:
    idfs_p = pickle.load(f)

In [13]:
# Load the default weight used by embed_txt for tokens missing from `idfs_p`
# (product queries).
with open(f'{PATH_MODEL}midf_p.pkl', 'rb') as f:
    midf_p = pickle.load(f)

In [14]:
# Annoy index (SIZE_EMB dimensions, 'angular' metric) searched by
# messageHandler for product queries; its contents are read from disk.
ft_index_shop = annoy.AnnoyIndex(SIZE_EMB, 'angular')
ft_index_shop.load(f'{PATH_MODEL}ft_index_shop')

True

In [15]:
# Load the mapping from an `ft_index_shop` item id to the (title, image) pair
# that messageHandler reports for a matching product.
with open(f'{PATH_MODEL}index_map_shop.pkl', 'rb') as f:
    index_map_shop = pickle.load(f)

Функции подготовки текста

In [16]:
# The cache is bounded: tokens come from arbitrary user messages, so an
# unlimited cache would grow for as long as the bot keeps running.
@lru_cache(maxsize=100000)
def parse_morpher(text):
    """Return the normal form of a single token.

    The first parse returned by ``morpher.parse`` is used. Results are
    memoized so that repeated tokens are not analyzed again.

    Args:
        text: one token (a word without surrounding whitespace).

    Returns:
        The ``normal_form`` of the first parse of ``text``.
    """
    return morpher.parse(text)[0].normal_form

In [17]:
def preprocess_txt(line):
    """Turn a raw text line into a list of normalized tokens.

    Steps: trim the line, drop every character listed in ``exclude``, split
    on whitespace, lowercase and lemmatize each piece with ``parse_morpher``,
    then discard stop words (``sw``) and empty strings.

    Args:
        line: raw input text.

    Returns:
        List of normal-form tokens in their original order.
    """
    kept_chars = [ch for ch in line.strip() if ch not in exclude]
    lemmas = []
    for raw in "".join(kept_chars).split():
        # NOTE(review): when `exclude` contains '<' and '>' (as
        # string.punctuation does) this substitution can never match,
        # because those characters were already removed above.
        cleaned = re.sub(r'\<[^>]*\>', '', raw)
        lemma = parse_morpher(cleaned.lower())
        if lemma != "" and lemma not in sw:
            lemmas.append(lemma)
    return lemmas

In [18]:
def embed_txt(txt, idfs, model, midf):
    """Build one embedding vector for a list of tokens.

    The vectors of all tokens known to ``model`` are summed and the sum is
    divided by the accumulated IDF weights of those tokens.

    Args:
        txt: iterable of tokens (e.g. the output of ``preprocess_txt``).
        idfs: mapping token -> IDF weight.
        model: source of word vectors. Either an object exposing them through
            a ``.wv`` attribute (gensim models) or any mapping-like object
            supporting ``word in model`` and ``model[word]``.
        midf: weight used for tokens that are missing from ``idfs``.

    Returns:
        numpy array of length ``SIZE_EMB``; all zeros when no token is known
        to the model.
    """
    # Direct `model[word]` / `word in model` access is deprecated in
    # gensim 3.x and unavailable in gensim 4; the vectors live in `model.wv`.
    # Objects without `.wv` (plain mappings, KeyedVectors) are used as is.
    vectors = getattr(model, 'wv', model)

    total_weight = 0
    vector_ft = np.zeros(SIZE_EMB)
    for word in txt:
        if word in vectors:
            # NOTE(review): the summed vectors are not multiplied by the IDF
            # weight although the divisor below is a sum of IDF weights.
            # Kept unchanged so that query vectors stay consistent with the
            # pre-built indexes -- confirm against the index-building code.
            vector_ft += vectors[word]
            total_weight += idfs.get(word, midf)
    if total_weight > 0:
        vector_ft = vector_ft / total_weight
    return vector_ft

##### Часть 2.2. Модель чат-бота

In [None]:
def startHandler(update: Update, context: CallbackContext) -> None:
    """Reply to the /start command with a greeting.

    Args:
        update: incoming Telegram update.
        context: callback context supplied by the dispatcher (unused).
    """
    # `update.message` is None for edited messages and channel posts;
    # `effective_message` resolves to whichever message the update carries.
    message = update.effective_message
    if message is not None:
        message.reply_text('Доброго времени суток!')

    
def dialogFlow(text):
    """Ask the Google DialogFlow agent for a reply to ``text``.

    Used as the fallback when neither the product model nor the
    conversational model found a suitable answer.

    Args:
        text: the raw user message.

    Returns:
        The agent's fulfillment text, or the default phrase 'Не понимаю.'
        when the request is rejected with ``InvalidArgument`` or the agent
        returns an empty reply.
    """
    fallback = 'Не понимаю.'
    session_client = dialogflow.SessionsClient()
    # NOTE(review): SESSION_ID is a single module-level constant, so every
    # user appears to share one DialogFlow session -- confirm this is intended.
    session = session_client.session_path(DIALOGFLOW_PROJECT_ID, SESSION_ID)
    text_input = dialogflow.types.TextInput(text=text, language_code=DIALOGFLOW_LANGUAGE_CODE)
    query_input = dialogflow.types.QueryInput(text=text_input)
    try:
        response = session_client.detect_intent(session=session, query_input=query_input)
    except InvalidArgument:
        return fallback
    # The result is sent straight to the user by the callers, and Telegram
    # rejects messages with empty text, so never hand back an empty string.
    return response.query_result.fulfillment_text or fallback
    
    
def messageHandler(update: Update, context: CallbackContext) -> None:
    """Answer an incoming text message.

    The message is preprocessed and classified with ``vectorizer`` + ``lr``:

    * class 1 (product query): up to 3 nearest products from
      ``ft_index_shop`` are reported, each only if its distance is <= 0.3;
    * otherwise (conversational query): the nearest answer from ``ft_index``
      is sent if its distance is <= 0.3.

    When nothing passes the distance threshold the reply comes from
    ``dialogFlow``.

    Args:
        update: incoming Telegram update.
        context: callback context supplied by the dispatcher (unused).
    """
    # `update.message` is None for edited messages and channel posts;
    # `effective_message` resolves to whichever message the update carries.
    message = update.effective_message
    if message is None or not message.text:
        return
    user_text = message.text

    input_txt = preprocess_txt(user_text)
    # First decide whether the request is a product query.
    vect = vectorizer.transform([" ".join(input_txt)])
    prediction = lr.predict(vect)

    if prediction[0] == 1:
        # Product query: look up the 3 most similar products.
        vect_ft = embed_txt(input_txt, idfs_p, modelFT, midf_p)
        ft_index_shop_val, distances_shop = ft_index_shop.get_nns_by_vector(vect_ft, 3, include_distances=True)
        find = False
        for item, distance in zip(ft_index_shop_val, distances_shop):
            if distance <= 0.3:
                title, image = index_map_shop[item]
                message.reply_text("title: {} image: {}".format(title, image))
                find = True
        if not find:
            message.reply_text(dialogFlow(user_text))
    else:
        # Conversational query: look up the single closest answer.
        vect_ft = embed_txt(input_txt, idfs, modelFT, midf)
        ft_index_val, distances = ft_index.get_nns_by_vector(vect_ft, 1, include_distances=True)
        # An empty result (no neighbours returned) falls through to DialogFlow
        # instead of raising IndexError.
        if distances and distances[0] <= 0.3:
            message.reply_text(index_map[ft_index_val[0]])
        else:
            message.reply_text(dialogFlow(user_text))
            
# Log at INFO level with a timestamp, logger name and level in every record.
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# The Telegram bot token is requested interactively instead of being stored
# in the source.
updater = Updater(input('Token:'), use_context=True)

# DialogFlow settings. They are read by dialogFlow() at call time, so
# defining them after the function definitions is fine.
# Path to the Google service-account key file (relative to the working directory).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'gbcpnlp-skdy-d17e062507d2.json'
DIALOGFLOW_PROJECT_ID = 'gbcpnlp-skdy' # PROJECT ID from DialogFlow
DIALOGFLOW_LANGUAGE_CODE = 'ru' # language
SESSION_ID = 'GBCPNLP_bot'  # bot ID from Telegram

# Register the handlers: /start gets the greeting, every other non-command
# text message goes to messageHandler.
dispatcher = updater.dispatcher
dispatcher.add_handler(CommandHandler("start", startHandler))
dispatcher.add_handler(MessageHandler(Filters.text & ~Filters.command, messageHandler))

# Start polling for updates, then block via idle().
updater.start_polling()
updater.idle()