## Проект Телеграм чат-бота

In [18]:
# TODO: не забыть залить из DEV ветки!!!
# t.me/voki_blabla_bot

In [19]:
# !pip install python-dotenv
# !pip install -U python-telegram-bot
# !pip install python-telegram-bot==12.4.2 --upgrade
# !pip install -U annoy

In [55]:
import os
import string
import numpy as np
import pandas as pd
import pickle

from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from gensim.models import Word2Vec, FastText
import annoy

from dotenv import load_dotenv
from tqdm import tqdm_notebook, tqdm


In [53]:
ENV_FILE = 'chat.env'

DATA_PATH = '../data/'
RAW_FILE = 'Otvety.txt'
ANSWERS_FILE = 'prep_answers.txt'
PRODUCT_FILE = 'ProductsDataset.csv'
FAST_TEXT_MODEL = './models/ft_model'
INDEX_ANSWERS = './models/idx_answers.pkl'
FT_INDEX_FILE = './models/ft_index'

In [22]:
# загрузка файла с настройками переменных окружения

# dotenv_path = os.path.join(os.path.dirname(__file__), '.env')
if os.path.exists(ENV_FILE):
    load_dotenv(ENV_FILE)

In [23]:
# инициализация анализатора, стоп слов, пунктуации
morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)

In [24]:
def preprocess_txt(line):
    """ предобработка перед векторизацией """
    spls = "".join(i for i in line.strip() if i not in exclude).split()
#     print(spls)
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
#     print(spls)
    spls = [i for i in spls if i not in sw and i != ""]
#     print(spls)
    return spls

In [54]:
file_input = os.path.join(DATA_PATH, RAW_FILE)
file_answers = os.path.join(DATA_PATH, ANSWERS_FILE)
file_product = os.path.join(DATA_PATH, PRODUCT_FILE)

## Подготовка ответов болталки

In [26]:
# подготовка файло вопрос-ответ
question = None
written = False

with open(file_answers, "w", encoding="utf8") as fout:
    with open(file_input, "r", encoding="utf8") as fin:
        for line in tqdm(fin):
            if line.startswith("---"):
                written = False
                continue
            if not written and question is not None:
                fout.write(question.replace("\t", " ").strip() + "\t" + line.replace("\t", " "))
                written = True
                question = None
                continue
            if not written:
                question = line.strip()
                continue

7550926it [00:13, 572144.09it/s]


In [27]:
# подготовка корпуса для обучения векторизатора
sentences = []
c = 0
STOP = 500000

with open(file_input, "r", encoding="utf8") as fin:
    for line in tqdm(fin):
        spls = preprocess_txt(line)
        sentences.append(spls)
        c += 1
        if c > STOP:
            break

500000it [24:58, 333.71it/s]


#### Обучение векторизатора

In [29]:
VECTOR_SIZE = 200

In [30]:
%%time
sentences = [i for i in sentences if len(i) > 2]
modelFT = FastText(sentences=sentences, size=VECTOR_SIZE, min_count=1, window=5)
modelFT.save(FAST_TEXT_MODEL)

CPU times: user 10min 27s, sys: 6.77 s, total: 10min 34s
Wall time: 6min 36s


#### Подготовка индексов для NN поиска

In [40]:
modelFT = FastText.load(FAST_TEXT_MODEL)

In [46]:
%%time
ft_index = annoy.AnnoyIndex(VECTOR_SIZE ,'angular')

index_map = {}
counter = 0

with open(file_answers, "r", encoding='utf-8') as f:
    for line in tqdm(f):
        n_ft = 0
        spls = line.split("\t")
        index_map[counter] = spls[1]
        question = preprocess_txt(spls[0])
        vector_ft = np.zeros(VECTOR_SIZE)
        for word in question:
            if word in modelFT.wv:
                vector_ft += modelFT.wv[word]
                n_ft += 1
        if n_ft > 0:
            vector_ft = vector_ft / n_ft
        ft_index.add_item(counter, vector_ft)
            
        counter += 1

1163342it [59:05, 328.15it/s]

CPU times: user 59min 21s, sys: 22.8 s, total: 59min 44s
Wall time: 59min 5s





In [48]:
# сохраняем словарь индексов ответов
with open(INDEX_ANSWERS, 'wb') as f:
    pickle.dump(index_map, f)

In [49]:
%%time
# строим индекс поиска и сохраняем
ft_index.build(25)
ft_index.save(FT_INDEX_FILE)

CPU times: user 1min 36s, sys: 805 ms, total: 1min 37s
Wall time: 40.6 s


True

## Подготовка работы с основной продукцией

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [56]:
shop_data = pd.read_csv(file_product)
shop_data['text'] = shop_data['title'] + " " + shop_data["descrirption"]
shop_data['text'] = shop_data['text'].apply(lambda x: preprocess_txt(str(x)))
shop_data.head(2)

Unnamed: 0,title,descrirption,product_id,category_id,subcategory_id,properties,image_links,text
0,Юбка детская ORBY,"Новая, не носили ни разу. В реале красивей чем...",58e3cfe6132ca50e053f5f82,22.0,2211,"{'detskie_razmer_rost': '81-86 (1,5 года)'}",http://cache3.youla.io/files/images/360_360/58...,"[юбка, детский, orby, новый, носить, реал, кра..."
1,Ботильоны,"Новые,привезены из Чехии ,указан размер 40,но ...",5667531b2b7f8d127d838c34,9.0,902,"{'zhenskaya_odezhda_tzvet': 'Зеленый', 'visota...",http://cache3.youla.io/files/images/360_360/5b...,"[ботильон, новыепривезти, чехия, указать, разм..."


In [59]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

### ====================================================================

In [5]:
TG_token = os.getenv('TG_voki_blabla_bot_token') # Токен API к Telegram

updater = Updater(token=TG_token) 
dispatcher = updater.dispatcher

def startCommand(update, context):
    """Send a message when the command /start is issued."""
    update.message.reply_text(f'Добрейшего дня, {update.message.from_user.username}')
    
    
# def startCommand(bot, update):
#     bot.send_message(chat_id=update.message.chat_id, text='Добрый день')

# def textMessage(bot, update):
    
#     input_txt = preprocess_txt(update.message.text)
#     vect = vectorizer.transform([" ".join(input_txt)])
#     prediction = lr.predict(vect)
    
#     if prediction[0] == 1:
#         vect_ft = embed_txt(input_txt, idfs, midf)
#         ft_index_shop_val = ft_index_shop.get_nns_by_vector(vect_ft, 5)
#         for item in ft_index_shop_val:
#             title, image = index_map_shop[item]
#             bot.send_message(chat_id=update.message.chat_id, text="title: {} image: {}".format(title, image))
#         return
#     vect_ft = embed_txt(input_txt, {}, 1)
#     ft_index_val, distances = ft_index.get_nns_by_vector(vect_ft, 1, include_distances=True)
#     if distances[0] > 0.2:
#         print(distances[0])
#         bot.send_message(chat_id=update.message.chat_id, text="Моя твоя не понимать")
#         return
#     bot.send_message(chat_id=update.message.chat_id, text=index_map[ft_index_val[0]])
    
def echo(update, context):
    """Echo the user message."""
    update.message.reply_text(update.message.text)


# обработчики комманд
dispatcher.add_handler(CommandHandler('start', startCommand))
# dispatcher.add_handler(CommandHandler('start', startCommand))

# 
dispatcher.add_handler(MessageHandler(Filters.text & ~Filters.command, echo))
# text_message_handler = MessageHandler(Filters.text, textMessage)
# dispatcher.add_handler(text_message_handler)
updater.start_polling(clean=True)
updater.idle()