# Курсовая работа "Введение в обработку естественного языка"

## Часть 1. Обучение чат-бота

In [5]:
import string
import annoy
import pickle
import numpy as np
import pandas as pd
import warnings
import re

from functools import lru_cache
from gensim.models import FastText
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [7]:
# Путь к дата-сетам
PATH_BASE = 'D:\\GB_DB\\'
# Путь к моделям
PATH_MODEL = 'D:\\GB_DB\\models\\'
# Размер эмбеддинга
SIZE_EMB = 200

##### Часть 1.1. Обучение разговорной модели

Выделяем ответы в отдельный файл

In [None]:
question = None
written = False

with open(f'{PATH_BASE}prepared_answers.txt', "w", encoding="utf8") as fout:
    with open(f'{PATH_BASE}Otvety.txt', "r", encoding="utf8") as fin:
        for line in tqdm_notebook(fin):
            if line.startswith("---"):
                written = False
                continue
            if not written and question is not None:
                fout.write(question.replace("\t", " ").strip() + "\t" + line.replace("\t", " "))
                written = True
                question = None
                continue
            if not written:
                question = line.strip()
                continue

Функции подготовки текста

In [4]:
@lru_cache(maxsize=None)
def parse_morpher(text):
    return morpher.parse(text)[0].normal_form

In [5]:
def preprocess_txt(line):
    spls = "".join(i for i in line.strip() if i not in exclude).split()
    spls = [parse_morpher(re.sub(r'\<[^>]*\>', '', i).lower()) for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]
    return spls

In [6]:
morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)

Токенизация текста

In [29]:
%%time
sentences = []

with open(f'{PATH_BASE}Otvety.txt', "r", encoding="utf8") as fin:
    for line in tqdm_notebook(fin):
        spls = preprocess_txt(line)
        sentences.append(spls)
        
sentences = [i for i in sentences if len(i) > 2]

with open(f'{PATH_MODEL}sentences.pkl', 'wb') as f:
    pickle.dump(sentences, f)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Wall time: 24min 7s


In [30]:
with open(f'{PATH_MODEL}sentences.pkl', 'rb') as f:
    sentences = pickle.load(f)

In [31]:
len(sentences)

4849714

Обучение модели TF-IDF

In [32]:
# Функция-пустышка, для возможности передать в TfidfVectorizer список уже готовых токенов, а не текст
def simple_tokenizer(x):
    return x

In [42]:
%%time
tfidf_v = TfidfVectorizer(tokenizer=simple_tokenizer, lowercase=False, min_df=2)
tfidf_v.fit_transform(sentences)

idfs = {v[0]: v[1] for v in zip(tfidf_v.vocabulary_, tfidf_v.idf_)}
midf = np.mean(tfidf_v.idf_)

with open(f'{PATH_MODEL}idfs.pkl', 'wb') as f:
    pickle.dump(idfs, f)
with open(f'{PATH_MODEL}midf.pkl', 'wb') as f:
    pickle.dump(midf, f)

Wall time: 1min 3s


In [8]:
with open(f'{PATH_MODEL}idfs.pkl', 'rb') as f:
    idfs = pickle.load(f)

In [9]:
with open(f'{PATH_MODEL}midf.pkl', 'rb') as f:
    midf = pickle.load(f)

Обучение модели FastText

In [46]:
%%time
modelFT = FastText(sentences=sentences, size=SIZE_EMB, min_count=2, window=5, workers=8, seed=34)
modelFT.save(f'{PATH_MODEL}modelFT')

Wall time: 21min 17s


In [7]:
modelFT = FastText.load(f'{PATH_MODEL}modelFT')

Загружаем ответы в модель приближенного поиска

In [10]:
%%time
ft_index = annoy.AnnoyIndex(SIZE_EMB, 'angular')
index_map = {}
counter = 0

with open(f'{PATH_BASE}prepared_answers.txt', "r", encoding="utf8") as f:
    for line in tqdm_notebook(f):
        n_ft = 0
        spls = line.split("\t")
        index_map[counter] = re.sub(r'\<[^>]*\>', '', spls[1]) # Удалим html-тэги
        question = preprocess_txt(spls[0])
        vector_ft = np.zeros(SIZE_EMB)
        
        for word in question:
            if word in modelFT:
                vector_ft += modelFT[word]
                n_ft += idfs.get(word, midf)
        if n_ft > 0:
            vector_ft = vector_ft / n_ft
        ft_index.add_item(counter, vector_ft)
        counter += 1
        
ft_index.build(50)
ft_index.save(f'{PATH_MODEL}ft_index')

with open(f'{PATH_MODEL}index_map.pkl', 'wb') as f:
    pickle.dump(index_map, f)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Wall time: 8min 44s


In [None]:
ft_index = annoy.AnnoyIndex(SIZE_EMB, 'angular')
ft_index.load(f'{PATH_MODEL}ft_index') 

In [None]:
with open(f'{PATH_MODEL}index_map.pkl', 'rb') as f:
    index_map = pickle.load(f)

##### Часть 1.2. Обучение продуктовой модели

In [11]:
%%time
shop_data = pd.read_csv(f'{PATH_BASE}ProductsDataset.csv')

shop_data['text'] = shop_data['title'] + " " + shop_data["descrirption"]
shop_data['text'] = shop_data['text'].apply(lambda x: preprocess_txt(str(x)))
shop_data.head(3)

Wall time: 8.25 s


Unnamed: 0,title,descrirption,product_id,category_id,subcategory_id,properties,image_links,text
0,Юбка детская ORBY,"Новая, не носили ни разу. В реале красивей чем...",58e3cfe6132ca50e053f5f82,22.0,2211,"{'detskie_razmer_rost': '81-86 (1,5 года)'}",http://cache3.youla.io/files/images/360_360/58...,"[юбка, детский, orby, новый, носить, реал, кра..."
1,Ботильоны,"Новые,привезены из Чехии ,указан размер 40,но ...",5667531b2b7f8d127d838c34,9.0,902,"{'zhenskaya_odezhda_tzvet': 'Зеленый', 'visota...",http://cache3.youla.io/files/images/360_360/5b...,"[ботильон, новыепривезти, чехия, указать, разм..."
2,Брюки,Размер 40-42. Брюки почти новые - не знаю как ...,59534826aaab284cba337e06,9.0,906,{'zhenskaya_odezhda_dzhinsy_bryuki_tip': 'Брюк...,http://cache3.youla.io/files/images/360_360/59...,"[брюки, размер, 4042, брюки, новый, знать, мер..."


In [12]:
%%time
idxs = set(np.random.randint(0, len(index_map), len(shop_data)))
negative_texts = [" ".join(preprocess_txt(index_map[i])) for i in idxs]
positive_texts = [" ".join(val) for val in shop_data['text'].values]

Wall time: 19.9 s


In [13]:
dataset = negative_texts + positive_texts
labels = np.zeros(len(dataset))
labels[len(negative_texts):] = np.ones(len(positive_texts))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.2, stratify=labels, random_state=13)

In [15]:
%%time
vectorizer = CountVectorizer(ngram_range=(1, 2))
x_train_vec = vectorizer.fit_transform(X_train)
x_test_vec = vectorizer.transform(X_test)

with open(f'{PATH_MODEL}vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

Wall time: 6.04 s


In [16]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
with open(f'{PATH_MODEL}vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

In [16]:
lr = LogisticRegression()
lr.fit(x_train_vec, y_train)

with open(f'{PATH_MODEL}lr.pkl', 'wb') as f:
    pickle.dump(lr, f)

In [18]:
lr = LogisticRegression()
with open(f'{PATH_MODEL}lr.pkl', 'rb') as f:
    lr = pickle.load(f)

In [17]:
accuracy_score(y_true=y_test, y_pred=lr.predict(x_test_vec))

0.9787460148777896

In [18]:
%%time
tfidf_p = TfidfVectorizer(lowercase=False, min_df=2)
tfidf_p.fit(X_train)

idfs_p = {v[0]: v[1] for v in zip(tfidf_p.vocabulary_, tfidf_p.idf_)}
midf_p = np.mean(tfidf_p.idf_)

with open(f'{PATH_MODEL}idfs_p.pkl', 'wb') as f:
    pickle.dump(idfs_p, f)
with open(f'{PATH_MODEL}midf_p.pkl', 'wb') as f:
    pickle.dump(midf_p, f)

Wall time: 1.3 s


In [None]:
with open(f'{PATH_MODEL}idfs_p.pkl', 'rb') as f:
    idfs_p = pickle.load(f)

In [None]:
with open(f'{PATH_MODEL}midf_p.pkl', 'rb') as f:
    midf_p = pickle.load(f)

In [19]:
%%time
ft_index_shop = annoy.AnnoyIndex(SIZE_EMB, 'angular')
index_map_shop = {}
counter = 0

for i in tqdm_notebook(range(len(shop_data))):
    n_ft = 0
    index_map_shop[counter] = (shop_data.loc[i, "title"], shop_data.loc[i, "image_links"])
    vector_ft = np.zeros(SIZE_EMB)
    for word in shop_data.loc[i, "text"]:
        if word in modelFT:
            vector_ft += modelFT[word]
            n_ft += idfs_p.get(word, midf_p)
    if n_ft > 0:
        vector_ft = vector_ft / n_ft
    ft_index_shop.add_item(counter, vector_ft)
    counter += 1

ft_index_shop.build(50)
ft_index_shop.save(f'{PATH_MODEL}ft_index_shop')

with open(f'{PATH_MODEL}index_map_shop.pkl', 'wb') as f:
    pickle.dump(index_map_shop, f)

HBox(children=(FloatProgress(value=0.0, max=35548.0), HTML(value='')))


Wall time: 8.9 s


In [None]:
ft_index_shop = annoy.AnnoyIndex(SIZE_EMB, 'angular')
ft_index_shop.load(f'{PATH_MODEL}ft_index_shop') 

In [None]:
with open(f'{PATH_MODEL}index_map_shop.pkl', 'rb') as f:
    index_map_shop = pickle.load(f)

Ссылка на вторую часть проекта - сам чат-бот: 