# косинусная модель

### считаем данные

In [21]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pymorphy2
import numpy as np

In [15]:
# Data and helper objects that are created once, when the bot is launched.

# Question/answer table; the 'Unnamed: 0' column (presumably a leftover
# saved index -- confirm) is dropped right after loading.
df = pd.read_excel('answers_clear.xls').drop('Unnamed: 0', axis=1)

# Morphological analyzer used by preprocessing() to obtain normal forms.
morph = pymorphy2.MorphAnalyzer()

# Bag-of-words vectorizer fitted on the stored questions, plus the resulting
# document-term matrix that incoming questions are compared against.
# NOTE(review): the vectorizer is fitted on df['вопрос'] exactly as stored,
# whereas incoming questions go through preprocessing() first; presumably the
# spreadsheet already contains normalized text -- confirm.
count = CountVectorizer()
sample_transformed = count.fit_transform(df['вопрос'])

# The encoding is passed explicitly so the Cyrillic stop-word list is decoded
# the same way on every platform instead of depending on the locale default
# (assumes the file is stored as UTF-8 -- confirm).
with open('stop_words.txt', 'r', encoding='utf-8') as inFile:
    # A set gives constant-time membership checks when filtering terms.
    stop_list = set(inFile.read().split())
    
def preprocessing(message: str) -> str:
    """
    Extract Russian terms, normalize them and drop stop-words.

    Args:
        message: Raw text to process.

    Returns:
        The normal forms of the Russian words found in ``message``, in their
        original order, joined by single spaces, with every term that occurs
        in ``stop_list`` removed. An empty string if nothing remains.
    """
    # Both cases of "ё" are listed explicitly: the letter lies outside the
    # contiguous а-я / А-Я code-point ranges, so leaving out the capital "Ё"
    # would cut words such as "Ёлка" down to "лка".
    russian_terms = re.findall(r'[а-яА-ЯёЁ]+', message)
    # Use the first parse returned by the analyzer for each word.
    normalized_terms = (
        morph.parse(term)[0].normal_form for term in russian_terms
    )
    return ' '.join(
        term for term in normalized_terms if term not in stop_list
    )

In [26]:
def get_best_three(question, n=3, vect_sample=sample_transformed, vectorizer=count, df=df):
    """
    Lazily yield the rows of ``df`` that best match ``question``.

    The question is passed through ``preprocessing``, vectorized with
    ``vectorizer`` and compared by cosine similarity against every row of
    ``vect_sample``. The ``n`` most similar rows are yielded one at a time,
    best match first; each yielded item holds the values of the columns of
    ``df`` from position 2 onward.
    """
    query_vector = vectorizer.transform([preprocessing(question)])
    scores = cosine_similarity(query_vector, vect_sample).flatten()
    # Negating the scores makes the ascending argsort rank the highest
    # similarity first.
    top_rows = list((-scores).argsort()[:n])
    yield from df.iloc[top_rows, 2:].values

In [33]:
# Demo query ("how to arrange a one-time pass for mom"). get_best_three is a
# generator function, so nothing is computed until next() is called on it.
ans = get_best_three('как оформить разовый пропуск для мамы')

In [34]:
# Take the first yielded row, i.e. the one with the highest cosine similarity.
next(ans)

array(['Оформить разовый пропуск можно предварительно заполнив форму',
       'https://docs.google.com/forms/d/e/1FAIpQLSdrcGv0ujVO9cHxP9zFggi9wSlLukUmM3G9PmSCxJyYam-ynA/viewform',
       'форму'], dtype=object)

In [35]:
# Second demo query ("what is the address of the academic office"); this
# rebinds ans to a fresh generator.
ans = get_best_three('какой адрес учебного офиса')

In [36]:
# Take the best-ranked row for the second query.
next(ans)

array(['адрес учебного офиса', '123458, г. Москва, ул. Таллинская, д.34',
       nan], dtype=object)