## Часть 1. Постановка задачи

Задачей проекта является сформировать список id товаров, максимально похожих на него по описанию.

**План выполнения проекта**

1. Представить массив текстов в виде векторов
2. Выбрать расстояние, с помощью которого можно оценивать близость векторов
3. С помощью матричных операций попарной найти выбранное расстояние
4. Задать порог значимости, с которым можно сравнить полученные значения

In [12]:
import pandas as pd
import gensim
import gensim.downloader

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

import re #РЕГУЛЯРНЫЕ ВЫРАЖЕНИЯ

import nltk
from nltk.corpus import stopwords

In [7]:
data = pd.read_csv('sample-data.csv')
data.head(3)

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...


## Часть  2. Предобработка текста

In [8]:
data['description'] = data['description'].apply(lambda x: x.lower())
data['description'] = data['description'].apply(lambda x: re.sub('[^a-z \n]', '', x))
data.description.head(3)

0    active classic boxers  theres a reason why our...
1    active sport boxer briefs  skinning up glory r...
2    active sport briefs  these superbreathable nof...
Name: description, dtype: object

## Часть 3. Векторное представление текстов

### 3.1 С помощью TF-IDF

In [21]:
tfidf = TfidfVectorizer(stop_words = 'english')
text_embeddings = tfidf.fit_transform(data['description']) #МАТРИЦА
cosine_sim = linear_kernel(text_embeddings, text_embeddings) #ВЕКТОРЫ ПРИШЛИ НОРМАЛИЗОВАННЫМИ

In [22]:
#cosine_data = pd.DataFrame(pd.DataFrame(cosine_sim).unstack().sort_values(ascending = False))
cosine_data = pd.DataFrame(cosine_sim,
                           index = data.id.values,
                           columns = data.id.values) #ИНДЕКСАЦИЯ В СООТВЕТСВИИ С ID ДОК-ТА
cosine_data.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
1,1.0,0.224196,0.076623,0.086477,0.067941,0.045773,0.032097,0.002326,0.077415,0.006217,...,0.049025,0.022231,0.030805,0.304134,0.253021,0.217617,0.173909,0.074011,0.083066,0.053392
2,0.224196,1.0,0.454095,0.061135,0.091318,0.030321,0.064155,0.00198,0.051693,0.01601,...,0.028843,0.018923,0.053741,0.201276,0.163699,0.150125,0.071095,0.044397,0.107665,0.047847
3,0.076623,0.454095,1.0,0.05685,0.082491,0.037663,0.077258,0.007546,0.059098,0.010667,...,0.032706,0.021458,0.029733,0.128791,0.231893,0.091027,0.086333,0.050344,0.039301,0.028267


In [23]:
#ALPHA = порог значимости
def get_similar(doc, alpha = 0.2):
    #documents = enumerate(cosine_data.loc[doc], start=1)
    #sorted_documents = sorted(documents, key = lambda x: x[1], reverse = True)
    filtred_documents = cosine_data.loc[doc][cosine_data.loc[doc] > alpha]
    sorted_documents = filtred_documents.sort_values(ascending = False)
    similar_documents = sorted_documents[1:] #НУЛЕВУЮ НЕ БЕРЁМ = ТОТ ЖЕ ТОВАР
    return similar_documents.index

In [24]:
get_similar(2) #ДЕМОНСТРАЦИЯ РАБОТЫ

Int64Index([3, 300, 19, 1, 494], dtype='int64')

In [25]:
cosine_data.loc[2][cosine_data.loc[2] > 0.2] #ПРОВЕРКА

1      0.224196
2      1.000000
3      0.454095
19     0.224473
300    0.241254
494    0.201276
Name: 2, dtype: float64

### 3.2 С помощью Word2Vec

In [11]:
embeddings = gensim.downloader.load('word2vec-google-news-300') #ЗАГРУЗКА МОДЕЛИ

words_df = pd.DataFrame()
docs_df = pd.DataFrame()

for document in data.description:
    for word in document.split():
        if word not in stop_words:
            try:
                word_vector = embeddings[word]
            except:
                pass
            words_df = words_df.append(pd.Series(word_vector), ignore_index = True)
    doc_vector = words_df.mean() #Series
    docs_df = docs_df.append(doc_vector, ignore_index = True)

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

docs_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.050314,0.063278,-0.000972,0.082116,-0.05386,0.026812,0.067224,-0.093942,0.088861,0.140496,...,-0.0967,-0.015636,-0.051827,-0.030791,0.023613,-0.031848,0.081853,-0.047871,0.026281,0.035637
1,-0.060889,0.073999,-0.010183,0.068959,-0.075175,0.012902,0.074692,-0.059056,0.066102,0.109615,...,-0.0781,-0.0197,-0.043466,-0.029823,-0.004623,-0.015955,0.047257,-0.049199,0.018939,0.036264
2,-0.065388,0.085461,-0.024413,0.073966,-0.076954,0.005202,0.056151,-0.07129,0.063624,0.113803,...,-0.070393,-0.01345,-0.036206,-0.024904,0.003925,-0.016551,0.070185,-0.062012,0.021999,0.03307


In [14]:
cosine_sim = cosine_similarity(docs_df, docs_df)
cosine_data = pd.DataFrame(cosine_sim,
                           index = data.id.values,
                           columns = data.id.values) #ИНДЕКСАЦИЯ В СООТВЕТСВИИ С ID ДОК-ТА
cosine_data.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
1,1.0,0.965455,0.945284,0.940223,0.922498,0.917556,0.908886,0.907721,0.909651,0.908301,...,0.911868,0.911926,0.911972,0.912114,0.912238,0.912308,0.912335,0.91237,0.912382,0.912408
2,0.965455,1.0,0.988583,0.978624,0.961214,0.954965,0.94576,0.944747,0.945607,0.944427,...,0.939139,0.939194,0.939212,0.939339,0.939463,0.939548,0.9396,0.939615,0.939639,0.939663
3,0.945284,0.988583,1.0,0.98719,0.969165,0.961947,0.949472,0.94854,0.949523,0.947316,...,0.937191,0.937259,0.937285,0.93741,0.937553,0.937653,0.93773,0.937746,0.937766,0.937791


In [18]:
def get_similar(doc, alpha = 0.95):
    #documents = enumerate(cosine_data.loc[doc], start=1)
    #sorted_documents = sorted(documents, key = lambda x: x[1], reverse = True)
    filtred_documents = cosine_data.loc[doc][cosine_data.loc[doc] > alpha]
    sorted_documents = filtred_documents.sort_values(ascending = False)
    similar_documents = sorted_documents[1:] #НУЛЕВУЮ НЕ БЕРЁМ = ТОТ ЖЕ ТОВАР
    return similar_documents.index

In [19]:
get_similar(2) #ДЕМОНСТРАЦИЯ РАБОТЫ

Int64Index([3, 4, 1, 5, 6], dtype='int64')

In [20]:
cosine_data.loc[2][cosine_data.loc[2] > 0.95] #ПРОВЕРКА

1    0.965455
2    1.000000
3    0.988583
4    0.978624
5    0.961214
6    0.954965
Name: 2, dtype: float64

## Выводы

Результатом работы проекта является функция, которая получает на вход id товара, а на выходе выдаёт список id товаров, максимально похожих на него по описанию. За расстояние, по которому можно оценить близость векторов, было выбрано косинусное расстояние. За порог значимости alpha соответственно для инструмента tf-idf был выбран alpha = 0.2, а для Word2Vec был выбран alpha = 0.95 .