In [5]:
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

import gensim
import gensim.downloader
from gensim.models import word2vec, KeyedVectors

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Elyma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
df = pd.read_csv("data/sample-data.csv")


In [7]:
df.head(10)


Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
5,6,Ascensionist jkt - Our most technical soft she...
6,7,"Atom - A multitasker's cloud nine, the Atom pl..."
7,8,Print banded betina btm - Our fullest coverage...
8,9,Baby micro d-luxe cardigan - Micro D-Luxe is a...
9,10,Baby sun bucket hat - This hat goes on when th...


### Чистим данные от мусора и стоп слов

In [8]:
def text_clear_from_garb(text_pd, stop_chars, stop_words):
    text_pd = text_pd.apply(
        lambda x: re.compile(r"https?://\S+|www\.\S+").sub(r" ", x)
    )  # clear urls
    text_pd = text_pd.apply(lambda x: re.compile(r"<.*?>").sub(r" ", x))  # clear html
    text_pd = text_pd.apply(lambda x: x.lower())  # lowercase
    text_pd = text_pd.apply(
        lambda x: "".join([ch for ch in x if ch not in stop_chars])
    )  # clear stop chard
    text_pd = text_pd.apply(
        lambda x: "".join([ch for ch in x if ch not in string.digits])
    )  # clear digits
    text_pd = text_pd.apply(lambda x: " ".join(x.split()))  # clear spase
    text_pd = text_pd.apply(
        lambda x: " ".join([word for word in str(x).split() if word not in stop_words])
    )
    return text_pd

In [9]:
stop_chars = string.punctuation
stop_words = stopwords.words("english")
df["description_clear"] = text_clear_from_garb(
    df["description"], stop_chars=stop_chars, stop_words=stop_words
)

### Функции для подбора наиболее похожих товаров

In [93]:
def item_name(item_id):
    # find name of item by cuting left size before first pattern (space-space)
    result = df.loc[df["id"] == item_id]["description"].tolist()[0].split(" - ")[0]
    return result


def recomendation(cos_matirx, item_id, num):
    # get cosine matrix, item_id and number of items to get recomendation
    index = df.index[df["id"] == item_id].to_list()[0]  # get index of item in cosine_matrix
    same_index = cos_matirx[index].argsort()[: -(num + 2) : -1]  # get index of same items
    print(f"Топ {num} товаров, похожих на {item_name(item_id)}, (id: {item_id})")
    for s_index in same_index[1:]:
        print(
            f"Товар {item_name(df['id'][s_index])}, уровень совпадения: {cos_matirx[index][s_index]}, (id: {s_index})"
        )


def get_same(cos_matirx, item_id, num):
    # return list of items to build out_file in csv
    index = df.index[df["id"] == item_id].to_list()[0]  # get index of item in cosine_matrix
    same_index = cos_matirx[index].argsort()[: -(num + 2) : -1]  # get index of same items
    out_list = [item_name(item_id)]
    for s_index in same_index[1:]:
        out_list.append(
            f"{item_name(df['id'][s_index])}|уровень: {cos_matirx[index][s_index]:5f}| id: {s_index}"
        )
    return out_list

### Тексты в TFIDF

In [11]:
# tokenazier initial
tfidf_vec = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    min_df=0,
    stop_words="english",
    binary=True,
    max_features=25000,
)

# embedings
tfidf_embed = tfidf_vec.fit_transform(df["description_clear"])

# check shape
tfidf_embed.shape

(500, 25000)

In [12]:
cos_matrix = linear_kernel(tfidf_embed, tfidf_embed)
ids = [3, 4, 7, 62]
num = 3
for id in ids:
    recomendation(cos_matirx=cos_matrix, item_id=id, num=num)
    print('----------')

Топ 3 товаров, похожих на Active sport briefs, (id: 3)
Товар Active sport boxer briefs, уровень совпадения: 0.5288292850724223, (id: 1)
Товар Active briefs, уровень совпадения: 0.1247423393742939, (id: 299)
Товар Active boy shorts, уровень совпадения: 0.11137538822540924, (id: 298)
----------
Топ 3 товаров, похожих на Alpine guide pants, (id: 4)
Товар Alpine guide pants, уровень совпадения: 0.900038434073977, (id: 158)
Товар Guide jkt, уровень совпадения: 0.1240670307783811, (id: 183)
Товар Rock guide pants, уровень совпадения: 0.11773548636640709, (id: 342)
----------
Топ 3 товаров, похожих на Atom, (id: 7)
Товар Pocket pack, уровень совпадения: 0.2901524646556511, (id: 103)
Товар Hip pack, уровень совпадения: 0.26797964888901704, (id: 402)
Товар Single shot, уровень совпадения: 0.2068941619576152, (id: 353)
----------
Топ 3 товаров, похожих на El cap jkt, (id: 62)
Товар El cap jkt, уровень совпадения: 0.17654939935627162, (id: 179)
Товар Synch marsupial, уровень совпадения: 0.1199857

### Тексты в vord2vec

In [13]:
word_embedings = gensim.downloader.load("glove-wiki-gigaword-300")

In [14]:
texts = df["description_clear"].values

embed_matrix = np.zeros((len(texts), word_embedings.vector_size))

for ind, text in enumerate(texts):
    mean_word2vec = np.zeros(word_embedings.vector_size)
    num_words = 0
    tokenized_text = text
    # get embeding of words in doc
    for word in tokenized_text:
        try:  # pass the error if word not in word_embedings
            mean_word2vec += word_embedings[word]
            num_words += 1
        except:
            pass
    embed_matrix[ind] = mean_word2vec / num_words
    # normalize final vector
    embed_matrix[ind] = embed_matrix[ind] / np.sqrt(np.sum(embed_matrix[ind] ** 2))

In [15]:
cos_matrixv2c = linear_kernel(embed_matrix, embed_matrix)
ids = [3, 4, 7, 62]
num = 3
for id in ids:
    recomendation(cos_matirx=cos_matrixv2c, item_id=id, num=num)
    print("----------")

Топ 3 товаров, похожих на Active sport briefs, (id: 3)
Товар Active sport boxer briefs, уровень совпадения: 0.999149554998185, (id: 1)
Товар Borderless shorts, уровень совпадения: 0.9991470523848172, (id: 328)
Товар Drift shirt, уровень совпадения: 0.9990215582293205, (id: 282)
----------
Топ 3 товаров, похожих на Alpine guide pants, (id: 4)
Товар Alpine guide pants, уровень совпадения: 0.9999279022317773, (id: 158)
Товар Rock guide pants, уровень совпадения: 0.9995143758238204, (id: 342)
Товар Nine trails shorts, уровень совпадения: 0.9993655161871461, (id: 96)
----------
Топ 3 товаров, похожих на Atom, (id: 7)
Товар Mlc, уровень совпадения: 0.9995422284133677, (id: 429)
Товар Crosstown, уровень совпадения: 0.9993000184782383, (id: 30)
Товар Mlc wheelie, уровень совпадения: 0.9992764819559367, (id: 93)
----------
Топ 3 товаров, похожих на El cap jkt, (id: 62)
Товар El cap jkt, уровень совпадения: 0.9993184357661502, (id: 179)
Товар Aravis jkt, уровень совпадения: 0.9992554058128482, (

### TF-IDF vs Word2Vec

In [74]:
ids = [3, 4, 7, 62]
num = 3
for id in ids:
    print("TF-IDF")
    recomendation(cos_matirx=cos_matrix, item_id=id, num=num)
    print("----------")
    print("Word2Vec")
    recomendation(cos_matirx=cos_matrixv2c, item_id=id, num=num)
    print("----------")

TF-IDF
Топ 3 товаров, похожих на Active sport briefs, (id: 3)
Товар Active sport boxer briefs, уровень совпадения: 0.5288292850724223, (id: 1)
Товар Active briefs, уровень совпадения: 0.1247423393742939, (id: 299)
Товар Active boy shorts, уровень совпадения: 0.11137538822540924, (id: 298)
----------
Word2Vec
Топ 3 товаров, похожих на Active sport briefs, (id: 3)
Товар Active sport boxer briefs, уровень совпадения: 0.999149554998185, (id: 1)
Товар Borderless shorts, уровень совпадения: 0.9991470523848172, (id: 328)
Товар Drift shirt, уровень совпадения: 0.9990215582293205, (id: 282)
----------
TF-IDF
Топ 3 товаров, похожих на Alpine guide pants, (id: 4)
Товар Alpine guide pants, уровень совпадения: 0.900038434073977, (id: 158)
Товар Guide jkt, уровень совпадения: 0.1240670307783811, (id: 183)
Товар Rock guide pants, уровень совпадения: 0.11773548636640709, (id: 342)
----------
Word2Vec
Топ 3 товаров, похожих на Alpine guide pants, (id: 4)
Товар Alpine guide pants, уровень совпадения: 0.

### Формируем файлы с предсказаниями

In [95]:
### TF-IDF file constuct

get_outs = list()
for id in df['id'].values:
    get_outs.append(get_same(cos_matirx=cos_matrix, item_id=id, num=num))

columns_list = ['Item_name']
for cols in range(num):
    columns_list.append(f'Item number {cols+1}')
tfidf_df = pd.DataFrame(
    data=get_outs,
    columns=columns_list
)
tfidf_df.to_csv('data/tfidf_out.csv', encoding='utf8')

In [96]:
### word2vec file constuct

get_outs = list()
for id in df['id'].values:
    get_outs.append(get_same(cos_matirx=cos_matrixv2c, item_id=id, num=num))

columns_list = ['Item_name']
for cols in range(num):
    columns_list.append(f'Item number {cols+1}')
tfidf_df = pd.DataFrame(
    data=get_outs,
    columns=columns_list
)
tfidf_df.to_csv('data/word2vec_out.csv', encoding='utf8')