## Installations

In [None]:
# !pip install -U spacy >=2.3.2

In [None]:
# !wget https://github.com/buriy/spacy-ru/releases/download/v2.3_beta/ru2_combined_400ks_96.zip
# !unzip ru2_combined_400ks_96.zip

In [None]:
!pip install pymorphy2

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from google.colab import drive
drive.flush_and_unmount()

## Data extraction

In [None]:
import numpy as np
import pandas as pd

### query_popularity

In [None]:
cd /content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials

/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials


In [None]:
import pandas as pd
import numpy as np

query_popularity = pd.read_csv("/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/query_popularity.csv")

In [None]:
query_popularity.shape

(336988, 2)

### ru dataset with russian words

## Data preprocessing

### Spacy lemmanizer

In [None]:
import spacy
spacy.cli.download("ru_core_news_lg")

In [None]:
nlp = spacy.load("ru_core_news_lg")
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)
doc = nlp("Я купила красивую кофточку")
print([token.lemma_ for token in doc])

pymorphy2
['я', 'купить', 'красивый', 'кофточка']


### Snowball Stemmization

In [None]:
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer(language='russian')
snowball.stem('Осенняя')


### Pymorphy2 lemmanizer

In [None]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
morph.parse('фотоаппаратов')[0].normal_form

### Pymystem3

In [None]:
from pymystem3 import Mystem
mystem = Mystem() 

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


### Chosen preprocessing

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import pymorphy2
nltk.download('punkt')

nltk.download('stopwords')

russian_stopwords = stopwords.words("russian")
morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def data_preprocessing(text):

  text = text.lower()
  words = word_tokenize(text, language='russian')
  words = [word for word in words if not word in russian_stopwords]
  words = [morph.parse(word)[0].normal_form for word in words]
  return words

In [None]:
query_popularity['preprocessed_tokens'] = query_popularity['query'].apply(lambda x: data_preprocessing(x))

## Words ---> Embeddings

### Пример работы с эмбединнгами из лекций Панченко

In [None]:
!wget http://panchenko.me/slides/nnlp/data/cc.ru.300.vec.zip

In [None]:
!unzip cc.ru.300.vec.zip

Archive:  cc.ru.300.vec.zip
  inflating: cc.ru.300.vec           


In [None]:
import gensim
import numpy as np
from gensim.models import KeyedVectors

import requests

ru_emb = KeyedVectors.load_word2vec_format("cc.ru.300.vec")

In [None]:
ru_emb.most_similar([ru_emb["кофта"]], topn=10)

[('кофта', 1.0),
 ('кофточка', 0.8027787804603577),
 ('водолазка', 0.7804529666900635),
 ('блуза', 0.7505505084991455),
 ('курточка', 0.7494008541107178),
 ('блузка', 0.7484604120254517),
 ('рубашка', 0.7404817342758179),
 ('жилетка', 0.7383711338043213),
 ('туника', 0.7328645586967468),
 ('безрукавка', 0.7282662391662598)]

### Gensim + fasttext- рабочая модель

In [None]:
import gensim

In [None]:
# model download. For this example we will use fasttex pretrained model.
!wget http://vectors.nlpl.eu/repository/20/214.zip

--2021-11-14 04:26:07--  http://vectors.nlpl.eu/repository/20/214.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1920218982 (1.8G) [application/zip]
Saving to: ‘214.zip’


2021-11-14 04:27:24 (24.1 MB/s) - ‘214.zip’ saved [1920218982/1920218982]



In [None]:
!unzip 214.zip -d ru_fasttext_model

Archive:  214.zip
  inflating: ru_fasttext_model/meta.json  
  inflating: ru_fasttext_model/model.model  
  inflating: ru_fasttext_model/model.model.vectors_ngrams.npy  
  inflating: ru_fasttext_model/model.model.vectors.npy  
  inflating: ru_fasttext_model/model.model.vectors_vocab.npy  
  inflating: ru_fasttext_model/README  


In [None]:
ru_fasttext_model = gensim.models.KeyedVectors.load('/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/ru_fasttext_model/model.model')

In [None]:
ru_fasttext_model.get_vector("кофта")

array([-3.16673875e-01, -3.44897330e-01,  4.98045444e-01, -1.57779321e-01,
       -6.17443144e-01, -3.21173728e-01, -1.04907066e-01,  3.79439890e-01,
       -1.23112172e-01, -3.99725318e-01,  1.45621877e-02, -3.94380063e-01,
       -2.24752501e-01, -2.75740493e-02, -1.89597514e-02, -2.70162374e-01,
       -3.05854797e-01, -8.04790184e-02,  7.26092041e-01, -2.69159973e-01,
       -4.83959705e-01,  2.09891170e-01, -1.67212591e-01,  5.16246259e-01,
       -4.71317954e-02, -8.39082003e-01,  1.46864861e-01, -1.10908076e-01,
        4.87202823e-01,  1.52310997e-01,  5.88977098e-01, -2.81061918e-01,
       -2.14366186e-02, -3.93366665e-01, -6.96267039e-02, -4.12223995e-01,
        2.22060591e-01, -1.57052681e-01, -1.45617858e-01,  8.75454396e-02,
       -2.16176212e-01, -1.75142661e-01, -1.50720969e-01, -1.61035076e-01,
       -1.24751791e-01, -1.32906690e-01, -4.53682780e-01, -8.44563171e-02,
        1.25200972e-01,  3.62179697e-01, -3.63375366e-01,  3.64154190e-01,
        5.48803881e-02, -

In [None]:
tags = ru_fasttext_model.most_similar(positive=["офис", "набор"])

In [None]:
tags

[('комплект', 0.6443444490432739),
 ('наборчик', 0.6285516023635864),
 ('магазин', 0.5845011472702026),
 ('комлект', 0.5763264894485474),
 ('корзинунабор', 0.5691683292388916),
 ('минимаркет', 0.5665059685707092),
 ('склад-магазин', 0.5617548227310181),
 ('офисное', 0.5520975589752197),
 ('инструментарий', 0.5514048933982849),
 ('продуктовый', 0.5509083271026611)]

In [None]:
tags

[('демисезонная', 0.8067998290061951),
 ('курточка', 0.7757225036621094),
 ('толстовка', 0.7682155966758728),
 ('жилетка', 0.7641149163246155),
 ('ветровка', 0.7575536370277405),
 ('рубашка', 0.7310628294944763),
 ('куртки', 0.7197908163070679),
 ('ушанка', 0.7146055102348328),
 ('шапка', 0.7133417129516602),
 ('меховая', 0.7121798992156982)]

### Download fasttext pre-trained models

In [None]:
# !wget http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_lemmatize/ft_native_300_ru_wiki_lenta_lemmatize.bin

In [None]:
# !wget http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin

### Fasttext установка

In [None]:
! wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip

In [None]:
cd fastText-0.9.2

/content/drive/My Drive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/fastText-0.9.2


In [None]:
!make

In [None]:
!pip install .

### Fasttext models

In [None]:
cd /content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/fastText-0.9.2

/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/fastText-0.9.2


In [None]:
model1 = fasttext.load_model("/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/fasttext_bins/ft_native_300_ru_twitter_nltk_word_tokenize.bin")

In [None]:
model1.get_nearest_neighbors("кошка")

[(0.886544942855835, 'собака'),
 (0.8334879279136658, 'собачка'),
 (0.8285769820213318, 'кошка.'),
 (0.8281669020652771, 'Кошка'),
 (0.8268365263938904, 'коткошка'),
 (0.8147615194320679, 'кошечка'),
 (0.7972818613052368, 'кошка,'),
 (0.7970927953720093, ',кошка'),
 (0.7897952795028687, 'соседская'),
 (0.783193826675415, 'черепаха')]

In [None]:
cat_vec = model1.get_word_vector('кошка')
dog_vec = model1.get_word_vector('собака')
kitty_vec = model1.get_word_vector('кошeчка')

In [None]:
from scipy import spatial
1 - spatial.distance.cosine(cat_vec, dog_vec)

In [None]:
1 - spatial.distance.cosine(cat_vec, dog_vec)

0.8865448832511902

In [None]:
1 - spatial.distance.cosine(model1.get_word_vector('кофта'), model1.get_word_vector('кофточка'))

0.8967674970626831

In [None]:
query_popularity.dropna(inplace=True)

In [None]:
query_popularity['word2vec'] = query_popularity['query'].apply(lambda x: model1.get_word_vector(x))

## Find tags

### Download dataset with preprocessed tokens

In [None]:
from scipy import spatial
import numpy as np
import pandas as pd
# import fasttext

In [None]:
query_popularity = pd.read_pickle('/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/query_popularity_preprocessed_word2vec.pkl')

### Delete english and digits



In [None]:
import string

new_data = query_popularity[query_popularity['query'] >= 'а' and query_popularity['query'] >= 'z']
english_array = string.ascii_uppercase+string.ascii_lowercase + string.digits
eng = [i for i in english_array]

all_tokens = []
cnt = 0
for token in new_data['preprocessed_tokens']:
  for t in token:
    cnt = 0
    for letter in t:
      if letter in eng:
        cnt += 1
    if cnt == 0:
      all_tokens.append(t)

In [None]:
all_tokens = pd.Series(list(set(all_tokens)))

In [None]:
token2vec = []
token = list(set(all_tokens))
M = len(token)

for i in range(M):
  try:
    token2vec.append(ru_fasttext_model.get_vector(token[i]))
  except:
    token2vec.append(None)

In [None]:
tok2vec = pd.Series(token2vec)

In [None]:
series = { 'token': all_tokens, 'token2vec': tok2vec }
dataframe = pd.DataFrame(series)

In [None]:
dataframe.dropna(inplace=True)

In [None]:
# dataframe.drop(['cluster'],axis=1,inplace=True)

In [None]:
dataframe

Unnamed: 0,token,token2vec
1,марни,"[0.16677357, 0.08101124, 0.22967526, -0.233135..."
2,фитинг,"[-0.44259387, -0.13088427, 0.3603452, 0.489628..."
4,камаз,"[-0.3917158, -0.029374126, 0.24775603, -0.1479..."
6,ёлочка,"[-0.0111735985, 0.1611769, 0.1230761, -0.35288..."
9,нары,"[0.17737818, -0.21898988, 0.10689692, 0.019172..."
...,...,...
38087,суфле,"[-0.112649165, -0.5916668, 0.23877941, -0.2266..."
38089,рай,"[-0.0038947803, -0.053480785, 0.08424123, 0.26..."
38090,ёрш,"[-0.420692, 0.34631017, -0.41336983, 0.1567766..."
38091,цвейг,"[-0.058493577, 0.023739407, 0.082857296, -0.03..."


In [None]:
dataframe.to_pickle("/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/token2vec_Assel.pkl")

### token2vec to np.array

##### dataframe 1

In [None]:
import pandas as pd
dataframe = pd.read_pickle('/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/token2vec_Assel.pkl')

In [None]:
dataframe

Unnamed: 0,token,token2vec
1,марни,"[0.16677357, 0.08101124, 0.22967526, -0.233135..."
2,фитинг,"[-0.44259387, -0.13088427, 0.3603452, 0.489628..."
4,камаз,"[-0.3917158, -0.029374126, 0.24775603, -0.1479..."
6,ёлочка,"[-0.0111735985, 0.1611769, 0.1230761, -0.35288..."
9,нары,"[0.17737818, -0.21898988, 0.10689692, 0.019172..."
...,...,...
38087,суфле,"[-0.112649165, -0.5916668, 0.23877941, -0.2266..."
38089,рай,"[-0.0038947803, -0.053480785, 0.08424123, 0.26..."
38090,ёрш,"[-0.420692, 0.34631017, -0.41336983, 0.1567766..."
38091,цвейг,"[-0.058493577, 0.023739407, 0.082857296, -0.03..."


#### dataframe 2

In [None]:
import numpy as np

reqs = np.load('/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/words.npy')

In [None]:
'Я' > 'я'

False

In [None]:
reqs

array(['!', '#', '$', ..., '🦊', '🧸🤖', '\U0001faa5'], dtype='<U197')

In [None]:
'!' > 'А'

False

In [None]:
import string

# new_data = query_popularity[query_popularity['query'] >= 'а' and query_popularity['query'] <= 'z']
# english_array = string.ascii_uppercase+string.ascii_lowercase + string.digits

# eng = [i for i in english_array]


all_tokens = []

for token in reqs:
  cnt = 0
  for letter in token:
    if letter < 'А' or letter > 'я':
      cnt += 1
  if cnt == 0:
    all_tokens.append(t)

In [None]:
all_tokens

['\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001faa5',
 '\U0001fa

In [None]:
all_tokens = pd.Series(list(set(all_tokens)))

In [None]:
len(all_tokens)

1

In [None]:
all_tokens

0    🪥
dtype: object

#### preprocess

In [None]:
import numpy as np
dataframe = dataframe.reset_index(drop=True)
token2vec_np = [np.array(dataframe.token2vec[i]) for i in range (len(dataframe.token2vec))]
X = np.array(token2vec_np)

### create token2vec for preprocessed_tokens column

In [None]:
query_popularity['word2vec_ru_fasttext'] = query_popularity['preprocessed_tokens']

In [None]:
query_popularity['preprocessed_tokens'][14]

['poco', 'f3']

In [None]:
N = len(query_popularity['preprocessed_tokens'])
bad_words = []
for i in range(N):
    tokens = query_popularity['preprocessed_tokens'][i]
  try:
    query_popularity['word2vec_ru_fasttext'][i] = [ru_fasttext_model.get_vector(word) for word in tokens]
  except:
    query_popularity['word2vec_ru_fasttext'][i] = None

In [None]:
query_popularity

Unnamed: 0,query,query_popularity,word2vec,preprocessed_tokens
0,ноутбук,10,"[-0.31533813, 0.22640598, -0.14456888, 0.78250...",[ноутбук]
1,куртка женская осенняя,10,"[0.12441584, 0.018864037, -0.5093914, 0.273392...","[куртка, женский, осенний]"
2,ботинки женские,10,"[0.11589693, -0.2578671, -0.5909001, -0.186843...","[ботинок, женский]"
3,видеокарта,10,"[0.68164533, 0.5142049, -0.023444502, 0.700806...",[видеокарта]
4,пальто женское осеннее,10,"[0.45430318, 0.11048531, -0.25905058, 0.166942...","[пальто, женский, осенний]"
...,...,...,...,...
336983,конструктор робо-динозавры,1,"[0.22063358, -0.33766633, -0.4480671, 0.038485...","[конструктор, робо-динозавр]"
336984,чика,1,"[0.3406589, 1.006835, -0.5743846, 0.5203182, -...",[чик]
336985,Дешевый товар,1,"[0.12944888, 0.33708918, -0.29052824, 0.028786...","[дешёвый, товар]"
336986,макса,1,"[0.3828721, 0.33940908, 0.03207649, 0.06045904...",[макс]


In [None]:
query = 'очки'

In [None]:
query = data_preprocessing(query)

In [None]:
query

['очки', 'солнцезащитный']

In [None]:
query2vec = [ru_fasttext_model.get_vector(word) for word in query]

In [None]:
tags = ru_fasttext_model.most_similar(positive=query)

In [None]:
ru_fasttext_model.most_similar('очки')

[('солнцезащитные', 0.7249504327774048),
 ('очков', 0.6945763826370239),
 ('очковые', 0.6848100423812866),
 ('линзы', 0.6669591069221497),
 ('шлемы', 0.6518011093139648),
 ('оправы', 0.6515406370162964),
 ('диоптриями', 0.6498522758483887),
 ('линзами', 0.6361788511276245),
 ('-очки', 0.6304699182510376),
 ('очками', 0.59984290599823)]

In [None]:
query_popularity['word2vec'][0]

array([-0.31533813,  0.22640598, -0.14456888,  0.78250855,  0.5476514 ,
       -0.04088544, -0.68912655, -0.49177408,  0.09105945, -0.43312305,
       -0.24829242,  0.5405648 , -0.43978173,  0.74732304, -0.2860573 ,
        0.2344339 , -0.15396622, -0.05094039,  0.45775563, -0.40490985,
       -0.71636796, -0.6566225 ,  0.30242717,  0.25560272,  0.06083553,
       -0.37715924,  0.6778109 , -1.5544797 , -0.5901519 , -0.08893455,
        0.28033814, -0.2698118 , -0.4821361 , -0.9521838 , -0.6686404 ,
       -1.0886762 ,  0.06134065,  0.11470045, -0.28851038, -0.25580743,
        0.3800123 ,  0.5221255 ,  0.54226017,  0.60789156, -0.6147588 ,
        0.345372  ,  0.42067587,  0.16558208,  0.20012994, -0.07414536,
        0.20856738,  0.01317644,  0.45680052, -0.31978396, -0.66548604,
        0.33151245,  0.25475678,  0.13717723,  0.11737997,  0.37827417,
        0.47148255, -0.7741835 , -0.49492162, -0.33481205,  0.90838087,
       -0.8118403 , -0.06270419,  0.09344159,  0.52947557,  0.28

In [None]:
query2vec[0].shape

(300,)

In [None]:
vec2.shape

(100,)

In [None]:
cosine_similarity = []
for i in range(len(query_popularity['word2vec'])):
    name = query_popularity['query'][i]
    vec2 = query_popularity['word2vec'][i]
    cos_sim = 1 - spatial.distance.cosine(query2vec[0], vec2)
    cosine_similarity.append([name,vec2])

In [None]:
model1 = fasttext.load_model("/content/drive/MyDrive/Skoltech/Padre_Pio_XIII/WildHack/Assel_materials/fasttext_bins/ft_native_300_ru_twitter_nltk_word_tokenize.bin")

def cosine_similarity(query,df):

    final_tags = []
    query = data_preprocessing(query)
    query2vec = ru_fasttext_model.get_vector(query)
    # ru_fasttext_model.most_similar([ru_emb["кофта"]], topn=10)
    tags = ru_fasttext_model.most_similar(positive=query)
    cosine_similarity = []
    for i in range(len(df['word2vec'])):
        name = df['query'][i]
        vec2 = df['word2vec'][i]
        cos_sim = 1 - spatial.distance.cosine(query, vec2)
        cosine_similarity.append([name,])




    return 

In [None]:
query = 'праздничная кофта'

### cluster creation

#### kmeans clustering

In [None]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=50, random_state=0).fit(X)

#### birch

In [None]:
from sklearn.cluster import Birch
model = Birch(n_clusters=50).fit(X)

#### create cluster column

In [None]:
dataframe['cluster'] = pd.Series(np.array(model.labels_))

In [None]:
dataframe.cluster.value_counts()

15    853
3     785
7     624
4     590
22    559
6     558
19    541
2     498
27    498
25    444
31    430
43    420
23    412
18    396
12    382
1     377
10    375
26    358
13    348
8     343
45    342
5     335
37    317
30    316
14    314
35    300
33    284
34    271
9     266
49    254
21    247
32    236
44    219
28    217
16    216
0     191
17    186
20    185
24    180
11    174
47    173
29    167
38    167
36    160
39    153
42    124
46    121
41    117
48    112
40     99
Name: cluster, dtype: int64

### find tags in clusters

In [None]:
def cluster_define(model, word):
  words = data_preprocessing(word)

  words_embeddings = [ru_fasttext_model.get_vector(word) for word in words]
  cluster_num = []
  for emb in words_embeddings:
    cluster_num.append(model.predict(emb.reshape(1,-1)))

  return cluster_num, words, words_embeddings

In [None]:
def find_tags(cluster_num,words,words_embeddings):
  tags = {}
  for i in range(len(cluster_num)):
    tags[words[i]] = [words_embeddings[i]]
    tags[words[i]].append(dataframe[dataframe['cluster']== cluster_num[i][0]].loc[:,['token','token2vec']])
  return tags

In [None]:
from scipy import spatial

def cosine_similarity(vec1,vec2):
  return 1 - spatial.distance.cosine(vec1, vec2)

def process_tags(tags):

  for word in tags.keys():
    word_embedding = tags[word][0]
    df = tags[word][1]
    df['cosine'] = df['token2vec'].apply(lambda x: cosine_similarity(word_embedding,x))

    tag = np.array(df.sort_values(by='cosine',ascending=False).loc[:,'token'].head(5))
    print(tag)


In [None]:
cluster_num,words,words_embeddings =cluster_define(brc,'кофточка нарядная')
tags = find_tags(cluster_num,words,words_embeddings)

In [None]:
process_tags(tags)

['ботитнка' 'кофта' 'вечинница' 'сладкийбокс' 'туника']
['наружное' 'изящный' 'красивый' 'шархан' 'немой']


In [None]:
def give_tags(kmeans,query):

  cluster_num,words,words_embeddings =cluster_define(kmeans,query)
  tags = find_tags(cluster_num,words,words_embeddings)

  return process_tags(tags)

## Final whole tag creation process. 

To run the whole cells, you need:

1. Fasttext model installation and load
2. Pymorphy installation
3. Load clustering model and dataframe from Github: "assel_dataframe.pkl","assel_kmeans_model.pkl"



#### Prestep

##### Fasttext model install and load

In [4]:
!pip install gensim



In [5]:
!wget http://vectors.nlpl.eu/repository/20/214.zip
!unzip 214.zip -d ru_fasttext_model

--2021-11-14 12:23:35--  http://vectors.nlpl.eu/repository/20/214.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1920218982 (1.8G) [application/zip]
Saving to: ‘214.zip’


2021-11-14 12:24:51 (24.3 MB/s) - ‘214.zip’ saved [1920218982/1920218982]

Archive:  214.zip
  inflating: ru_fasttext_model/meta.json  
  inflating: ru_fasttext_model/model.model  
  inflating: ru_fasttext_model/model.model.vectors_ngrams.npy  
  inflating: ru_fasttext_model/model.model.vectors.npy  
  inflating: ru_fasttext_model/model.model.vectors_vocab.npy  
  inflating: ru_fasttext_model/README  


In [6]:
path_to_model = 'ru_fasttext_model/model.model'

##### Install required libraries

In [None]:
!pip install pymorphy2
!pip install pickle

#### Main process

In [26]:
import numpy as np
import pickle
import pandas as pd
import gensim
from scipy import spatial
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
import pymorphy2
nltk.download('punkt')
nltk.download('stopwords')

In [23]:
def data_preprocessing(text):

  russian_stopwords = stopwords.words("russian")
  morph = pymorphy2.MorphAnalyzer()

  text = text.lower()
  words = word_tokenize(text, language='russian')
  words = [word for word in words if not word in russian_stopwords]
  words = [morph.parse(word)[0].normal_form for word in words]
  return words

def cluster_define(model, word):
  words = data_preprocessing(word)

  words_embeddings = [ru_fasttext_model.get_vector(word) for word in words]
  cluster_num = []
  for emb in words_embeddings:
    cluster_num.append(model.predict(emb.reshape(1,-1)))

  return cluster_num, words, words_embeddings

def find_tags(cluster_num,words,words_embeddings):
  tags = {}
  for i in range(len(cluster_num)):
    tags[words[i]] = [words_embeddings[i]]
    tags[words[i]].append(dataframe[dataframe['cluster']== cluster_num[i][0]].loc[:,['token','token2vec']])
  return tags


def cosine_similarity(vec1,vec2):
  return 1 - spatial.distance.cosine(vec1, vec2)

def process_tags(tags):

  for word in tags.keys():
    word_embedding = tags[word][0]
    df = tags[word][1]
    df['cosine'] = df['token2vec'].apply(lambda x: cosine_similarity(word_embedding,x))

    tag = np.array(df.sort_values(by='cosine',ascending=False).loc[:,'token'].head(5))
    print(tag)

def give_tags(model,query):

  cluster_num,words,words_embeddings =cluster_define(model,query)
  tags = find_tags(cluster_num,words,words_embeddings)

  return process_tags(tags)

In [7]:
ru_fasttext_model = gensim.models.KeyedVectors.load(path_to_model)
dataframe = pd.read_pickle('/content/dataframe.pkl')
model = pickle.load(open('kmeans_model.pkl', 'rb'))

In [27]:
tags = give_tags(model,'свитер')

['галантус' 'свитерок' 'вязаный' 'пуловер' 'джемпер']


In [28]:
tags = give_tags(model,'джинсы')

['пастеризовать' 'брюки' 'шорты' 'пиджак' 'джинсовый']


In [29]:
tags = give_tags(model,'канцелярские')

['канцелярский' 'клей-карандаш' 'бодитон' 'пенал' 'фломастер']
