[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Nina-Konovalova/WildHack_Tags/blob/main/specifying_tag_generation_naive_clustering.ipynb)


##### Installation

In [None]:
# Upgrade spaCy to the latest release available to pip.
! pip install --upgrade spacy
# Download the `ru_core_news_lg` pipeline package that is loaded later via spacy.load.
! python -m spacy download ru_core_news_lg

##### Download

In [108]:
# Fetch the query CSV from Google Drive by file id; per the download log it is
# saved as /content/kapri_queries_unique.csv, the path read in the next section.
!gdown --id 1rzGMDi2KnFwGbWrq1yKnVZeV7s4fe2AU # unique queries from dataset for product 'капри'

Downloading...
From: https://drive.google.com/uc?id=1rzGMDi2KnFwGbWrq1yKnVZeV7s4fe2AU
To: /content/kapri_queries_unique.csv
  0% 0.00/19.7k [00:00<?, ?B/s]100% 19.7k/19.7k [00:00<00:00, 10.8MB/s]


##### Imports

In [109]:
import json

import numpy as np
import pandas as pd

from sklearn.cluster import KMeans, AgglomerativeClustering
import spacy

##### Read data

In [110]:
# Load the unique search queries from the downloaded CSV and preview the first rows.
queries_path = '/content/kapri_queries_unique.csv'
kapri_queries = pd.read_csv(queries_path)
kapri_queries.head()

Unnamed: 0,query
0,капри джинсовые
1,капри женские спортивные
2,капри женские домашние
3,капри женские
4,капри 68 р


##### Initial preprocessing

In [111]:
# Build a new frame instead of aliasing `kapri_queries`: plain assignment would
# make both names point at the same object, so lowercasing would silently
# rewrite the raw data as well and make this cell non-idempotent.
kapri_queries_clean = kapri_queries.assign(
    query=kapri_queries['query'].str.lower()  # lowercase
)

##### Transform queries into vectors with the use of pre-trained embeddings

In [112]:
# Load the `ru_core_news_lg` spaCy pipeline; the document vectors it produces
# are used below as query embeddings.
nlp = spacy.load('ru_core_news_lg')

In [113]:
# Show which components the loaded pipeline contains.
print(nlp.pipe_names)

['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [114]:
# Run every cleaned query through the spaCy pipeline (n_process=-1 lets spaCy
# pick the number of worker processes) and keep the resulting Doc objects.
query_texts = kapri_queries_clean['query'].values
docs = [*nlp.pipe(query_texts, n_process=-1)]

In [145]:
# Unit-normalised document vectors, one row per query. Queries whose vector
# norm is zero are skipped so the division is always defined.
X = np.array([doc.vector / doc.vector_norm for doc in docs if doc.vector_norm])

# Average-linkage agglomerative clustering on cosine distance.
# NOTE: the distance keyword is `metric`; the former `affinity` spelling was
# deprecated in scikit-learn 1.2 and removed in 1.4, where it raises TypeError.
model = AgglomerativeClustering(n_clusters=200, linkage="average", metric='cosine').fit(X)

In [146]:
# Cluster index assigned to each row of X (one label per query kept in X).
model.labels_

array([  6,   4,  27,   4, 155,   4, 184, 168,  64,   4,  23,  27,  36,
         4,  36,  58,  80,   4,   4,  24,   4,  11,  11, 193,   4,  90,
        48,  18, 151,  41,  35, 199,  10, 114,  36, 174,  72, 105,  28,
        10, 158, 141, 167,   3,  46,  44,  10,  14,   4, 133,  90,   4,
       159,  27,  52,   0,   3, 164,  17,  17,   4,   4, 112,   0,  37,
         4,   3,   2,  90,   1,   3,   3,  25, 185, 127,   3,   4,  80,
        80,  24,   7,  45,   9,   4,  10,   1, 143,  41,   9,   9,   9,
       148,  27, 145,   3,  80,  33,  99, 189,  40,  39,  26, 103,  20,
       181, 183, 138, 134,   4, 152, 129,  23, 120,  80,  83,  10,  48,
        45,   4, 144,   4,   4, 139,   3,   3,   4,  38, 190, 135,   3,
         3,  74, 140, 156,  30, 179,   4, 176,   3, 161,   3,   4,   4,
         4,   4,  76,  17,   4,   4,  25,  27,  52,  10,  10,  70, 194,
       192, 160,   2,   4,  12, 111,   4,   3,  17,   3, 169,   4, 175,
         2,  10,  70,  52,  50,   3, 131,   4,   4,   4,  64,  6

In [147]:
# Size of every cluster as (cluster_id, size) pairs. The count covers all
# `model.n_clusters_` clusters rather than a hard-coded range, so no cluster
# is left out of the ranking; np.bincount tallies the labels in one pass.
cluster_sizes = np.bincount(model.labels_, minlength=model.n_clusters_)
sizes = [(cluster_id, int(size)) for cluster_id, size in enumerate(cluster_sizes)]

# Five largest clusters, biggest first.
sorted(sizes, key=lambda x: x[1], reverse=True)[:5]

[(4, 65), (3, 38), (10, 22), (27, 11), (36, 11)]

In [153]:
# Query texts aligned row-for-row with X: the same non-zero-norm filter is
# applied, so vocab[i] is the text behind model.labels_[i].
vocab = np.array([str(kept) for kept in filter(lambda d: d.vector_norm, docs)])

In [149]:
# Inspect a single cluster: report its size, then list its queries alphabetically.
cluster_i = 27
in_cluster = model.labels_ == cluster_i  # boolean mask over the clustered queries
print(f'Cluster {cluster_i} with size {sum(model.labels_ == cluster_i)}:')
sorted(vocab[in_cluster])

Cluster 27 with size 11:


['бриджи капри домашние',
 'брюки капри домашние',
 'домашние капри женские',
 'капри домашние',
 'капри домашние мужские',
 'капри домашние трикотажные',
 'капри женские домашние',
 'капри женскиееапри домашние',
 'капри легенцы домашние',
 'капри мужские домашние',
 'капри трикотаж домашние']

##### Generate top-k clarifying tags (k = 7 by default) for a random query from the list

In [200]:
# Pick one clustered query uniformly at random and look up its text and cluster.
clustered_queries = [str(doc) for doc in docs if doc.vector_norm]
query_id = np.random.randint(len(model.labels_))
query = clustered_queries[query_id]
cluster_id = model.labels_[query_id]
('Query', query, 'Cluster', cluster_id)

('Query', 'капри и блузка летние', 'Cluster', 9)

In [201]:
# Candidate clarifying tags: one randomly chosen query from every cluster
# except the one the current query belongs to.
# The exclusion compares against `cluster_id` (the query's cluster label);
# comparing with `query_id` (a row index) would skip an unrelated cluster and
# could keep the query's own cluster among the candidates.
clarifying_tags = [
    np.random.choice(vocab[model.labels_ == i]).lower()
    for i in range(model.n_clusters_)
    if i != cluster_id
]

In [203]:
def choose_top_k(tags, k=7):
    """Pick up to ``k`` tags uniformly at random, without repeats.

    This is a placeholder for a similarity-based ranking: no similarity is
    computed here, the selection is purely random.

    Args:
        tags: Sequence of candidate tag strings.
        k: Maximum number of tags to return.

    Returns:
        A 1-D numpy array holding ``min(k, len(tags))`` entries of ``tags``,
        sampled without replacement so that no position is picked twice.
        An empty array is returned when there is nothing to pick.
    """
    n_picks = max(0, min(k, len(tags)))
    if n_picks == 0:
        # Nothing to sample; avoid calling np.random.choice on an empty population.
        return np.array([], dtype=str)
    # replace=False: sampling with replacement could return the same tag
    # several times in one result.
    return np.random.choice(tags, n_picks, replace=False)

# Display the query followed by the unpacked tags returned by choose_top_k
# (called with its default k).
'Query', query, 'Clarifying_tags', *choose_top_k(clarifying_tags)

('Query',
 'капри и блузка летние',
 'Clarifying_tags',
 'женские капри вельвет',
 'капри мальчику',
 'капри из льна',
 'капри для спорта с высокой посадкой',
 'портьера капри канвас',
 'купить капри женские',
 'ремешок для apple watch 44 капри blue')