In [62]:
import os
import pandas as pd
import plotly.express as px
import progressbar
import json
import numpy as np
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from functions import open_product_zip_file, files_to_df_extended, delete_empty_rows
from functions import remove_punctuations, extract_most_similar
from nltk.tokenize import word_tokenize
pd.set_option('display.max_colwidth', None)

## Get the paths and build table names for iteration

In [10]:
data_path = '../src/data'
mapping_corpus_path = data_path + r'/product/lspc2020_to_tablecorpus'
mapping_corpus_path_2 = data_path + r'/product/lspc2020_to_tablecorpus/Cleaned'
table_corpus_path = data_path + r'/product/product_top100/cleaned'
table_corpus_path_with_id = data_path + r'/product/product_top100/cleaned/with_id'
table_corpus_path2 = data_path + r'/product/product_minimum3/cleaned/with_id'

In [11]:
zip_files_mapping = [file for file in os.listdir(mapping_corpus_path_2) if file.endswith('.json.gz')]
zip_files_tables = [file for file in os.listdir(table_corpus_path) if file.endswith('.json.gz')]

## Match the number dictionaries with the information about the brand

In [12]:
df_joined_electronics = pd.read_json(os.path.join(mapping_corpus_path_2, 'joined_electronics'), compression='gzip', orient='records', lines=True)

In [114]:
df_joined_clothes = pd.read_json(os.path.join(mapping_corpus_path_2, 'joined_clothes'), compression='gzip', orient='records', lines=True)

In [15]:
# df_tables_count=df_join.groupby('cluster_id').count()
# df_tables_count[df_tables_count['brand']>2]

## Get information about electronic clusters and train model

In [16]:
df_grouped_electronics = df_joined_electronics.groupby('cluster_id').count()
# only look at clusters that have at least one brand associated
df_set_electronics = df_grouped_electronics[df_grouped_electronics['brand']>0].reset_index()[['cluster_id','table_id']].rename(columns={'table_id':'Amount'})

In [18]:
# We discard all clusters with less than 2 entries, cause we cannot match anything there, so 1,6 million clusters remain
df_set_electronics=df_set_electronics[df_set_electronics['Amount']>1]
df_15_electronics=df_set_electronics[df_set_electronics['Amount']>15]
df_15_electronics

Unnamed: 0,cluster_id,Amount
0,419,6
3,968,4
4,985,14
5,997,3
6,1040,21
...,...,...
104491,80393061,2
104517,80418008,3
104529,80434638,4
104532,80436986,2


In [89]:
#merge brand name to cluster amount
df_cluster_brand = df_15_electronics[df_15_electronics['Amount']<200].merge(df_joined_electronics.dropna()[['cluster_id','brand']].drop_duplicates('cluster_id', keep='last'), left_on=['cluster_id'], right_on = ['cluster_id'], how='left')
df_cluster_brand

Unnamed: 0,cluster_id,Amount,brand
0,1040,21,nikon
1,6076,19,canon
2,6443,25,canon
3,6505,16,apple
4,6690,20,cisco
...,...,...,...
326,76869317,60,samsung
327,78260404,80,samsung
328,79512161,16,xiaomi
329,80097153,60,samsung


In [229]:
df_joined_electronics[df_joined_electronics['cluster_id']==9046]

Unnamed: 0,cluster_id,url,row_id,table_id,Valid,brand,product_name
3588418,9046,https://www.bevmo.com/shop/wine/dessert_sherry_and_port/port/warre_s_otima_10_year_tawny_port_500_ml/p/1564405684704479965,6535,Product_bevmo.com_September2020.json.gz,1,,1564405684704479965
3735570,9046,https://www.cclonline.com/product/250247/YD260XBCAFBOX/CPU-Processors/AMD-2nd-Gen-Ryzen-5-2600X-3-6GHz-Processor-16MB-L3-Cache-with-Wraith-Spire-Boxed-/CPU0538,996,Product_cclonline.com_September2020.json.gz,1,,cpu0538
5247552,9046,https://www.scan.co.uk/products/amd-ryzen-5-2600x-am4-zenplus-6-core-12-thread-36ghz-425ghz-turbo-19mb-cache-95w-cpu-retail-plus-wra,3658,Product_scan.co.uk_September2020.json.gz,1,amd,amd ryzen 5 2600x am4 zenplus 6 core 12 thread 36ghz 425ghz turbo 19mb cache 95w cpu retail plus wra
6831661,9046,https://www.informaticamurciapc.com/procesadores/4004-cpu-amd-ryzen-5-2600x-am4-0730143309226.html,231,Product_informaticamurciapc.com_September2020.json.gz,1,,4004 cpu amd ryzen 5 2600x am4 0730143309226
6958564,9046,https://www.njwineseller.com/products/13267950/warre-s-otima-tawny-port-10-year-old,461,Product_njwineseller.com_September2020.json.gz,1,,warre s otima tawny port 10 year old
7888890,9046,https://www.bedertec.com.ar/prod/90-ryzen-5-2600x,110,Product_bedertec.com.ar_September2020.json.gz,1,,90 ryzen 5 2600x
8183469,9046,https://samspc.com/products/amd-ryzen,163,Product_samspc.com_September2020.json.gz,1,,amd ryzen
8385143,9046,https://www.bottleshop.com/products/3338608/warre-s-otima-tawny-port-10-year-old,592,Product_bottleshop.com_September2020.json.gz,1,,warre s otima tawny port 10 year old
8666806,9046,https://www.minipriceexpress.com/informatica/procesadores/procesador-amd-ryzen-5-2600x-425ghz-am4/172199/0,152,Product_minipriceexpress.com_September2020.json.gz,1,,0
8671708,9046,https://www.bustersliquors.com/products/10089069/warre-s-otima-tawny-port-10-year-old,2322,Product_bustersliquors.com_September2020.json.gz,1,,warre s otima tawny port 10 year old


In [23]:
#get a new column with the product names from the url
df_joined_electronics['product_name'] = df_joined_electronics['url'].apply(lambda row: row.split("/")[-1])
#clean product column and lowercase
df_joined_electronics['product_name'] = df_joined_electronics['product_name'].str.replace('+',' ').str.replace('-',' ').str.replace('.html', '').apply(lambda row: row.lower())
df_joined_electronics
#get only cluster ids with at least one brand electronics
df_compare_electronics = df_joined_electronics[df_joined_electronics['cluster_id'].isin(df_set_electronics['cluster_id'].tolist())]
#merge with set to get amount of tables per cluster in overview
df_compare_electronics = df_compare_electronics.merge(df_set_electronics, left_on=['cluster_id'], right_on = ['cluster_id'], how='left')

In [115]:
#use tokenizer for product names to get tokes for training the model
df_compare_electronics['product_tokes'] = df_compare_electronics['product_name'].apply(lambda row: word_tokenize(row))
#get tagged words
tagged_data = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(df_compare_electronics['product_tokes'])]
# build model and vocabulary
model = Doc2Vec(vector_size=50, min_count = 5, epochs = 25, dm = 0)
model.build_vocab(tagged_data)
# Train model
model.train(tagged_data, total_examples=model.corpus_count, epochs=25)

In [142]:
# # get top5 similar entities of tag 'xxx'
# print(tagged_data[772])
# similar_doc = model.docvecs.most_similar('772', topn = 15)

# similar_doc

In [143]:
# index_list = []
# for index, similarity in similar_doc:
#     index = int(index)
#     index_list.append(index)
# df_compare_electronics.iloc[index_list]
#cluster: 47566,39040,197396,1850486,266916

In [None]:
#get cluster ids and with that indices of top products to use model
top_clusters_list = [1524820,47566,6076,6505,6690,9046,14418,28307,33570,39040,51314,99153,59744,153965,215254,685416, 740038, 984421 , 1874288,1808651,2887810,4044656,5327857, 22374889,34506065,47841827 ]
index_top_clusters_list=[]
for id in top_clusters_list:
    index_top_clusters_list.append(df_compare_electronics[df_compare_electronics['cluster_id']==id].index[0])

In [225]:
# get most similar products for each of the base clusters and save them if they have more than ten tables
electronics_clusters_search=[]
for i in index_top_clusters_list:
    similar_doc = model.docvecs.most_similar(f'{i}', topn = 15)
    electronics_clusters_search.append(int(i))
    for index, similarity in similar_doc:
        if df_compare_electronics.iloc[int(index)]['Amount']>5:
            electronics_clusters_search.append(int(index))
df_electroncis_final = df_compare_electronics.iloc[electronics_clusters_search]

  similar_doc = model.docvecs.most_similar(f'{i}', topn = 15)


Unnamed: 0,cluster_id,url,row_id,table_id,Valid,brand,product_name,product_tokes,Amount
1081,1524820,https://www.sophiesclass.com/product/sony-a7-iii-full-frame-mirrorless-interchangeable-lens-camera-optical-with-3-inch-lcd-black-ilce7m3-b,964,Product_sophiesclass.com_September2020.json.gz,1,sony,sony a7 iii full frame mirrorless interchangeable lens camera optical with 3 inch lcd black ilce7m3 b,"[sony, a7, iii, full, frame, mirrorless, interchangeable, lens, camera, optical, with, 3, inch, lcd, black, ilce7m3, b]",95
26842,3078421,https://cameramall.com/products/alpha-a6100-mirrorless-camera-with-16-50mm-lens-black,302,Product_cameramall.com_September2020.json.gz,1,,alpha a6100 mirrorless camera with 16 50mm lens black,"[alpha, a6100, mirrorless, camera, with, 16, 50mm, lens, black]",14
36212,1112064,https://www.pictureline.com/products/sony-alpha-a6300-mirrorless-digital-camera-with-16-50mm-lens,1703,Product_pictureline.com_September2020.json.gz,1,,sony alpha a6300 mirrorless digital camera with 16 50mm lens,"[sony, alpha, a6300, mirrorless, digital, camera, with, 16, 50mm, lens]",6
15713,3078421,https://camerashopmuskegon.com/products/sony-alpha-a6100-mirrorless-camera-with-16-50mm-lens-black,193,Product_camerashopmuskegon.com_September2020.json.gz,1,,sony alpha a6100 mirrorless camera with 16 50mm lens black,"[sony, alpha, a6100, mirrorless, camera, with, 16, 50mm, lens, black]",14
13716,1112064,https://avcstore.com/products/sony-alpha-a6300-mirrorless-digital-camera-with-16-50mm-lens-black,622,Product_avcstore.com_September2020.json.gz,1,sony,sony alpha a6300 mirrorless digital camera with 16 50mm lens black,"[sony, alpha, a6300, mirrorless, digital, camera, with, 16, 50mm, lens, black]",6
...,...,...,...,...,...,...,...,...,...
58627,589089,https://www.ebuyer.com/911683-xiaomi-mi-smart-band-4-mgw4052gl,3600,Product_ebuyer.com_September2020.json.gz,1,,911683 xiaomi mi smart band 4 mgw4052gl,"[911683, xiaomi, mi, smart, band, 4, mgw4052gl]",9
56743,47841827,https://www.jumaikhemweb.com/product/xiaomi-mi-band-4-waterproof-smart-fitness-tracker,57,Product_jumaikhemweb.com_September2020.json.gz,1,xiaomi,xiaomi mi band 4 waterproof smart fitness tracker,"[xiaomi, mi, band, 4, waterproof, smart, fitness, tracker]",18
48614,1089851,https://www.opirata.com/p/bascula-inteligente-xiaomi-mi-smart-scale-2,1421,Product_opirata.com_September2020.json.gz,1,,bascula inteligente xiaomi mi smart scale 2,"[bascula, inteligente, xiaomi, mi, smart, scale, 2]",8
40035,2148094,https://www.madridgadgetstore.com/en/xiaomi/5415000-xiaomi-mi-smart-band-5-activity-bracelet.html,25,Product_madridgadgetstore.com_September2020.json.gz,1,,5415000 xiaomi mi smart band 5 activity bracelet,"[5415000, xiaomi, mi, smart, band, 5, activity, bracelet]",9


In [228]:
df_electroncis_final.drop_duplicates('cluster_id', keep='first').to_excel("Final_Electronics.xlsx")

## Cluster statistics for product category clothes

In [25]:
df_grouped_clothes = df_joined_clothes.groupby('cluster_id').count()

In [65]:
# only look at clusters that have at least one brand associated
df_set_clothes = df_grouped_clothes[df_grouped_clothes['brand']>0].reset_index()[['cluster_id','table_id']].rename(columns={'table_id':'Amount'})

[560,
 1507,
 2420,
 2523,
 2663,
 2883,
 3048,
 3297,
 4231,
 4455,
 4497,
 5310,
 5382,
 5875,
 6340,
 6820,
 6913,
 7410,
 7715,
 7827,
 8176,
 8248,
 8261,
 8465,
 8492,
 8677,
 9667,
 10169,
 10432,
 11188,
 12068,
 12135,
 12615,
 13540,
 13678,
 13754,
 13807,
 13998,
 14463,
 14525,
 14595,
 15556,
 15616,
 15679,
 15696,
 16419,
 16543,
 16573,
 17233,
 17556,
 17624,
 17795,
 18030,
 18040,
 18304,
 18399,
 18661,
 18833,
 19118,
 19580,
 19623,
 20153,
 20224,
 20349,
 20622,
 20673,
 20689,
 21026,
 21775,
 22383,
 22551,
 22679,
 22865,
 23110,
 23617,
 23765,
 23995,
 24165,
 24378,
 24439,
 24894,
 25377,
 25486,
 25929,
 26237,
 26436,
 27007,
 27135,
 27137,
 27326,
 28368,
 28393,
 28605,
 29806,
 30634,
 31018,
 31090,
 31257,
 31305,
 31974,
 32267,
 33027,
 33200,
 33256,
 33559,
 33621,
 34157,
 34168,
 34213,
 34410,
 34845,
 34963,
 35217,
 35345,
 35357,
 35996,
 36121,
 36338,
 36709,
 36914,
 36985,
 37199,
 37402,
 38670,
 39407,
 39582,
 39852,
 40024,
 401

In [27]:
# We discard all clusters with less than 2 entries, cause we cannot match anything there, so 1,6 million clusters remain
df_set_clothes=df_set_clothes[df_set_clothes['Amount']>1]
df_set_clothes

Unnamed: 0,cluster_id,Amount
0,560,2
1,1507,3
2,2420,3
3,2523,2
4,2663,3
...,...,...
105590,80424345,2
105604,80438085,2
105623,80472255,4
105624,80472257,4


In [28]:
df_10_clothes=df_set_clothes[df_set_clothes['Amount']>10]
df_10_clothes

Unnamed: 0,cluster_id,Amount
11,5310,11
183,58043,12
246,79412,15
279,90549,12
319,102437,14
...,...,...
79674,58592784,19
88341,65475235,11
92112,68554513,13
103333,78110534,13


In [29]:
#merge brand name to cluster amount
df_cluster_brand_clothes = df_10_clothes[df_10_clothes['Amount']<400].merge(df_joined_clothes.dropna()[['cluster_id','brand']].drop_duplicates('cluster_id', keep='last'), left_on=['cluster_id'], right_on = ['cluster_id'], how='left')
df_cluster_brand_clothes

Unnamed: 0,cluster_id,Amount,brand
0,5310,11,armani
1,58043,12,burberry
2,79412,15,adidas
3,90549,12,louis vuitton
4,102437,14,chanel
...,...,...,...
100,58592784,19,nike
101,65475235,11,nike
102,68554513,13,nike
103,78110534,13,dolce & gabbana


In [30]:
#get the top clusters per brand
df_top_clusters_clothes = df_cluster_brand_clothes.sort_values(['Amount'], ascending=False).drop_duplicates(subset=["brand"], keep="first")
df_top_clusters_clothes

Unnamed: 0,cluster_id,Amount,brand
84,5672069,289,gucci
27,527917,110,chanel
62,1524820,95,prada
67,2132296,61,the north face
24,434734,53,adidas
71,2434654,46,louis vuitton
83,3324035,40,oakley
98,58083280,24,nike
78,2825229,24,calvin klein
47,944406,18,dolce & gabbana


In [31]:
df_joined_clothes[(df_joined_clothes['cluster_id']==22374915)]

Unnamed: 0,cluster_id,url,row_id,table_id,Valid,brand
1126480,22374915,https://06.aplikasionlineshop.com/index.php/product/unero-military-classical-backpack,14,Product_aplikasionlineshop.com_September2020.json.gz,1,
2128782,22374915,https://pensalo.com/product/unero-military-classical-backpack,16,Product_pensalo.com_September2020.json.gz,1,
5574153,22374915,https://nelau.com/?product=unero-military-classical-backpack,14,Product_nelau.com_September2020.json.gz,1,
5848416,22374915,https://pcmundo.com.mx/producto/panasonic-invertr-900l-refrigerator,70,Product_pcmundo.com.mx_September2020.json.gz,1,
6191685,22374915,https://ebonth.com/product/unero-military-classical-backpack,24,Product_ebonth.com_September2020.json.gz,1,
6994438,22374915,https://winnershopbd.com/product/unero-military-classical-backpack,22,Product_winnershopbd.com_September2020.json.gz,1,
8350514,22374915,https://ikotaonline.com/product/unero-military-classical-backpack,15,Product_ikotaonline.com_September2020.json.gz,1,
8765168,22374915,https://grixbase.com/product/graco-slim-snacker-high-chair-whisk,26,Product_grixbase.com_September2020.json.gz,1,
9447506,22374915,https://www.digitalshopup.com/product/unero-military-classical-backpack,6,Product_digitalshopup.com_September2020.json.gz,1,
10271721,22374915,https://ktrworld.com/product/unero-military-classical-backpack,78,Product_ktrworld.com_September2020.json.gz,1,


In [33]:
#get a new column with the product names from the url
df_joined_clothes['product_name'] = df_joined_clothes['url'].apply(lambda row: row.split("/")[-1])

In [34]:
df_joined_clothes

Unnamed: 0,cluster_id,url,row_id,table_id,Valid,brand,product_name
0,59178314,https://www.cultureindoor.com/796-irrigation-arrosage-tuyau-pe-20mm-semi-rigide-rouleau-100m.html,1561,Product_cultureindoor.com_September2020.json.gz,1,,796-irrigation-arrosage-tuyau-pe-20mm-semi-rigide-rouleau-100m.html
1,70692685,https://www.culturekings.com/products/new-era-new-era-new-orleans-pelicans-nba-940-a-frame-snapback-black,3893,Product_culturekings.com_September2020.json.gz,1,,new-era-new-era-new-orleans-pelicans-nba-940-a-frame-snapback-black
2,395606,https://www.culturekings.com/products/new-era-x-nrl-west-tigers-aframe-charcoal-orange,3894,Product_culturekings.com_September2020.json.gz,1,,new-era-x-nrl-west-tigers-aframe-charcoal-orange
3,3361797,https://www.customizedgirl.com/design/2605608/The+Real+Baseball+Moms,9318,Product_customizedgirl.com_September2020.json.gz,1,,The+Real+Baseball+Moms
4,235755,https://www.customizedgirl.com/design/2613423/Mentally+Dating+Ronaldo,9319,Product_customizedgirl.com_September2020.json.gz,1,,Mentally+Dating+Ronaldo
...,...,...,...,...,...,...,...
24687502,8653717,https://shop-list.com/women/utuwa-tatara/sg3232-03,1620,Product_shop-list.com_September2020.json.gz,1,,sg3232-03
24687503,22258637,https://shop-list.com/women/youmotto/famille-ltm,1621,Product_shop-list.com_September2020.json.gz,1,,famille-ltm
24687504,41434658,https://shop.agwaycapecod.com/products/019014710959/adult-small-breed-dog-food-5-lb,126,Product_agwaycapecod.com_September2020.json.gz,1,,adult-small-breed-dog-food-5-lb
24687505,62702145,https://shop.agwaycapecod.com/products/037321455042/ant-killer-dust-3-lbs,127,Product_agwaycapecod.com_September2020.json.gz,1,,ant-killer-dust-3-lbs


In [233]:
#get a new column with the product names from the url
df_joined_clothes['product_name'] = df_joined_clothes['url'].apply(lambda row: row.split("/")[-1])
#clean product column and lowercase
df_joined_clothes['product_name'] = df_joined_clothes['product_name'].str.replace('+',' ').str.replace('-',' ').str.replace('.html', '').apply(lambda row: row.lower())
df_joined_clothes
#get only cluster ids with at least one brand electronics
df_compare_clothes = df_joined_clothes[df_joined_clothes['cluster_id'].isin(df_set_clothes['cluster_id'].tolist())]
#merge with set to get amount of tables per cluster in overview
df_compare_clothes = df_compare_clothes.merge(df_set_clothes, left_on=['cluster_id'], right_on = ['cluster_id'], how='left')

  df_joined_clothes['product_name'] = df_joined_clothes['product_name'].str.replace('+',' ').str.replace('-',' ').str.replace('.html', '').apply(lambda row: row.lower())
  df_joined_clothes['product_name'] = df_joined_clothes['product_name'].str.replace('+',' ').str.replace('-',' ').str.replace('.html', '').apply(lambda row: row.lower())


In [234]:
#use tokenizer for product names to get tokes for training the model
df_compare_clothes['product_tokes'] = df_compare_clothes['product_name'].apply(lambda row: word_tokenize(row))
#get tagged words
tagged_data = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(df_compare_clothes['product_tokes'])]
# build model and vocabulary
model = Doc2Vec(vector_size=50, min_count = 5, epochs = 25, dm = 0)
model.build_vocab(tagged_data)
# Train model
model.train(tagged_data, total_examples=model.corpus_count, epochs=25)

In [301]:
df_joined_clothes[df_joined_clothes['cluster_id']==78110534]

Unnamed: 0,cluster_id,url,row_id,table_id,Valid,brand,product_name
15925,78110534,https://www.cn.forzieri.com/chn/product_view.asp?l=chn&c=chn&dept_id=18&sku=dg130320-027-00,3400,Product_forzieri.com_September2020.json.gz,1,,product_view.asp?l=chn&c=chn&dept_id=18&sku=dg130320 027 00
345143,78110534,https://www.be.forzieri.com/sacs-a-main/dolce-gabbana/dg130320-027-00,462,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,dg130320 027 00
873827,78110534,https://www.kr.forzieri.com/kor/product_view.asp?l=kor&c=kor&dept_id=18&sku=dg130320-027-00,2191,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,product_view.asp?l=kor&c=kor&dept_id=18&sku=dg130320 027 00
3926168,78110534,https://www.se.forzieri.com/handbags/dolce-gabbana/dg130320-027-00,1686,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,dg130320 027 00
4659026,78110534,https://www.eu.forzieri.com/handbags/dolce-gabbana/dg130320-027-00,445,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,dg130320 027 00
7183796,78110534,https://www.uk.forzieri.com/handbags/dolce-gabbana/dg130320-027-00,2108,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,dg130320 027 00
17638900,78110534,https://www.ch.forzieri.com/handtaschen/dolce-gabbana/dg130320-027-00,8,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,dg130320 027 00
19486553,78110534,https://www.ca.forzieri.com/handbags/dolce-gabbana/dg130320-027-00,235,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,dg130320 027 00
19839038,78110534,https://www.no.forzieri.com/handbags/dolce-gabbana/dg130320-027-00,3010,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,dg130320 027 00
20277399,78110534,https://www.sa.forzieri.com/esp/product_view.asp?l=esp&c=usa&dept_id=18&sku=dg130320-027-00,1076,Product_forzieri.com_September2020.json.gz,1,dolce & gabbana,product_view.asp?l=esp&c=usa&dept_id=18&sku=dg130320 027 00


In [302]:
#get cluster ids and with that indices of top products to use model
top_clusters_list = [5310, 58043,104343,142594,174327, 186753,421372,677207,834201, 881202,  895708,939889, 1249086,1290229, 1852022,2459966, 2732926 , 22374915, 22374918, 26097914,44159446, 58592784, 78110534   ]
index_top_clusters_list=[]
for id in top_clusters_list:
    index_top_clusters_list.append(df_compare_clothes[df_compare_clothes['cluster_id']==id].index[0])

In [309]:
# get most similar products for each of the base clusters and save them if they have more than ten tables
clothes_clusters_search=[]
for i in index_top_clusters_list:
    similar_doc = model.docvecs.most_similar(f'{i}', topn = 15)
    clothes_clusters_search.append(int(i))
    for index, similarity in similar_doc:
        if df_compare_clothes.iloc[int(index)]['Amount']>5:
            clothes_clusters_search.append(int(index))
df_clothes_final = df_compare_clothes.iloc[clothes_clusters_search]

  similar_doc = model.docvecs.most_similar(f'{i}', topn = 15)


In [310]:
df_clothes_final.drop_duplicates('cluster_id', keep='first').to_excel("Final_Clothes.xlsx")