# This notebook is used to to preprocess the final dataset categories, in the first part the logic was created on two categories (electronics and clothes). While the later part shows how the logic was used for the final categories.

In [1]:
import os
import pandas as pd
import progressbar
from nltk.corpus import stopwords
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
pd.set_option('display.max_colwidth', None)

In [2]:
#function to remove words from an input list in a token_vector
def remove_stopwords(token_vector, stopwords_list):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in stopwords_list])

In [3]:
#function to remove punctuation in a token_vector
def remove_punctuation(token_vector):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in string.punctuation])

In [4]:
#function to compute jaccard similarity
def jaccard_similarity_score(original, translation):
    intersect = set(original).intersection(set(translation))
    union = set(original).union(set(translation))
    try:
        return len(intersect) / len(union)
    except ZeroDivisionError:
        return 0

## Get the paths and build table names for iteration

In [5]:
data_path = '../src/data'
mapping_corpus_path = data_path + r'/product/lspc2020_to_tablecorpus'
mapping_corpus_path_2 = data_path + r'/product/lspc2020_to_tablecorpus/Cleaned'
table_corpus_path = data_path + r'/product/product_top100/cleaned'
table_corpus_path_with_id = data_path + r'/product/product_top100/cleaned/with_id'
table_corpus_path2 = data_path + r'/product/product_minimum3/cleaned/with_id'

In [6]:
zip_files_mapping = [file for file in os.listdir(mapping_corpus_path_2) if file.endswith('.json.gz')]
zip_files_tables = [file for file in os.listdir(table_corpus_path) if file.endswith('.json.gz')]

## Match the number dictionaries with the information about the brand


In [2]:
#get whole concatenated table
df_large = pd.read_json(os.path.join(mapping_corpus_path_2, 'df_large_matched.json'), compression='gzip', orient='records', lines=True)

In [22]:
#get information about clusters with clothes
df_joined_clothes = pd.read_csv(os.path.join(mapping_corpus_path_2, 'clothes_clusters_all_8_tables_post_processed.csv'))

In [8]:
#get information about clusters with clothes
df_joined_electronics = pd.read_csv(os.path.join(mapping_corpus_path_2, 'electronics_clusters_all_10_tables_post_processed.csv'))

## Get information about electronic clusters

In [11]:
df_grouped_electronics = df_joined_electronics.groupby('cluster_id').count()
# only look at clusters that have at least one brand associated
df_set_electronics = df_grouped_electronics.reset_index()[['cluster_id','table_id']].rename(columns={'table_id':'Amount'})

In [12]:
df_10_electronics=df_set_electronics[df_set_electronics['Amount']>10]

In [13]:
df_10_electronics

Unnamed: 0,cluster_id,Amount
2,3668,11
3,6076,14
4,6443,13
5,6505,13
6,6690,17
...,...,...
615,52362093,20
616,53457772,61
617,64651308,11
618,66614988,12


In [15]:
# We discard all clusters with less than 2 entries, cause we cannot match anything there
df_set_electronics=df_set_electronics[df_set_electronics['Amount']>1]
df_15_electronics=df_set_electronics[df_set_electronics['Amount']>12]
df_15_electronics

Unnamed: 0,cluster_id,Amount
3,6076,14
4,6443,13
5,6505,13
6,6690,17
8,7366,13
...,...,...
611,48051979,13
613,48051984,13
615,52362093,20
616,53457772,61


In [None]:
#merge brand name to cluster amount
df_cluster_brand = df_15_electronics[df_15_electronics['Amount']<200].merge(df_joined_electronics.dropna()[['cluster_id','brand_y']].drop_duplicates('cluster_id', keep='last'), left_on=['cluster_id'], right_on = ['cluster_id'], how='left')
df_cluster_brand

In [16]:
#clean product column and lowercase
df_joined_electronics=df_joined_electronics.dropna(subset = ['name'])
df_joined_electronics['name'] = df_joined_electronics['name'].apply(lambda row: row.lower())
#get only cluster ids with at least one brand electronics
df_compare_electronics = df_joined_electronics[df_joined_electronics['cluster_id'].isin(df_set_electronics['cluster_id'].tolist())]
#merge with set to get amount of tables per cluster in overview
df_compare_electronics = df_compare_electronics.merge(df_set_electronics, left_on=['cluster_id'], right_on = ['cluster_id'], how='left')

In [17]:
#use tokenizer for product names to get tokes for training the model and clean the column
df_compare_electronics['product_tokes'] = df_compare_electronics['name'].apply(lambda row: word_tokenize(row))
df_compare_electronics['product_tokes'] = remove_stopwords(df_compare_electronics['product_tokes'],stopwords.words())
df_compare_electronics['product_tokes'] = remove_punctuation (df_compare_electronics['product_tokes'])
#get tagged words
tagged_data = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(df_compare_electronics['product_tokes'])]
# build model and vocabulary
model = Doc2Vec(vector_size=50, min_count = 5, epochs = 25, dm = 0)
model.build_vocab(tagged_data)
# Train model
model.train(tagged_data, total_examples=model.corpus_count, epochs=25)

In [26]:
#get cluster ids for basline products and with that indices of top products to use model
top_clusters_list = df_15_electronics['cluster_id'].tolist()
index_top_clusters_list=[]
for id in top_clusters_list:
    index_top_clusters_list.append(df_compare_electronics[df_compare_electronics['cluster_id']==id].index[0])

In [28]:
# get most similar products for each of the base clusters and save them if they have more than 5 tables
electronics_clusters_search=[]
for i in index_top_clusters_list:
    similar_doc = model.docvecs.most_similar(f'{i}', topn = 20)
    electronics_clusters_search.append(int(i))
    for index, similarity in similar_doc:
        if df_compare_electronics.iloc[int(index)]['Amount']>7:
            electronics_clusters_search.append(int(index))
    jaccard_score = df_compare_electronics['product_tokes'].apply(lambda row: jaccard_similarity_score(row,df_compare_electronics.iloc[int(i)]['product_tokes']) )
    indizes=sorted(range(len(jaccard_score)), key=lambda i: jaccard_score[i])[-20:]
    for index in indizes:
         if df_compare_electronics.iloc[int(index)]['Amount']>7:
            electronics_clusters_search.append(int(index))    
df_electroncis_final = df_compare_electronics.iloc[electronics_clusters_search]

In [29]:
df_electroncis_final.drop_duplicates('cluster_id', keep='first').to_excel("Final_Electronics_v4.xlsx")

## Cluster statistics for product category clothes

In [24]:
df_grouped_clothes = df_joined_clothes.groupby('cluster_id').count()

In [25]:
# only look at clusters that have at least one brand associated
df_set_clothes = df_grouped_clothes[df_grouped_clothes['brand_y']>0].reset_index()[['cluster_id','table_id']].rename(columns={'table_id':'Amount'})

In [26]:
# We discard all clusters with less than 2 entries, cause we cannot match anything there
df_set_clothes=df_set_clothes[df_set_clothes['Amount']>1]
df_set_clothes

Unnamed: 0,cluster_id,Amount
0,16573,6
1,18496,8
2,37366,3
3,37525,9
4,42322,8
...,...,...
282,77602818,15
283,78110534,12
284,78499693,15
285,78736275,8


In [27]:
df_8_clothes=df_set_clothes[df_set_clothes['Amount']>7]
df_8_clothes

Unnamed: 0,cluster_id,Amount
3,37525,9
5,58043,9
6,70109,10
12,127282,9
15,142594,9
...,...,...
281,77104640,9
282,77602818,15
283,78110534,12
284,78499693,15


In [31]:
df_10_clothes=df_set_clothes[df_set_clothes['Amount']>10]
df_10_clothes

Unnamed: 0,cluster_id,Amount
16,148199,20
17,150322,11
20,186753,12
24,200956,15
34,308891,13
38,363117,13
39,371522,11
40,390077,13
58,570162,11
66,668889,12


In [19]:
#merge brand name to cluster amount
df_cluster_brand_clothes = df_10_clothes[df_10_clothes['Amount']<400].merge(df_joined_clothes.dropna()[['cluster_id','brand_y']].drop_duplicates('cluster_id', keep='last'), left_on=['cluster_id'], right_on = ['cluster_id'], how='left')
df_cluster_brand_clothes

Unnamed: 0,cluster_id,Amount,brand_y
0,5310,11,armani
1,37366,14,tag heuer
2,37525,12,coach
3,42322,14,tag heuer
4,58043,12,
...,...,...,...
262,68554513,13,
263,77602818,16,valentino
264,78110534,13,dolce & gabbana
265,78499693,16,dolce & gabbana


In [28]:
df_joined_clothes=df_joined_clothes.dropna(subset = ['name'])
#clean product column and lowercase
df_joined_clothes['name'] = df_joined_clothes['name'].apply(lambda row: row.lower())
df_joined_clothes
#get only cluster ids with at least one brand electronics
df_compare_clothes = df_joined_clothes[df_joined_clothes['cluster_id'].isin(df_set_clothes['cluster_id'].tolist())]
#merge with set to get amount of tables per cluster in overview
df_compare_clothes = df_compare_clothes.merge(df_set_clothes, left_on=['cluster_id'], right_on = ['cluster_id'], how='left')

In [29]:
#use tokenizer for product names to get tokes for training the model
df_compare_clothes['product_tokes'] = df_compare_clothes['name'].apply(lambda row: word_tokenize(row))
df_compare_clothes['product_tokes'] = remove_stopwords(df_compare_clothes['product_tokes'],stopwords.words())
df_compare_clothes['product_tokes'] = remove_punctuation (df_compare_clothes['product_tokes'])
#get tagged words
tagged_data = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(df_compare_clothes['product_tokes'])]
# build model and vocabulary
model = Doc2Vec(vector_size=50, min_count = 5, epochs = 25, dm = 0)
model.build_vocab(tagged_data)
# Train model
model.train(tagged_data, total_examples=model.corpus_count, epochs=25)

In [32]:
#get cluster ids and with that indices of top products to use model
top_clusters_list = df_10_clothes['cluster_id'].tolist()
index_top_clusters_list=[]
for id in top_clusters_list:
    index_top_clusters_list.append(df_compare_clothes[df_compare_clothes['cluster_id']==id].index[0])

In [35]:
# get most similar products for each of the base clusters and save them if they have more than 5 tables
clothes_clusters_search=[]
for i in index_top_clusters_list:
    similar_doc = model.docvecs.most_similar(f'{i}', topn = 20)
    clothes_clusters_search.append(int(i))
    for index, similarity in similar_doc:
        if df_compare_clothes.iloc[int(index)]['Amount']>8:
            clothes_clusters_search.append(int(index))
    jaccard_score = df_compare_clothes['product_tokes'].apply(lambda row: jaccard_similarity_score(row,df_compare_clothes.iloc[int(i)]['product_tokes']) )
    indizes=sorted(range(len(jaccard_score)), key=lambda i: jaccard_score[i])[-20:]
    for index in indizes:
         if df_compare_clothes.iloc[int(index)]['Amount']>8:
            clothes_clusters_search.append(int(index))    
df_clothes_final = df_compare_clothes.iloc[clothes_clusters_search]

In [36]:
df_clothes_final.drop_duplicates('cluster_id', keep='first').to_excel("Final_Clothes_v4.xlsx")

# The above provided logic will be used on all final categories

In [54]:
df_joined_clothes = pd.read_csv(os.path.join(mapping_corpus_path_2, 'Clothes_clusters_all_8_tables_post_processed_lower_threshold.csv'))
df_joined_clothes['category']='clothes'
df_joined_electronics = pd.read_csv(os.path.join(mapping_corpus_path_2, 'Electronics_clusters_all_8_tables_post_processed_lower_threshold.csv'))
df_joined_electronics['category']='electronics'
df_joined_bikes = pd.read_csv(os.path.join(mapping_corpus_path_2, 'Bikes_clusters_all_8_tables_post_processed_lower_threshold.csv'))
df_joined_bikes['category']='bikes'
df_joined_cars = pd.read_csv(os.path.join(mapping_corpus_path_2, 'Cars_clusters_all_8_tables_post_processed_lower_threshold.csv'))
df_joined_cars['category']='cars'
df_joined_technology = pd.read_csv(os.path.join(mapping_corpus_path_2, 'Technology_clusters_all_8_tables_post_processed_lower_threshold.csv'))
df_joined_technology['category']='technology'
df_joined_drugstore = pd.read_csv(os.path.join(mapping_corpus_path_2, 'Drugstore_clusters_all_8_tables_post_processed_lower_threshold.csv'))
df_joined_drugstore['category']='drugstore'
df_joined_tools = pd.read_csv(os.path.join(mapping_corpus_path_2, 'Tools_clusters_all_8_tables_post_processed_lower_threshold.csv'))
df_joined_tools['category']='tools'
df_joined_random = pd.read_csv(os.path.join(mapping_corpus_path_2, 'Random_clusters_all_8_tables_post_processed_lower_threshold.csv'))
df_joined_random['category']='random'

In [62]:
#concatenate all information
frames = [df_joined_electronics, df_joined_clothes,df_joined_bikes,df_joined_cars,df_joined_technology,df_joined_drugstore,df_joined_tools,df_joined_random]
df_concat = pd.concat(frames).drop(columns = ['Unnamed: 0','Unnamed: 0.1','Valid'])
df_concat

Unnamed: 0,cluster_id,url,row_id,table_id,name,description,brand_x,brand_y,tokens,category
0,2456244,https://shop-stjohns.simplyforlife.com/quest-chips-cheddar-sour-cream-32g.html,1958,Product_simplyforlife.com_September2020.json.gz,"quest nutrition quest - chips, cheddar & sour cream (32g)",,Quest Nutrition,,"['quest', 'nutrition', 'quest', 'chips', 'cheddar', 'sour', 'cream', '32g']",electronics
1,9885,https://www.tecnoprecios.net/t%C3%B3ner-original/497-toner-canon-723-amarillo-8500-paginas.html,607,Product_tecnoprecios.net_September2020.json.gz,toner canon 723 amarillo 8500 páginas,TONER CANON 723 AMARILLO 8500 PáGINAS LBP7750CDN,,,"['toner', 'canon', '723', 'amarillo', '8500', 'páginas']",electronics
2,16617,https://simplifinetworks.com/v2/point-to-multi-point/266-nanobeam-ac-gen2.html,167,Product_simplifinetworks.com_September2020.json.gz,nanobeam-ac-gen2,NBE-5AC-Gen2The sleek NanoBeam design with proprietary airMAX ac chipset and dedicated management Wi-Fi for easy U Mobile app support and fast setup. This airMAX ac CPE pivots on a ball joint for mounting and alignment flexibility.DATA SHEET,,,['nanobeam-ac-gen2'],electronics
3,323989,https://store.macs-4-u.com.au/usb-c-charge-cable-1-m.html,192,Product_macs-4-u.com.au_September2020.json.gz,usb-c charge cable (1 m),USB-C Charge Cable (1 m),,,"['usb-c', 'charge', 'cable', '1']",electronics
4,210849,https://www.cingolanibikeshop.com/specialized-sella-power-comp-nero-143.html,447,Product_cingolanibikeshop.com_September2020.json.gz,specialized sella power comp nero 143,SELLA POWER COMP NERO 143,Specialized,,"['specialized', 'sella', 'power', 'comp', 'nero', '143']",electronics
...,...,...,...,...,...,...,...,...,...,...
21663,35840949,https://www.taffscarpets.com/american-olean-quarry-tile-reds--oranges-canyon-red-2-tile-flooring-0q0122chipa,47,Product_taffscarpets.com_September2020.json.gz,quarry tile canyon red q01 1,"Taff's Carpets in Kearney has a top selection of American Olean Ceramic & Porcelain Tile Flooring, including Quarry Tile Canyon Red in 2\"" x 2\""",American Olean,american olean,"['quarry', 'canyon', 'red', 'q01', '1']",random
21664,36306411,https://www.mainstreetfloorcoveringvermont.com/american-olean-mirasol-whites--creams-silver-marble-12-tile-flooring-ml7212121p,35,Product_mainstreetfloorcoveringvermont.com_September2020.json.gz,mirasol silver marble ml72,"Main Street Floor Covering in Essex Junction has a top selection of American Olean Ceramic & Porcelain Tile Flooring, including Mirasol™ Silver Marble in 12\"" x 12\""",American Olean,american olean,"['mirasol', 'silver', 'marble', 'ml72']",random
21665,62520180,https://www.thefloordrs.com/american-olean-quarry-naturals-beiges--browns-desert--2-tile-flooring-0n0322chipa,59,Product_thefloordrs.com_September2020.json.gz,quarry naturals desertn03 1,"The Floor Doctors in Des Moines has a top selection of American Olean Ceramic & Porcelain Tile Flooring, including Quarry Naturals® Desert * in 2\"" x 2\""",American Olean,american olean,"['quarry', 'naturals', 'desertn03', '1']",random
21666,56187443,https://www.amitybicycles.com/product/garmin-edge-510-silicone-case-198697-1.htm,353,Product_amitybicycles.com_September2020.json.gz,edge 510 silicone case,"Protect your Garmin Edge 510 with a form-fitting, silicone case and keep your mind at ease. Choose from a variety of cool colors to match your personal style.",,,"['edge', '510', 'silicone', 'case']",random


In [63]:
df_grouped= df_concat.groupby('cluster_id').count()
# only look at clusters that have at least one brand associated
df_set = df_grouped.reset_index()[['cluster_id','table_id']].rename(columns={'table_id':'Amount'})

In [64]:
#get onl info
df_set=df_set[df_set['Amount']>7]

In [65]:
df_10=df_set[df_set['Amount']>15]

In [66]:
df_10

Unnamed: 0,cluster_id,Amount
11,6076,18
12,6443,21
15,6690,19
25,9046,19
32,12841,23
...,...,...
3120,77483648,500
3126,78260404,78
3135,79641193,17
3136,79862908,22


In [67]:
#clean product column and lowercase
df_concat=df_concat.dropna(subset = ['name'])
df_concat['name'] = df_concat['name'].apply(lambda row: row.lower())
df_concat
#get only cluster ids with at least one brand electronics
df_compare = df_concat[df_concat['cluster_id'].isin(df_set['cluster_id'].tolist())]
#merge with set to get amount of tables per cluster in overview
df_compare = df_compare.merge(df_set, left_on=['cluster_id'], right_on = ['cluster_id'], how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_concat['name'] = df_concat['name'].apply(lambda row: row.lower())


In [68]:
df_compare

Unnamed: 0,cluster_id,url,row_id,table_id,name,description,brand_x,brand_y,tokens,category,Amount
0,2456244,https://shop-stjohns.simplyforlife.com/quest-chips-cheddar-sour-cream-32g.html,1958,Product_simplyforlife.com_September2020.json.gz,"quest nutrition quest - chips, cheddar & sour cream (32g)",,Quest Nutrition,,"['quest', 'nutrition', 'quest', 'chips', 'cheddar', 'sour', 'cream', '32g']",electronics,23
1,16617,https://simplifinetworks.com/v2/point-to-multi-point/266-nanobeam-ac-gen2.html,167,Product_simplifinetworks.com_September2020.json.gz,nanobeam-ac-gen2,NBE-5AC-Gen2The sleek NanoBeam design with proprietary airMAX ac chipset and dedicated management Wi-Fi for easy U Mobile app support and fast setup. This airMAX ac CPE pivots on a ball joint for mounting and alignment flexibility.DATA SHEET,,,['nanobeam-ac-gen2'],electronics,18
2,323989,https://store.macs-4-u.com.au/usb-c-charge-cable-1-m.html,192,Product_macs-4-u.com.au_September2020.json.gz,usb-c charge cable (1 m),USB-C Charge Cable (1 m),,,"['usb-c', 'charge', 'cable', '1']",electronics,12
3,345616,https://www.scan.co.uk/products/6tb-wd-red-pro-wd6003ffbx-35-nas-hdd-sata-iii-6gb-s-7200rpm-256mb-cache-shock-sensor-ncq-oem,4945,Product_scan.co.uk_September2020.json.gz,"wd red pro 6tb 3.5\"" sata nas hdd/hard drive","6TB WD Red Pro WD6003FFBX, 3.5\"" NAS HDD, SATA III 6Gb/s, 7200rpm, 256MB Cache, Shock Sensor, NCQ, OEM",WD,,"['wd', 'red', 'pro', '6tb', '3.5\\', ""''"", 'sata', 'hdd/hard', 'drive']",electronics,13
4,213496,https://de.camerok.com/Sony_SEL30M35_camera_lense,644,Product_camerok.com_September2020.json.gz,sony sel30m35 camera lense,"Dieses Objektiv wurde entwickelt, um bieten vielseitige Hochleistungs-makro-Funktionen in einem kompakten, leichten Körper. Es ist eine echte 1:1 makro-Objektiv mit einer erstaunlichen Naheinstellgrenze von nur knapp 1 Zoll.Die SEL30M35 wurde entwickelt, um bieten vielseitige Hochleistungs-makro-Funktionen in einem Objektiv ist kompakt, leicht und einfach zu bedienen. Es ist eine echte 1:1 makro-Objektiv mit 2,4 Zentimeter minimale Arbeitsabstand ermöglicht winzigen Themen und details zu erbringenden mit hervorragender Auflösung und Kontrast.1:1 AbbildungsmaßstabDie SEL30M35 hat eine 1:1-Abbildungsmaßstab so, dass die Probanden erfasst werden können, werden in voller Größe auf dem Bildsensor der Kamera. Dies macht es einfach, um zu Schießen Qualität makro-Nahaufnahmen von Blumen, Insekten und kleine Objekte.Asphärische/ED-Glas-ElementeDie SEL30M35 bietet eine hervorragende Bildqualität über den gesamten Bildbereich, mit hohem Kontrast und minimale chromatische aberration von unendlich bis 1:1 Vergrößerung bei allen Blendeneinstellungen. ED-Glas verwendet wird, für die G4-element, minimieren die chromatische aberration und die Farbe Blutungen sowie multi-Beschichtung während für besonders saubere, klare rendering.Gemacht für FilmeDie SEL30M35 ebenso führt auch bei Videoaufnahmen. Dank eines internen Schrittmotor und hinten-mit Schwerpunkt design ist optimiert für video-performance für Laufruhe Grundvoraussetzung für hochwertige Film zu erfassen. Darüber hinaus werden die filter Halterung nicht drehen, während der Schwerpunkt für die problemlose polarisierten filter verwenden.Glatte Manuelle FokussierungEin easy-grip-Fokus-ring auf dem Objektivtubus gibt Ihnen eine Feste \""hands-on\"" fühlen sich in der manuellen Fokussierung. Zusätzlich, Direct Manual Focus (DMF) können Sie den Fokus manuell nach AF lock-on, ohne die Notwendigkeit für einen Wechsel. Dies kann sehr nützlich in Situationen, in denen der AF nicht sperren auf das Thema genau, oder wenn AF Schlösser auf einen Punkt, der nicht mit Ihrer Absicht. Elektronische manueller Fokus-Technologie wurde ursprünglich entwickelt für den professionellen Sony Camcorder, um glatte, präzise Fokussierung.",Sony,,"['sony', 'sel30m35', 'camera', 'lense']",electronics,11
...,...,...,...,...,...,...,...,...,...,...,...
57772,617074,https://www.giantrenosparks.com/product/garmin-forerunner-245-365508-1.htm,128,Product_giantrenosparks.com_September2020.json.gz,forerunner 245,"You do the running. Forerunner 245 does the thinking. This GPS smartwatch doesn’t just tell you where you ran. It tracks your stats, crunches the numbers and gets to know all about your...",,,"['forerunner', '245']",random,110
57773,35840949,https://www.taffscarpets.com/american-olean-quarry-tile-reds--oranges-canyon-red-2-tile-flooring-0q0122chipa,47,Product_taffscarpets.com_September2020.json.gz,quarry tile canyon red q01 1,"Taff's Carpets in Kearney has a top selection of American Olean Ceramic & Porcelain Tile Flooring, including Quarry Tile Canyon Red in 2\"" x 2\""",American Olean,american olean,"['quarry', 'canyon', 'red', 'q01', '1']",random,10
57774,36306411,https://www.mainstreetfloorcoveringvermont.com/american-olean-mirasol-whites--creams-silver-marble-12-tile-flooring-ml7212121p,35,Product_mainstreetfloorcoveringvermont.com_September2020.json.gz,mirasol silver marble ml72,"Main Street Floor Covering in Essex Junction has a top selection of American Olean Ceramic & Porcelain Tile Flooring, including Mirasol™ Silver Marble in 12\"" x 12\""",American Olean,american olean,"['mirasol', 'silver', 'marble', 'ml72']",random,22
57775,56187443,https://www.amitybicycles.com/product/garmin-edge-510-silicone-case-198697-1.htm,353,Product_amitybicycles.com_September2020.json.gz,edge 510 silicone case,"Protect your Garmin Edge 510 with a form-fitting, silicone case and keep your mind at ease. Choose from a variety of cool colors to match your personal style.",,,"['edge', '510', 'silicone', 'case']",random,28


In [69]:
#use tokenizer for product names to get tokes for training the model
df_compare['product_tokes'] = df_compare['name'].apply(lambda row: word_tokenize(row))
df_compare['product_tokes'] = remove_stopwords(df_compare['product_tokes'],stopwords.words())
df_compare['product_tokes'] = remove_punctuation (df_compare['product_tokes'])
#get tagged words
tagged_data = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(df_compare['product_tokes'])]
# build model and vocabulary
model = Doc2Vec(vector_size=50, min_count = 5, epochs = 25, dm = 0)
model.build_vocab(tagged_data)
# Train model
model.train(tagged_data, total_examples=model.corpus_count, epochs=25)

In [70]:
#get cluster ids for basline products and with that indices of top products to use model
top_clusters_list = df_10['cluster_id'].tolist()
index_top_clusters_list=[]
for id in top_clusters_list:
    index_top_clusters_list.append(df_compare[df_compare['cluster_id']==id].index[0])

In [3]:
# get most similar products for each of the base clusters and save them if they have more than 7 tables
count = 0
clusters_search=[]
with progressbar.ProgressBar(max_value=len(index_top_clusters_list)) as bar:
    for i in index_top_clusters_list:
        similar_doc = model.docvecs.most_similar(f'{i}', topn = 20)
        clusters_search.append(int(i))
        for index, similarity in similar_doc:
            if df_compare.iloc[int(index)]['Amount']>7:
                if df_compare.iloc[int(index)]['category']==df_compare.iloc[int(i)]['category']:
                    clusters_search.append(int(index))
        jaccard_score = df_compare['product_tokes'].apply(lambda row: jaccard_similarity_score(row,df_compare.iloc[int(i)]['product_tokes']) )
        indizes=sorted(range(len(jaccard_score)), key=lambda i: jaccard_score[i])[-20:]
        for index in indizes:
            if df_compare.iloc[int(index)]['Amount']>7:
                if df_compare.iloc[int(index)]['category']==df_compare.iloc[int(i)]['category']:
                    clusters_search.append(int(index)) 
        count=count+1
        bar.update(count)
df_final = df_compare.iloc[clusters_search]

In [None]:
df_final.drop_duplicates('cluster_id', keep='first').to_excel("Final_lower threshold.xlsx")