In [48]:
import pandas as pd
import json 
import gzip
import os

import nltk
from nltk.corpus import stopwords
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [49]:
data_path = '../src/data'
mapping_corpus_path_2 = data_path + r'/product/lspc2020_to_tablecorpus/Cleaned'
notebook_path = '../notebooks'
product_path = os.path.join(data_path, 'product')

In [50]:
cluster_df = pd.read_json(os.path.join(mapping_corpus_path_2, 'df_large_matched.json'), compression='gzip', orient='records', lines=True)

In [51]:
electronics_final_entities_df = pd.read_csv(os.path.join(notebook_path, 'electronics10.csv'), index_col=None)
electronics_final_entities_list = electronics_final_entities_df['cluster_id']

In [52]:
clothes_final_entities_df = pd.read_csv(os.path.join(notebook_path, 'clothes8.csv'), index_col=None)
clothes_final_entities_list = clothes_final_entities_df['cluster_id']

In [118]:
electronics_clusters_all_15_df = cluster_df[cluster_df['cluster_id'].isin(electronics_final_entities_list)]
clothes_clusters_all_10_df = cluster_df[cluster_df['cluster_id'].isin(clothes_final_entities_list)]

electronics_clusters_all_15_df.to_csv(os.path.join(mapping_corpus_path_2, 'electronics_clusters_all_10_tables.csv'), columns=None)
clothes_clusters_all_10_df.to_csv(os.path.join(mapping_corpus_path_2, 'clothes_clusters_all_8_tables.csv'), columns=None)

# Run from here

In [85]:
def remove_stopwords(token_vector, stopwords_list):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in stopwords_list])

def remove_punctuation(token_vector):
    return token_vector.apply(lambda token_list: [word for word in token_list if word not in string.punctuation])

def jaccard_similarity_score(original, translation):
    intersect = set(original).intersection(set(translation))
    union = set(original).union(set(translation))
    try:
        return len(intersect) / len(union)
    except ZeroDivisionError:
        return 0

In [86]:
# read final dataframes with all cluster_ids left for electronics and clothes
electronics_clusters_all_15_df = pd.read_csv(os.path.join(mapping_corpus_path_2, 'electronics_clusters_all_10_tables.csv'), index_col=None)
clothes_clusters_all_10_df = pd.read_csv(os.path.join(mapping_corpus_path_2, 'clothes_clusters_all_8_tables.csv'), index_col=None)

In [87]:
# generate lists for final cluster_ids for electronics and clothes
electronics_final_entities_df = pd.read_csv(os.path.join(notebook_path, 'electronics10.csv'),index_col=None)
electronics_final_entities_list = electronics_final_entities_df['cluster_id']

clothes_final_entities_df = pd.read_csv(os.path.join(notebook_path, 'clothes8.csv'),index_col=None)
clothes_final_entities_list = clothes_final_entities_df['cluster_id']

In [88]:
# generate lists for valid electronics and clothes brands
with open(os.path.join(product_path, 'brands_dict.json'), 'r', encoding='utf-8') as f:
    brands_dict = json.load(f)

electronics_valid_brands = brands_dict['electronics_total']
clothes_valid_brands = brands_dict['clothes']

In [89]:
# lowercase name column for similarity measure
electronics_clusters_all_15_df['name'] = electronics_clusters_all_15_df['name'].apply(lambda row: str(row).lower())
clothes_clusters_all_10_df['name'] = clothes_clusters_all_10_df['name'].apply(lambda row: str(row).lower())

In [90]:
# use tokenizer for name column to get tokens for training the model, remove stopwords and punctuation
electronics_clusters_all_15_df['tokens'] = electronics_clusters_all_15_df['name'].apply(lambda row: word_tokenize(row))
electronics_clusters_all_15_df['tokens'] = remove_stopwords(electronics_clusters_all_15_df['tokens'], stopwords.words())
electronics_clusters_all_15_df['tokens'] = remove_punctuation(electronics_clusters_all_15_df['tokens'])

clothes_clusters_all_10_df['tokens'] = clothes_clusters_all_10_df['name'].apply(lambda row: word_tokenize(row))
clothes_clusters_all_10_df['tokens'] = remove_stopwords(clothes_clusters_all_10_df['tokens'],stopwords.words())
clothes_clusters_all_10_df['tokens'] = remove_punctuation(clothes_clusters_all_10_df['tokens'])

In [91]:
# get tagged words
tagged_data_electronics = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(electronics_clusters_all_15_df['tokens'])]
tagged_data_clothes = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(clothes_clusters_all_10_df['tokens'])]

In [92]:
# build model and vocabulary for electronics (do same for clothes later)
model_electronics = Doc2Vec(vector_size=50, min_count=5, epochs=25, dm=0)
model_electronics.build_vocab(tagged_data_electronics)
# Train model
model_electronics.train(tagged_data_electronics, total_examples=model_electronics.corpus_count, epochs=25)

### Change index label for testing here

In [93]:
electronics_single_cluster_id_df = electronics_clusters_all_15_df[electronics_clusters_all_15_df['cluster_id']==6443]

In [94]:
valid_brands = list(filter(lambda brand: brand in electronics_valid_brands, electronics_single_cluster_id_df['brand'].apply(lambda element: str(element).lower())))
valid_brands

['canon']

In [95]:
most_common_brand = max(valid_brands, key=valid_brands.count)
most_common_brand

'canon'

In [96]:
index_most_common = electronics_single_cluster_id_df[electronics_single_cluster_id_df['brand'].apply(lambda element: str(element).lower()) == most_common_brand].index[0]
index_most_common

12364

In [97]:
similar_doc = model_electronics.docvecs.most_similar(f'{index_most_common}', topn=electronics_clusters_all_15_df.shape[0])
similar_doc_cluster = [tup for tup in similar_doc if int(tup[0]) in list(electronics_single_cluster_id_df.index)]
similar_doc_cluster

[('21566', 0.9851230382919312),
 ('3948', 0.8569894433021545),
 ('20010', 0.8507872819900513),
 ('25232', 0.8477321267127991),
 ('3836', 0.8462553024291992),
 ('23270', 0.8415756821632385),
 ('15601', 0.8415008187294006),
 ('2904', 0.8403884768486023),
 ('9752', 0.8388829827308655),
 ('4784', 0.8385568857192993),
 ('21195', 0.8344157934188843),
 ('6900', 0.831012487411499),
 ('20373', 0.8295968174934387),
 ('6153', 0.829455554485321),
 ('3329', 0.8289289474487305),
 ('25194', 0.8281686902046204),
 ('22589', 0.827642023563385),
 ('21642', 0.8270741701126099),
 ('20288', 0.8238865733146667),
 ('24369', 0.8164963126182556),
 ('6326', 0.7067199349403381),
 ('14616', 0.6413354277610779),
 ('19881', 0.1598285734653473),
 ('24100', -0.09242138266563416)]

In [98]:
similar_doc_cluster_indices = [tup[0] for tup in similar_doc_cluster]
similar_doc_cluster_indices

['21566',
 '3948',
 '20010',
 '25232',
 '3836',
 '23270',
 '15601',
 '2904',
 '9752',
 '4784',
 '21195',
 '6900',
 '20373',
 '6153',
 '3329',
 '25194',
 '22589',
 '21642',
 '20288',
 '24369',
 '6326',
 '14616',
 '19881',
 '24100']

In [99]:
similar_doc_cluster_df = pd.DataFrame(list(similar_doc_cluster), columns=['index','doc2vec'])

In [100]:
similar_doc_cluster_df['index'] = [int(i) for i in similar_doc_cluster_df['index']]

In [101]:
similar_doc_cluster_similarities = [tup[1] for tup in similar_doc_cluster]
similar_doc_cluster_distances = [abs(x - similar_doc_cluster_similarities[i - 1]) for i, x in enumerate(similar_doc_cluster_similarities)][1:]
max_distance = max(similar_doc_cluster_distances)
max_distance

0.4815068542957306

In [102]:
max_distance_index = similar_doc_cluster_distances.index(max_distance)
max_distance_index

21

In [103]:
electronics_single_cluster_id_df['name']

2904           z-man games carcassonne 3 princess & dragon
3329        earthbath ultra-mild wild cherry puppy shampoo
3836                               eric javits phoenix hat
3948           earthbath shampoing sans larmes pour chiots
4784                 macallan carn mor 1989 26yo 42.5% abv
6153          earthbath dog shampoo ultra-mild puppy, 16oz
6326               earthbath ultra mild puppy shampoo 16oz
6900                             star shaped lollipop mold
9752                                     di2 ew-sd50 cable
12364    canon eos r mirrorless digital camera with 24-...
14616                            electric wire,200mm black
15601                          hop cone crown caps - 12 pk
19881                                     taproot magazine
20010           four virtues bourbon barrel aged zinfandel
20288               earthbath earthbath puppy shampoo 16oz
20373                      shimano wire for di2 gear 20 cm
21195           cocktail kingdom shaking tin - gold plat

In [104]:
jaccard_score = electronics_single_cluster_id_df['name'].apply(lambda row: jaccard_similarity_score(row,electronics_single_cluster_id_df['name'].loc[int(index_most_common)]))
jaccard_score

2904     0.500000
3329     0.576923
3836     0.423077
3948     0.560000
4784     0.400000
6153     0.517241
6326     0.464286
6900     0.521739
9752     0.565217
12364    1.000000
14616    0.480000
15601    0.541667
19881    0.416667
20010    0.444444
20288    0.357143
20373    0.695652
21195    0.625000
21566    0.800000
21642    0.576923
22589    0.464286
23270    0.354839
24100    0.304348
24369    0.629630
25194    0.560000
25232    0.555556
Name: name, dtype: float64

In [105]:
jaccard_score_df = pd.DataFrame({'index':jaccard_score.index, 'jaccard':jaccard_score.values})

In [106]:
jaccard_score_df

Unnamed: 0,index,jaccard
0,2904,0.5
1,3329,0.576923
2,3836,0.423077
3,3948,0.56
4,4784,0.4
5,6153,0.517241
6,6326,0.464286
7,6900,0.521739
8,9752,0.565217
9,12364,1.0


In [107]:
similarity_df = pd.merge(similar_doc_cluster_df, jaccard_score_df, left_on='index', right_on='index', how='left')
similarity_df

Unnamed: 0,index,doc2vec,jaccard
0,21566,0.985123,0.8
1,3948,0.856989,0.56
2,20010,0.850787,0.444444
3,25232,0.847732,0.555556
4,3836,0.846255,0.423077
5,23270,0.841576,0.354839
6,15601,0.841501,0.541667
7,2904,0.840388,0.5
8,9752,0.838883,0.565217
9,4784,0.838557,0.4


In [108]:
similarity_df[similarity_df['doc2vec']>0.97]

Unnamed: 0,index,doc2vec,jaccard
0,21566,0.985123,0.8


In [113]:
valid_cluster_id_df = similarity_df[(similarity_df['doc2vec']>0.97) | (similarity_df['jaccard']>0.5)]

In [114]:
valid_cluster_id_indices = valid_cluster_id_df['index'].to_list()
valid_cluster_id_indices

[21566,
 3948,
 25232,
 15601,
 9752,
 21195,
 6900,
 20373,
 6153,
 3329,
 25194,
 21642,
 24369]

In [None]:
electronics_single_cluster_id_df_new = electronics_single_cluster_id_df[electronics_single_cluster_id_df]