In [2]:
import pandas as pd
import json 
import gzip
import os

import nltk
from nltk.corpus import stopwords
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [3]:
# Base data directory, given relative to the working directory.
data_path = '../src/data'
# Directory from which df_large_matched.json is read and to which the
# filtered per-category cluster CSV files are written.
mapping_corpus_path_2 = data_path + r'/product/lspc2020_to_tablecorpus/Cleaned'
# Directory holding the cluster-id selection CSV files read below.
notebook_path = '../notebooks'
# Directory from which brands_dict.json is read.
product_path = os.path.join(data_path, 'product')

In [3]:
# Load the matched cluster records (gzip-compressed, one JSON record per line).
matched_clusters_file = os.path.join(mapping_corpus_path_2, 'df_large_matched.json')
cluster_df = pd.read_json(
    matched_clusters_file,
    orient='records',
    lines=True,
    compression='gzip',
)

In [3]:
# Read the electronics cluster selection and keep its cluster_id column.
# Note: despite the "_list" suffix this is a pandas Series.
electronics_final_entities_df = pd.read_csv(
    os.path.join(notebook_path, 'electronics_clusters_15_tables.csv'),
    index_col=None,
)
electronics_final_entities_list = electronics_final_entities_df['cluster_id']

In [4]:
# Read the clothes cluster selection and keep its cluster_id column.
# Note: despite the "_list" suffix this is a pandas Series.
clothes_final_entities_df = pd.read_csv(
    os.path.join(notebook_path, 'clothes_clusters_10_tables.csv'),
    index_col=None,
)
clothes_final_entities_list = clothes_final_entities_df['cluster_id']

In [5]:
# Keep only the rows whose cluster_id is in the electronics selection.
# Requires cluster_df from the read_json cell; the recorded NameError shows
# this cell was last run while cluster_df was not defined.
electronics_cluster_mask = cluster_df['cluster_id'].isin(electronics_final_entities_list)
electronics_clusters_all_15_df = cluster_df[electronics_cluster_mask]

NameError: name 'cluster_df' is not defined

In [None]:
# Keep only the rows whose cluster_id is in the clothes selection.
# Requires cluster_df from the read_json cell.
clothes_cluster_mask = cluster_df['cluster_id'].isin(clothes_final_entities_list)
clothes_clusters_all_10_df = cluster_df[clothes_cluster_mask]

In [8]:
# Persist the filtered electronics rows so later runs can start from the CSV.
# All columns are written (the default), together with the DataFrame index.
electronics_output_file = os.path.join(mapping_corpus_path_2, 'electronics_clusters_all_15_tables.csv')
electronics_clusters_all_15_df.to_csv(electronics_output_file)

In [9]:
# Persist the filtered clothes rows so later runs can start from the CSV.
# All columns are written (the default), together with the DataFrame index.
clothes_output_file = os.path.join(mapping_corpus_path_2, 'clothes_clusters_all_10_tables.csv')
clothes_clusters_all_10_df.to_csv(clothes_output_file)

## Run from here (the cells above only need to be run once to create the filtered cluster CSV files)

In [4]:
def remove_stopwords(token_vector, stopwords_list):
    """Remove stopwords from every token list in a Series.

    Parameters
    ----------
    token_vector : pandas.Series
        Series whose elements are iterables of tokens.
    stopwords_list : iterable of str
        Words to drop; tokens are compared by exact equality.

    Returns
    -------
    pandas.Series
        Series with the same index whose elements are lists of the tokens
        that are not stopwords, in their original order.
    """
    # Build the lookup set once: testing membership against a long list
    # rescans it for every token, and a one-shot iterable (e.g. a generator)
    # would be exhausted after the first lookups.
    stopword_set = set(stopwords_list)
    return token_vector.apply(
        lambda token_list: [word for word in token_list if word not in stopword_set]
    )

In [5]:
def remove_punctuation(token_vector):
    """Remove punctuation-only tokens from every token list in a Series.

    A token is dropped when every one of its characters is in
    ``string.punctuation``. This covers single marks such as ``,`` as well as
    multi-character tokens such as ``''``, ``...`` or ``--``. A plain
    ``word in string.punctuation`` test is a substring test and only matches
    runs that happen to be contiguous in that constant.

    Parameters
    ----------
    token_vector : pandas.Series
        Series whose elements are iterables of string tokens.

    Returns
    -------
    pandas.Series
        Series with the same index whose elements are lists of the remaining
        tokens, in their original order.
    """
    punctuation_chars = set(string.punctuation)

    def is_punctuation(word):
        # all() is True for the empty string, so empty tokens are dropped too
        # (the substring test also treated '' as punctuation).
        return all(char in punctuation_chars for char in word)

    return token_vector.apply(
        lambda token_list: [word for word in token_list if not is_punctuation(word)]
    )

In [6]:
# read final dataframes with all cluster_ids left for electronics and clothes
# Re-load the filtered cluster rows for electronics and clothes from CSV.
# index_col=None gives both frames a fresh 0..n-1 row index.
electronics_clusters_file = os.path.join(mapping_corpus_path_2, 'electronics_clusters_all_15_tables.csv')
clothes_clusters_file = os.path.join(mapping_corpus_path_2, 'clothes_clusters_all_10_tables.csv')

electronics_clusters_all_15_df = pd.read_csv(electronics_clusters_file, index_col=None)
clothes_clusters_all_10_df = pd.read_csv(clothes_clusters_file, index_col=None)

In [7]:
# generate lists for final cluster_ids for electronics and clothes
# Selected cluster ids per category; the "_list" names hold pandas Series.
electronics_selection_file = os.path.join(notebook_path, 'electronics_clusters_15_tables.csv')
clothes_selection_file = os.path.join(notebook_path, 'clothes_clusters_10_tables.csv')

electronics_final_entities_df = pd.read_csv(electronics_selection_file, index_col=None)
electronics_final_entities_list = electronics_final_entities_df['cluster_id']

clothes_final_entities_df = pd.read_csv(clothes_selection_file, index_col=None)
clothes_final_entities_list = clothes_final_entities_df['cluster_id']

In [8]:
# generate lists for valid electronics and clothes brands
# Load the brand dictionary and pull out the brand collections per category.
brands_file = os.path.join(product_path, 'brands_dict.json')
with open(brands_file, mode='r', encoding='utf-8') as f:
    brands_dict = json.load(f)

electronics_valid_brands, clothes_valid_brands = (
    brands_dict['electronics_total'],
    brands_dict['clothes'],
)

In [9]:
# lowercase name column for similarity measure
# Normalise product names to lowercase strings before tokenising.
# str() is applied first, so non-string values (e.g. NaN) become text too.
electronics_clusters_all_15_df['name'] = electronics_clusters_all_15_df['name'].map(
    lambda value: str(value).lower()
)
clothes_clusters_all_10_df['name'] = clothes_clusters_all_10_df['name'].map(
    lambda value: str(value).lower()
)

In [10]:
# use tokenizer for name column to get tokens for training the model, remove stopwords and punctuation
# Tokenise the lowercase names, then strip stopwords and punctuation tokens.
# The stopword corpus is loaded once and kept as a set: the same words are
# needed for both categories, and set membership avoids rescanning a long
# list for every token.
all_stopwords = set(stopwords.words())

electronics_tokens = electronics_clusters_all_15_df['name'].apply(word_tokenize)
electronics_tokens = remove_stopwords(electronics_tokens, all_stopwords)
electronics_clusters_all_15_df['tokens'] = remove_punctuation(electronics_tokens)

clothes_tokens = clothes_clusters_all_10_df['name'].apply(word_tokenize)
clothes_tokens = remove_stopwords(clothes_tokens, all_stopwords)
clothes_clusters_all_10_df['tokens'] = remove_punctuation(clothes_tokens)

In [11]:
# get tagged words
# Wrap each token list in a TaggedDocument. The tag is the row's position
# in the frame (as a string), not its cluster_id.
tagged_data_electronics = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(electronics_clusters_all_15_df['tokens'])
]
tagged_data_clothes = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(clothes_clusters_all_10_df['tokens'])
]

In [12]:
# build model and vocabulary for electronics (do same for clothes later)
# Build the Doc2Vec model and its vocabulary for electronics
# (the clothes model is not trained in this cell).
model_electronics = Doc2Vec(dm=0, vector_size=50, min_count=5, epochs=25)
model_electronics.build_vocab(tagged_data_electronics)

# Train for the number of epochs configured on the model above.
model_electronics.train(
    tagged_data_electronics,
    total_examples=model_electronics.corpus_count,
    epochs=model_electronics.epochs,
)

### Change the `cluster_id` below to test a different cluster

In [122]:
# Select all rows of the single cluster that is inspected in the cells below.
cluster_id_under_test = 6443
electronics_single_cluster_id_df = electronics_clusters_all_15_df[
    electronics_clusters_all_15_df['cluster_id'] == cluster_id_under_test
]

In [123]:
# Lowercase the cluster's brand values and keep those that appear in the
# valid electronics brands (duplicates are kept so they can be counted).
cluster_brands_lower = electronics_single_cluster_id_df['brand'].apply(
    lambda element: str(element).lower()
)
valid_brands = [brand for brand in cluster_brands_lower if brand in electronics_valid_brands]

In [124]:
# Show the valid brand values found in the selected cluster.
valid_brands

['canon']

In [125]:
# Pick the brand that occurs most often; on a tie the first one in
# valid_brands wins. Raises ValueError when valid_brands is empty.
most_common_brand = max(
    valid_brands,
    key=lambda brand: valid_brands.count(brand),
)

In [126]:
# Show the most frequent valid brand of the selected cluster.
most_common_brand

'canon'

In [127]:
# Index label of the first cluster row whose lowercased brand equals the
# most common brand; this row is the reference document for the comparisons.
brand_is_most_common = (
    electronics_single_cluster_id_df['brand'].apply(lambda element: str(element).lower())
    == most_common_brand
)
index_most_common = electronics_single_cluster_id_df[brand_is_most_common].index[0]

In [128]:
# Show the index label of the reference row.
index_most_common

9821

In [129]:
# Rank every document of the electronics model by similarity to the reference row.
# NOTE(review): document tags are enumerate positions while index_most_common
# is an index label; they match only while the frame keeps the default
# 0..n-1 index from read_csv - confirm if the frame is ever filtered or re-indexed.
similar_doc = model_electronics.docvecs.most_similar(
    str(index_most_common),
    topn=len(electronics_clusters_all_15_df),
)

In [130]:
# Keep only the similarity results that belong to the selected cluster.
# The cluster's index labels are collected into a set once, instead of
# rebuilding and scanning a list for every result tuple.
cluster_row_labels = set(electronics_single_cluster_id_df.index)
similar_doc_cluster = [tup for tup in similar_doc if int(tup[0]) in cluster_row_labels]

In [131]:
# Show the (tag, similarity) pairs of the cluster rows, in the order returned by most_similar.
similar_doc_cluster

[('17211', 0.9915350675582886),
 ('7698', 0.9530832767486572),
 ('19449', 0.945699155330658),
 ('2297', 0.9418596029281616),
 ('5358', 0.9416583776473999),
 ('20071', 0.9407882690429688),
 ('17273', 0.9397830963134766),
 ('16897', 0.9396624565124512),
 ('20102', 0.9390707015991211),
 ('4816', 0.9385650157928467),
 ('16215', 0.9385466575622559),
 ('2626', 0.9383862614631653),
 ('18583', 0.9380822777748108),
 ('18035', 0.9378562569618225),
 ('15932', 0.9371479153633118),
 ('3016', 0.9366330504417419),
 ('12465', 0.9360285401344299),
 ('3085', 0.935420572757721),
 ('16165', 0.9329388737678528),
 ('3727', 0.9277349710464478),
 ('4926', 0.8776125907897949),
 ('11676', 0.8206188082695007),
 ('19230', 0.07203356921672821),
 ('15834', -0.0624079592525959)]

In [132]:
# Extract just the similarity values, keeping the ranking order.
similar_doc_cluster_similarities = [similarity for _tag, similarity in similar_doc_cluster]

In [133]:
# Absolute gap between each pair of consecutive similarity values:
# entry k is |similarities[k + 1] - similarities[k]|.
similar_doc_cluster_distances = [
    abs(current - previous)
    for previous, current in zip(
        similar_doc_cluster_similarities,
        similar_doc_cluster_similarities[1:],
    )
]

In [134]:
# Largest gap between two consecutive similarity values.
# max() raises ValueError if there are fewer than two cluster results.
max_distance = max(similar_doc_cluster_distances)
max_distance

0.7485852390527725

In [135]:
# Position of the first occurrence of the largest gap; the gap lies between
# entries max_distance_index and max_distance_index + 1 of similar_doc_cluster.
max_distance_index = similar_doc_cluster_distances.index(max_distance)
max_distance_index

21

In [136]:
def jaccard_similarity_score(original, translation):
    """Return the Jaccard similarity of two iterables.

    Both arguments are converted to sets, so duplicates are ignored and a
    string is compared character by character.

    Parameters
    ----------
    original, translation : iterable of hashable items

    Returns
    -------
    float or int
        ``len(intersection) / len(union)``, or ``0`` when both inputs are empty.
    """
    left, right = set(original), set(translation)
    combined = left | right
    # An empty union means both inputs were empty; report no similarity.
    if not combined:
        return 0
    return len(left & right) / len(combined)

In [137]:
# Show the lowercased product names of the selected cluster.
electronics_single_cluster_id_df['name']

2297           z-man games carcassonne 3 princess & dragon
2626        earthbath ultra-mild wild cherry puppy shampoo
3016                               eric javits phoenix hat
3085           earthbath shampoing sans larmes pour chiots
3727                 macallan carn mor 1989 26yo 42.5% abv
4816          earthbath dog shampoo ultra-mild puppy, 16oz
4926               earthbath ultra mild puppy shampoo 16oz
5358                             star shaped lollipop mold
7698                                     di2 ew-sd50 cable
9821     canon eos r mirrorless digital camera with 24-...
11676                            electric wire,200mm black
12465                          hop cone crown caps - 12 pk
15834                                     taproot magazine
15932           four virtues bourbon barrel aged zinfandel
16165               earthbath earthbath puppy shampoo 16oz
16215                      shimano wire for di2 gear 20 cm
16897           cocktail kingdom shaking tin - gold plat

In [138]:
# Jaccard similarity of every name in the cluster against the reference name.
# The reference name is looked up once instead of once per row.
# NOTE(review): both arguments are whole name strings, so the set() calls in
# jaccard_similarity_score compare characters, not word tokens - confirm
# whether the 'tokens' column was meant to be used here.
reference_name = electronics_single_cluster_id_df['name'].loc[int(index_most_common)]
jaccard_score = electronics_single_cluster_id_df['name'].apply(
    lambda row: jaccard_similarity_score(row, reference_name)
)

In [139]:
# Drop the reference row (its score against itself carries no information)
# and rank the remaining rows from most to least similar.
# errors='ignore' keeps the cell re-runnable: on a second run the label has
# already been dropped, which would otherwise raise KeyError.
jaccard_score = (
    jaccard_score
    .drop(int(index_most_common), errors='ignore')
    .sort_values(ascending=False)
)

In [140]:
# Show the Jaccard scores against the reference row, highest first.
jaccard_score

17211    0.800000
16215    0.695652
19449    0.629630
16897    0.625000
17273    0.576923
2626     0.576923
7698     0.565217
20071    0.560000
3085     0.560000
20102    0.555556
12465    0.541667
5358     0.521739
4816     0.517241
2297     0.500000
11676    0.480000
4926     0.464286
18035    0.464286
15932    0.444444
3016     0.423077
15834    0.416667
3727     0.400000
16165    0.357143
18583    0.354839
19230    0.304348
Name: name, dtype: float64