In [None]:
import gzip
import json
import os
import string
from collections import Counter

import nltk
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Root of the data directory, relative to the notebook's working directory.
data_path = '../src/data'
# Folder with product-level inputs such as brands_dict.json.
product_path = os.path.join(data_path, 'product')
# Folder with the cleaned LSPC2020 -> table-corpus mapping files.
mapping_corpus_path_2 = f'{data_path}/product/lspc2020_to_tablecorpus/Cleaned'
# Folder that holds the notebooks and the final cluster-id CSV files.
notebook_path = '../notebooks'

In [None]:
# Load the matched cluster records (gzip-compressed JSON Lines, one record per line).
cluster_df = pd.read_json(
    os.path.join(mapping_corpus_path_2, 'df_large_matched.json'),
    orient='records',
    lines=True,
    compression='gzip',
)

In [None]:
# Final electronics cluster ids, read from the CSV stored in the notebook folder.
electronics_final_entities_df = pd.read_csv(
    os.path.join(notebook_path, 'electronics_clusters_15_tables.csv'),
    index_col=None,
)
# Despite the "_list" suffix this is the 'cluster_id' column as a pandas Series.
electronics_final_entities_list = electronics_final_entities_df['cluster_id']

In [None]:
# Final clothes cluster ids, read from the CSV stored in the notebook folder.
clothes_final_entities_df = pd.read_csv(
    os.path.join(notebook_path, 'clothes_clusters_10_tables.csv'),
    index_col=None,
)
# Despite the "_list" suffix this is the 'cluster_id' column as a pandas Series.
clothes_final_entities_list = clothes_final_entities_df['cluster_id']

In [None]:
# Keep only the rows of cluster_df whose cluster_id is one of the final electronics / clothes ids.
electronics_clusters_all_15_df = cluster_df[cluster_df['cluster_id'].isin(electronics_final_entities_list)]
clothes_clusters_all_10_df = cluster_df[cluster_df['cluster_id'].isin(clothes_final_entities_list)]

# Persist the filtered frames. index=False keeps the row index out of the file; without it
# the index is written as a nameless column and reappears as 'Unnamed: 0' when the file is
# read back with index_col=None.
electronics_clusters_all_15_df.to_csv(os.path.join(mapping_corpus_path_2, 'electronics_clusters_all_15_tables.csv'), index=False)
clothes_clusters_all_10_df.to_csv(os.path.join(mapping_corpus_path_2, 'clothes_clusters_all_10_tables.csv'), index=False)

# Run from here (the cells above only need to be run once to create the filtered CSV files)

In [None]:
def remove_stopwords(token_vector, stopwords_list):
    """Drop stop words from every token list in a pandas Series.

    Parameters
    ----------
    token_vector : pandas.Series
        Series whose elements are iterables of tokens.
    stopwords_list : iterable of str
        Words to remove. Any iterable of hashable items is accepted.

    Returns
    -------
    pandas.Series
        Series of lists containing the tokens that are not stop words.
    """
    # Build the lookup once: membership tests on a frozenset are O(1), whereas the
    # original list was scanned for every single token.
    stopword_lookup = frozenset(stopwords_list)
    return token_vector.apply(
        lambda token_list: [word for word in token_list if word not in stopword_lookup]
    )

def remove_punctuation(token_vector):
    """Drop tokens that consist only of punctuation from every token list in a Series.

    Parameters
    ----------
    token_vector : pandas.Series
        Series whose elements are iterables of string tokens.

    Returns
    -------
    pandas.Series
        Series of lists without pure-punctuation tokens. Tokens that merely contain
        punctuation (e.g. 'usb-c') are kept.
    """
    punctuation_chars = frozenset(string.punctuation)

    def _is_punctuation(word):
        # `word in string.punctuation` is a substring test, so multi-character tokens
        # such as '...', '--' or "''" slipped through. Checking every character catches
        # them; an empty token is still treated as punctuation, as before.
        return all(char in punctuation_chars for char in word)

    return token_vector.apply(
        lambda token_list: [word for word in token_list if not _is_punctuation(word)]
    )

def jaccard_similarity_score(original, translation):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two iterables.

    Both arguments are converted to sets of their elements, so passing two strings
    compares their sets of characters, while passing token lists compares tokens.

    Parameters
    ----------
    original, translation : iterable of hashable items

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when both inputs are empty.
    """
    # Materialise each input exactly once. The previous version called set() twice per
    # argument, which produced an empty union (and a score of 0) for one-shot iterators.
    original_items = set(original)
    translation_items = set(translation)
    union = original_items | translation_items
    if not union:
        return 0.0
    return len(original_items & translation_items) / len(union)

In [None]:
# Reload the filtered cluster frames for electronics and clothes from the CSV files on disk.
electronics_clusters_all_15_df = pd.read_csv(
    os.path.join(mapping_corpus_path_2, 'electronics_clusters_all_15_tables.csv'),
    index_col=None,
)
clothes_clusters_all_10_df = pd.read_csv(
    os.path.join(mapping_corpus_path_2, 'clothes_clusters_all_10_tables.csv'),
    index_col=None,
)

In [None]:
# Final cluster ids for electronics and clothes (each "_list" is the 'cluster_id' Series).
electronics_final_entities_df = pd.read_csv(
    os.path.join(notebook_path, 'electronics_clusters_15_tables.csv'),
    index_col=None,
)
electronics_final_entities_list = electronics_final_entities_df['cluster_id']

clothes_final_entities_df = pd.read_csv(
    os.path.join(notebook_path, 'clothes_clusters_10_tables.csv'),
    index_col=None,
)
clothes_final_entities_list = clothes_final_entities_df['cluster_id']

In [None]:
# Load the brand dictionary and pick out the valid brands for electronics and clothes.
with open(os.path.join(product_path, 'brands_dict.json'), mode='r', encoding='utf-8') as f:
    brands_dict = json.load(f)

# The electronics brands are stored under 'electronics_total', the clothes brands under 'clothes'.
clothes_valid_brands = brands_dict['clothes']
electronics_valid_brands = brands_dict['electronics_total']

In [None]:
# Normalise product names to lowercase strings before tokenising / comparing them.
# str() is applied first, so non-string values (e.g. NaN) become their text form.
electronics_clusters_all_15_df['name'] = electronics_clusters_all_15_df['name'].map(
    lambda value: str(value).lower()
)
clothes_clusters_all_10_df['name'] = clothes_clusters_all_10_df['name'].map(
    lambda value: str(value).lower()
)

In [None]:
# Tokenise the name column to get training tokens, then remove stop words and punctuation.

# Load the stop words once and keep them in a frozenset: the corpus was previously read
# twice and passed as a list, making every token lookup a linear scan.
# stopwords.words() is called without a language, so it is not restricted to one language.
# NOTE(review): confirm this is intended rather than e.g. stopwords.words('english').
all_stopwords = frozenset(stopwords.words())

electronics_clusters_all_15_df['tokens'] = electronics_clusters_all_15_df['name'].apply(word_tokenize)
electronics_clusters_all_15_df['tokens'] = remove_stopwords(electronics_clusters_all_15_df['tokens'], all_stopwords)
electronics_clusters_all_15_df['tokens'] = remove_punctuation(electronics_clusters_all_15_df['tokens'])

clothes_clusters_all_10_df['tokens'] = clothes_clusters_all_10_df['name'].apply(word_tokenize)
clothes_clusters_all_10_df['tokens'] = remove_stopwords(clothes_clusters_all_10_df['tokens'], all_stopwords)
clothes_clusters_all_10_df['tokens'] = remove_punctuation(clothes_clusters_all_10_df['tokens'])

In [None]:
# Wrap every token list in a TaggedDocument; the tag is the row's position as a string.
tagged_data_electronics = [
    TaggedDocument(words=token_list, tags=[str(position)])
    for position, token_list in enumerate(electronics_clusters_all_15_df['tokens'])
]
tagged_data_clothes = [
    TaggedDocument(words=token_list, tags=[str(position)])
    for position, token_list in enumerate(clothes_clusters_all_10_df['tokens'])
]

In [None]:
# Build the Doc2Vec model and vocabulary for electronics (do the same for clothes later).
# dm=0 selects the distributed bag-of-words (PV-DBOW) training algorithm.
model_electronics = Doc2Vec(vector_size=50, min_count=5, epochs=25, dm=0)
model_electronics.build_vocab(tagged_data_electronics)
# Train the model. The epoch count is read back from the model so that it is configured in
# one place only and cannot drift from the value passed to the constructor.
model_electronics.train(
    tagged_data_electronics,
    total_examples=model_electronics.corpus_count,
    epochs=model_electronics.epochs,
)

### Change the cluster_id in the next cell to test a different cluster

In [None]:
# Restrict the electronics frame to the rows of one cluster (hard-coded cluster_id 6443).
in_selected_cluster = electronics_clusters_all_15_df['cluster_id'].eq(6443)
electronics_single_cluster_id_df = electronics_clusters_all_15_df[in_selected_cluster]

In [None]:
# Lowercased brand strings of the selected cluster, keeping only those that appear in the
# valid electronics brands (duplicates are kept so that they can be counted afterwards).
cluster_brands_lower = electronics_single_cluster_id_df['brand'].apply(lambda element: str(element).lower())
valid_brands = [brand for brand in cluster_brands_lower if brand in electronics_valid_brands]
valid_brands

In [None]:
# Most frequent valid brand in the cluster. Counter tallies the list in a single pass
# (max(..., key=list.count) rescans the list for every element); on ties the brand that
# occurs first in valid_brands still wins.
if not valid_brands:
    # Same exception type as max() on an empty list, but with an actionable message.
    raise ValueError('no valid brand found in the selected cluster')
most_common_brand = Counter(valid_brands).most_common(1)[0][0]
most_common_brand

In [None]:
# Index label of the first cluster row whose lowercased brand equals the most common brand.
has_most_common_brand = (
    electronics_single_cluster_id_df['brand'].apply(lambda element: str(element).lower())
    == most_common_brand
)
index_most_common = electronics_single_cluster_id_df[has_most_common_brand].index[0]
index_most_common

In [None]:
# Rank the documents of the electronics model by similarity to the anchor row. The tag
# looked up here is the anchor's index label; it matches the positional document tags only
# while the frame keeps its default 0..n-1 index.
similar_doc = model_electronics.docvecs.most_similar(f'{index_most_common}', topn=electronics_clusters_all_15_df.shape[0])
# Build the set of cluster row labels once; the previous version re-created a list from the
# index for every candidate tuple and scanned it linearly.
cluster_row_labels = set(electronics_single_cluster_id_df.index)
# Keep only the (tag, similarity) pairs that belong to the selected cluster.
similar_doc_cluster = [tup for tup in similar_doc if int(tup[0]) in cluster_row_labels]
similar_doc_cluster

In [None]:
# Similarity scores of the cluster members, in the order they appear in similar_doc_cluster.
similar_doc_cluster_similarities = [similarity for _, similarity in similar_doc_cluster]
# Absolute gap between each score and the score directly before it.
similar_doc_cluster_distances = [
    abs(current - previous)
    for previous, current in zip(similar_doc_cluster_similarities, similar_doc_cluster_similarities[1:])
]
# Largest gap between two neighbouring scores.
max_distance = max(similar_doc_cluster_distances)
max_distance

In [None]:
# Position of the largest gap in the distance list (first occurrence if the maximum is tied).
max_distance_index = similar_doc_cluster_distances.index(max_distance)
max_distance_index

In [None]:
# Display the names of all rows in the selected cluster for manual inspection.
electronics_single_cluster_id_df['name']

In [None]:
# Jaccard similarity between every name in the cluster and the anchor row's name.
# The anchor name is looked up once instead of inside the lambda for every row.
anchor_name = electronics_single_cluster_id_df['name'].loc[int(index_most_common)]
# NOTE(review): both arguments are passed as given and turned into sets by
# jaccard_similarity_score; if the names are plain strings the score is computed over sets
# of characters. Confirm whether a token-level comparison (the 'tokens' column) was intended.
jaccard_score = electronics_single_cluster_id_df['name'].apply(
    lambda row: jaccard_similarity_score(row, anchor_name)
)
jaccard_score