In [None]:
import pandas as pd
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import numpy as np
import pickle

In [None]:
# Pretrained embedding model fetched by name through gensim's downloader API.
# Kept as a module-level global: get_embedding() uses it for vocabulary
# membership tests, vector lookups and `vector_size`.
# NOTE(review): presumably a large download that is cached after the first
# call -- confirm before relying on a quick Restart & Run All.
model = api.load('word2vec-google-news-300')

In [None]:
import nltk
# word_tokenize() needs the Punkt sentence-tokenizer data. Older NLTK releases
# load it from the pickled 'punkt' package, while NLTK >= 3.8.2 loads it from
# 'punkt_tab' and raises LookupError if only 'punkt' is installed. Fetch both
# so the notebook runs regardless of the installed NLTK version.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
import json

def load_json(path: str):
    """Read the UTF-8 encoded file at ``path`` and parse it as JSON.

    Args:
        path: location of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the content is not valid
        JSON (in that case a message naming the file is printed).
    """
    with open(path, 'r', encoding='utf-8') as handle:
        try:
            parsed = json.load(handle)
        except json.JSONDecodeError as e:
            # Best-effort: report the broken file and signal failure with None.
            print(f"Error reading {path}: {e}")
            return None
    return parsed

In [None]:
def get_embedding(json_as_dict: dict):
    """Embed a parsed JSON record as the mean vector of its value tokens.

    Only the values of the mapping are used; keys are ignored. A value may be
    a string or a list of strings. Each string is lower-cased and split with
    ``word_tokenize``; every token found in the global ``model`` contributes
    its vector to the average.

    Values (or list elements) that are not strings -- numbers, booleans,
    ``None``, nested objects -- have no text to tokenize and are skipped
    rather than raising ``AttributeError`` on ``.lower()``.

    Args:
        json_as_dict: mapping from attribute name to a string or a list of
            strings.

    Returns:
        A numpy array of length ``model.vector_size``: the element-wise mean
        of all matched token vectors, or all zeros when no token of the
        record is in the model's vocabulary.
    """
    token_vectors = []
    for value in json_as_dict.values():
        # Treat a scalar value as a one-element list so both shapes share a path.
        texts = value if isinstance(value, list) else [value]
        for text in texts:
            if not isinstance(text, str):
                continue  # non-text JSON value: nothing to tokenize
            for token in word_tokenize(text.lower()):
                if token in model:
                    token_vectors.append(model[token])

    if not token_vectors:
        return np.zeros(model.vector_size)
    return np.array(np.mean(token_vectors, axis=0))

In [None]:
import csv

# Build the labelled pair dataset and the per-record embedding table.
#
# Every data row of the ground-truth CSV holds two record references written
# as "<folder>//<file id>" followed by an integer label. Each distinct record
# file gets a dense integer id (in first-seen order) and is embedded once.
# Outputs: 'dataset.csv' (shuffled id pairs + label) and 'id2embedding.pkl'.

dataset_entries = []

path2id = dict()
id2embedding = dict()

GROUND_TRUTH_PATH = 'dataset/monitor_entity_resolution_labelled.csv'
DATASET_PATH = "dataset/2013_monitor_specs/"
SHUFFLE_SEED = 42  # fixed so that dataset.csv is identical across re-runs

numberOfZeros = 0
numberOfOnes = 0


def _resolve_record_id(record_ref: str) -> int:
    """Return the integer id of a "<folder>//<file id>" reference, embedding the record on first sight."""
    folder, file_id = record_ref.split("//")
    record_path = DATASET_PATH + folder + "/" + file_id + ".json"
    if record_path not in path2id:
        record = load_json(record_path)
        if record is None:
            # load_json() reports malformed JSON by returning None; stop with a
            # message naming the file instead of an opaque TypeError downstream.
            raise ValueError(f"Could not parse record file: {record_path}")
        new_id = len(path2id)
        id2embedding[new_id] = get_embedding(record)
        path2id[record_path] = new_id
    return path2id[record_path]


# newline='' is what the csv module expects so it can handle line endings itself.
with open(GROUND_TRUTH_PATH, mode='r', newline='') as file:
    csv_reader = csv.reader(file)

    header = next(csv_reader, None)
    if header is None:
        raise ValueError(f"Ground-truth file is empty: {GROUND_TRUTH_PATH}")
    # The header stays the first entry of dataset_entries, as before.
    dataset_entries.append(header)

    for index, row in enumerate(csv_reader, start=1):
        print("\r"+str(index), end="")
        if not row:
            continue  # blank line in the CSV

        label = int(row[2])
        if label == 0:
            numberOfZeros += 1
        else:
            numberOfOnes += 1

        left_record_id = _resolve_record_id(row[0])
        right_record_id = _resolve_record_id(row[1])
        dataset_entries.append([left_record_id, right_record_id, label])

dataset = pd.DataFrame(dataset_entries[1:], columns=dataset_entries[0])
# Seeded shuffle: same row order on every Restart & Run All.
shuffled_dataset = dataset.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
shuffled_dataset.to_csv('dataset.csv', index=False)

print()
print(numberOfZeros)
print(numberOfOnes)

with open('id2embedding.pkl', 'wb') as f:
    pickle.dump(id2embedding, f)