In [1]:
from transformers import BertTokenizer, BertModel

# Load the pretrained tokenizer and encoder once; get_embedding() in a later
# cell reads both of them as module-level globals.
PRETRAINED_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME)



In [2]:
import json

def load_json(path: str):
    """Read and parse the UTF-8 JSON file at ``path``.

    Args:
        path: Location of the JSON file on disk.

    Returns:
        The parsed JSON content, or ``None`` when the file is not valid JSON
        (the decode error is printed rather than raised).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    try:
        with open(path, 'r', encoding='utf-8') as json_file:
            parsed = json.load(json_file)
    except json.JSONDecodeError as e:
        # Malformed files are reported and signalled to the caller with None.
        print(f"Error reading {path}: {e}")
        return None
    return parsed

In [3]:
def get_embedding(json_as_dict: dict):
    """Embed a record using only its ``"<page title>"`` field.

    NOTE(review): a second ``get_embedding`` is defined right after this one
    in the same cell and rebinds the name, so this title-only variant is not
    the one that later cells end up calling.

    Args:
        json_as_dict: Parsed JSON record; must contain ``"<page title>"``.

    Returns:
        NumPy array with the last-layer hidden state of the first token
        (the ``[CLS]`` position) of the tokenized title.
    """
    encoded = tokenizer(
        json_as_dict["<page title>"],
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=64,
    )
    hidden = model(**encoded).last_hidden_state

    # hidden[0] is the only sequence in the batch; its element 0 is the first token.
    first_token = hidden[0][0]
    return first_token.detach().numpy()

def get_embedding(json_as_dict: dict):
    """Embed a whole JSON record as a single vector.

    Every value of the record is lower-cased and concatenated, in key order,
    into one string (each piece prefixed by a space); list values contribute
    each of their elements. The string is tokenized with truncation at 512
    tokens and passed through the module-level ``model``.

    Args:
        json_as_dict: Parsed JSON record. Values are expected to be strings
            or lists of strings; other non-null values are converted with
            ``str`` and ``None`` entries are skipped instead of crashing.

    Returns:
        NumPy array with the last-layer hidden state of the first token
        (the ``[CLS]`` position).
    """
    # Local import: torch is only needed here to disable gradient tracking.
    import torch

    pieces = []
    for value in json_as_dict.values():
        elems = value if isinstance(value, list) else [value]
        for elem in elems:
            if elem is None:
                continue
            pieces.append(" " + str(elem).lower())
    text = "".join(pieces)

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Inference only: skip building the autograd graph to save time and memory.
    with torch.no_grad():
        outputs = model(**inputs)

    # First (only) sequence of the batch, first token. Mean pooling over all
    # tokens (torch.mean(..., dim=0)) was considered as an alternative.
    cls_embedding = outputs.last_hidden_state[0][0]

    return cls_embedding.detach().numpy()

In [4]:
import csv

# Lookup tables filled while reading the ground truth:
#   path2id      - JSON file path -> dense integer id (assigned on first sight)
#   id2embedding - integer id     -> embedding returned by get_embedding()
#   id2title     - integer id     -> the record's "<page title>" value
path2id = dict()
id2embedding = dict()
id2title = dict()

DATASET_FOLDER_PATH = 'dataset/'
GROUND_TRUTH_PATH = DATASET_FOLDER_PATH + 'monitor_entity_resolution_labelled.csv'
SOURCES_PATH = DATASET_FOLDER_PATH + "2013_monitor_specs/"

# Label counters (0 vs. anything else) over all ground-truth rows.
numberOfZeros = 0
numberOfOnes = 0

# Rows of [left_id, right_id, label], split by label.
positives = []
negatives = []


def _source_path(reference: str) -> str:
    """Turn a ``"<folder>//<id>"`` reference from the CSV into a JSON file path."""
    folder, record_id = reference.split("//")
    return SOURCES_PATH + folder + "/" + record_id + ".json"


def _register_source(path: str) -> int:
    """Return the integer id for ``path``, loading and embedding it on first sight.

    The three lookup tables are only updated once the record has been loaded,
    embedded and its title read, so a failure leaves them consistent.

    Raises:
        ValueError: If the file exists but could not be parsed as JSON.
    """
    if path in path2id:
        return path2id[path]

    record = load_json(path)
    if record is None:
        raise ValueError(f"Could not parse JSON record at {path}")

    embedding = get_embedding(record)
    title = record["<page title>"]

    new_id = len(path2id)
    id2embedding[new_id] = embedding
    id2title[new_id] = title
    path2id[path] = new_id
    return new_id


# newline='' is what the csv module expects for files handed to csv.reader.
with open(GROUND_TRUTH_PATH, mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for index, row in enumerate(csv_reader):
        print("\r"+str(index), end="")
        if index == 0:
            # Header row: kept so the saved dataset can reuse the column names.
            columns = row
            continue
        if not row:
            # Blank line in the CSV; nothing to process.
            continue

        label = int(row[2])
        if label == 0:
            numberOfZeros += 1
        else:
            numberOfOnes += 1

        # Both sides go through the same helper, so each side's title and
        # embedding always come from its own record.
        left_id = _register_source(_source_path(row[0]))
        right_id = _register_source(_source_path(row[1]))

        if label == 1:
            positives.append([left_id, right_id, label])
        else:
            negatives.append([left_id, right_id, label])

111156

In [5]:
import os
import pickle

import pandas as pd

# Fixed seed so the sampled and shuffled dataset is the same on every run.
RANDOM_SEED = 42

total_number = len(positives) + len(negatives)
pos = pd.DataFrame(positives, columns=columns)
neg = pd.DataFrame(negatives, columns=columns)

# Down-sample the negatives. The fraction is 2 * positives / (all pairs), and
# it is applied to the negatives only.
# NOTE(review): this yields slightly fewer than 2x the positives; if an exact
# 2:1 ratio is wanted, sample n=2*len(pos) instead - confirm intent.
# The fraction is capped at 1.0 (sampling without replacement cannot exceed
# the population) and an empty dataset is handled without dividing by zero.
neg_fraction = min(1.0, 2 * (len(positives) / total_number)) if total_number else 0.0
neg = neg.sample(frac=neg_fraction, random_state=RANDOM_SEED).reset_index(drop=True)

print(len(pos))
print(len(neg))

# Merge both classes and shuffle the rows.
result = pd.concat([pos, neg], axis=0, ignore_index=True)
result = result.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
print(len(result))

SAVES_FOLDER = "saves/"
# Create the output folder if needed so the writes below work on a fresh checkout.
os.makedirs(SAVES_FOLDER, exist_ok=True)

result.to_csv(SAVES_FOLDER + 'dataset.csv', index=False)

with open(SAVES_FOLDER + 'id2embedding.pkl', 'wb') as f:
    pickle.dump(id2embedding, f)

with open(SAVES_FOLDER + 'id2title.pkl', 'wb') as f:
    pickle.dump(id2title, f)

1073
2125
3198


In [6]:
# Sanity check: report the length of the embedding stored under id 0.
embedding_dim = len(id2embedding[0])
print(embedding_dim)

768
