In [4]:
from transformers import BertTokenizer, BertModel

# Single source of truth for the checkpoint name, so the tokenizer's
# vocabulary can never drift away from the weights that are loaded.
PRETRAINED_MODEL_NAME = 'bert-large-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
model = BertModel.from_pretrained(PRETRAINED_MODEL_NAME)

In [5]:
# Input locations; every file path is derived from the one dataset root below.
DATASET_PATH = "dataset2/"

# The two product catalogues.
AMAZON_PATH = f"{DATASET_PATH}Amazon.csv"
GOOGLE_PATH = f"{DATASET_PATH}GoogleProducts.csv"

# NOTE(review): "Amzon" is reproduced verbatim; presumably it mirrors the
# file name as shipped with the dataset -- confirm before "fixing" it.
GROUND_TRUTH_PATH = f"{DATASET_PATH}Amzon_GoogleProducts_perfectMapping.csv"

In [6]:
import csv

def entry2text(entry):
    """Flatten the fields of one record into a single string.

    Each field is prefixed with one space, so a non-empty result always
    starts with a space and an empty ``entry`` produces ``""``.
    """
    return "".join(" " + field for field in entry)

def load_dataset(path: str):
    """Read a CSV file into an ``{id: text}`` mapping.

    The first row is treated as the header. For each following row, the first
    cell is used as the record id and the remaining cells are flattened into
    one string with ``entry2text``. A running row counter is written to
    stdout as a progress indicator.

    Args:
        path: Location of the CSV file; it is decoded as ISO-8859-1.

    Returns:
        A ``(dataset, columns)`` tuple. ``dataset`` maps every record id to
        its flattened text; ``columns`` holds the header names without the
        leading id column and is empty when the file has no header row.
    """
    dataset = dict()
    # Bound up front so an empty file returns [] instead of raising
    # UnboundLocalError at the return statement.
    columns = []
    # newline='' hands newline handling to the csv module, which it needs in
    # order to parse quoted fields containing line breaks correctly.
    with open(path, mode='r', encoding='ISO-8859-1', newline='') as file:
        csv_reader = csv.reader(file)
        for index, row in enumerate(csv_reader):
            print("\r"+str(index), end="")
            if index == 0:
                columns = row[1:]
                continue
            if not row:
                # csv.reader yields [] for a blank line: there is no id cell.
                continue
            record_id = row[0]
            dataset[record_id] = entry2text(row[1:])
    # Terminate the "\r" progress line so later output starts on a fresh line.
    print()

    return dataset, columns

# Load both catalogues, Google first and Amazon second; each call yields an
# (id -> text mapping, header names) pair.
(google_id2entry, google_columns), (amazon_id2entry, amazon_columns) = (
    load_dataset(csv_path) for csv_path in (GOOGLE_PATH, AMAZON_PATH)
)

1363

In [7]:
def get_embedding(text: str, max_length: int = 64):
    """Embed ``text`` as the last-layer hidden state of its first token.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    ``max_length`` tokens) and passed through the module-level ``model``.

    Args:
        text: The string to embed.
        max_length: Maximum number of tokens kept by the tokenizer; longer
            inputs are truncated. Defaults to 64.

    Returns:
        A NumPy array holding the hidden state at position 0 of the single
        sequence in the batch (presumably the ``[CLS]`` token, as the
        original variable name suggests).
    """
    # Function-scope import so this block brings its own dependency along.
    import torch

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

    # Inference only: without no_grad() every call records an autograd graph
    # that is discarded immediately, wasting time and memory.
    with torch.no_grad():
        outputs = model(**inputs)

    # First index: the only sequence in the batch. Second index: its first
    # token position.
    cls_embedding = outputs.last_hidden_state[0, 0].detach().numpy()

    return cls_embedding

def compute_embeddings(id2entry: dict) -> dict:
    """Compute one embedding per record.

    Walks ``id2entry`` in its iteration order, writes a running counter to
    stdout before each record is embedded with ``get_embedding``, and ends
    the progress line with a newline.

    Args:
        id2entry: Mapping from record id to the text to embed.

    Returns:
        A dict with the same keys, each mapped to the value returned by
        ``get_embedding`` for that record's text.
    """
    embeddings = {}
    for position, (record_key, text) in enumerate(id2entry.items()):
        print("\r" + str(position), end="")
        embeddings[record_key] = get_embedding(text)
    print()
    return embeddings

In [8]:
# One model forward pass per Amazon record; the counter printed below is the
# index of the last record processed.
amazon_id2embedding = compute_embeddings(id2entry=amazon_id2entry)

1362


In [9]:
# One model forward pass per Google record; the counter printed below is the
# index of the last record processed.
google_id2embedding = compute_embeddings(id2entry=google_id2entry)

3225


In [10]:
import pickle

import os

SAVES_FOLDER = "saves2/"

# open(..., 'wb') does not create missing parent directories, so make sure
# the output folder exists before anything is written into it.
os.makedirs(SAVES_FOLDER, exist_ok=True)

# Per-source embedding tables.
with open(SAVES_FOLDER + 'amazon_id2embedding.pkl', 'wb') as f:
    pickle.dump(amazon_id2embedding, f)

with open(SAVES_FOLDER + 'google_id2embedding.pkl', 'wb') as f:
    pickle.dump(google_id2embedding, f)

# Combined lookup over both sources. Google entries are applied last, so they
# win if an id ever occurs in both tables. Dict unpacking is used instead of
# top-level `for id in ...` loops, which would rebind the builtin `id` for
# the rest of the kernel session.
id2embedding = {**amazon_id2embedding, **google_id2embedding}

with open(SAVES_FOLDER + 'id2embedding.pkl', 'wb') as f:
    pickle.dump(id2embedding, f)

In [11]:
import random

# Tunables for negative sampling.
NEGATIVES_PER_POSITIVE = 2
NEGATIVE_SAMPLING_SEED = 42

# Ground-truth matches. The first two cells of each row are stored as a pair;
# they are assumed to be (amazon_id, google_id), which is the order the
# sampled negatives are checked against further down.
positives = set()

# newline='' lets the csv module do its own newline handling.
with open(GROUND_TRUTH_PATH, mode='r', encoding='ISO-8859-1', newline='') as file:
    csv_reader = csv.reader(file)
    for index, row in enumerate(csv_reader):
        if index == 0:
            columns = row
            continue
        if not row:
            # csv.reader yields [] for blank lines; nothing to record.
            continue
        positives.add((row[0], row[1]))

negatives = set()
amazon_ids = list(amazon_id2entry.keys())
google_ids = list(google_id2entry.keys())

target_negatives = NEGATIVES_PER_POSITIVE * len(positives)

# Rejection sampling can only terminate if enough distinct non-matching
# pairs exist; check that up front rather than looping forever.
amazon_id_set = set(amazon_ids)
google_id_set = set(google_ids)
reachable_positives = sum(
    1 for left_id, right_id in positives
    if left_id in amazon_id_set and right_id in google_id_set
)
max_negatives = len(amazon_ids) * len(google_ids) - reachable_positives
if target_negatives > max_negatives:
    raise ValueError(
        "Cannot sample %d negative pairs: only %d distinct non-matching pairs exist"
        % (target_negatives, max_negatives)
    )

# Dedicated, seeded generator: the sampled negatives are the same on every
# run and the global `random` state is left untouched.
rng = random.Random(NEGATIVE_SAMPLING_SEED)

while len(negatives) < target_negatives:
    google_id = rng.choice(google_ids)
    amazon_id = rng.choice(amazon_ids)
    entry = (amazon_id, google_id)
    # Reject true matches and pairs that were already drawn.
    if entry in positives or entry in negatives:
        continue
    negatives.add(entry)
    print("\r"+str(len(negatives)), end="")

print()
print(len(positives))
print(len(negatives))

2600
1300
2600


In [12]:
import pandas as pd

# Attach labels to every pair: 1 for ground-truth matches first, then 0 for
# the sampled non-matches.
result = [(left_id, right_id, 1) for left_id, right_id in positives]
result.extend((left_id, right_id, 0) for left_id, right_id in negatives)

print(len(result))

3900


In [13]:
SHUFFLE_SEED = 42
PAIR_COLUMNS = ["left_spec_id", "right_spec_id", "label"]

result = pd.DataFrame(result, columns=PAIR_COLUMNS)
# The incoming row order derives from set iteration, which is not stable
# across kernel sessions. Sorting into a canonical order first and then
# shuffling with a fixed seed makes the exported CSV reproducible.
result = (
    result
    .sort_values(PAIR_COLUMNS)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
print(len(result))

result.to_csv(SAVES_FOLDER + 'dataset.csv', index=False)

3900
