In [14]:
import torch

In [11]:
from transformers import AutoModel, AutoTokenizer

model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
with open('DP_Sample.json', 'r') as file:
    data = json.load(file)

tokenized_data = [tokenizer(item, truncation=True, padding=True, return_tensors="pt") for item in data]


In [15]:
embeddings = []

for item in tokenized_data:
    with torch.no_grad():
        outputs = model(**item)
        embeddings.append(outputs.last_hidden_state)


In [16]:
embeddings

[tensor([[[-0.4156,  0.1556,  0.0974,  ..., -0.3049, -0.0580,  0.2635],
          [ 0.0055,  0.4952,  0.5756,  ..., -0.8292, -0.4116, -0.4782],
          [ 0.8726,  0.0712, -0.2783,  ..., -0.1688, -0.8929, -0.3399]]]),
 tensor([[[-0.5760,  0.0350, -0.2270,  ..., -0.1851,  0.0940,  0.3281],
          [-0.4651, -0.4263, -0.2407,  ...,  0.1948,  0.1887,  0.1204],
          [ 0.8748,  0.1016, -0.4328,  ...,  0.1037, -0.5909, -0.2674]]]),
 tensor([[[-0.5507,  0.0664,  0.2064,  ...,  0.0382,  0.1508,  0.3129],
          [-0.5916, -0.6735,  0.1297,  ...,  0.2691,  0.5061,  0.3456],
          [ 0.9693,  0.0079, -0.2108,  ...,  0.1358, -1.0094, -0.1803]]]),
 tensor([[[-0.3529, -0.2390,  0.1322,  ...,  0.0110,  0.1277,  0.2494],
          [-0.3776, -1.1474, -0.0256,  ..., -0.0542, -0.0551,  0.3992],
          [ 0.7580,  0.0903, -0.2518,  ..., -0.0291, -0.9104, -0.3047]]]),
 tensor([[[-0.3530, -0.1299,  0.1074,  ..., -0.3348,  0.1649,  0.5878],
          [-0.0378, -0.6815, -0.1101,  ..., -0.4189,

In [17]:
item_index = 0  
embeddings_for_item = embeddings[item_index]

print("Shape of embeddings:", embeddings_for_item.shape)

print("Embeddings:", embeddings_for_item)


Shape of embeddings: torch.Size([1, 3, 768])
Embeddings: tensor([[[-0.4156,  0.1556,  0.0974,  ..., -0.3049, -0.0580,  0.2635],
         [ 0.0055,  0.4952,  0.5756,  ..., -0.8292, -0.4116, -0.4782],
         [ 0.8726,  0.0712, -0.2783,  ..., -0.1688, -0.8929, -0.3399]]])


In [25]:
with open('DP_Sample.json', 'r') as file:
    data = json.load(file)

text_fields = [
    "title",
    "description",
    "tags",  
    "document",
]

data_sources = data.get("data_sources", [])

for item in data_sources:
    text_fields.extend([item["title"], item["description"]])

embeddings_dict = {}

for field in text_fields:
    text = data.get(field, "")  

    input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(input_ids)
        embeddings = outputs.last_hidden_state[0]  

    embeddings_list = embeddings.tolist()

    embeddings_dict[field] = embeddings_list

with open("field_embeddings.json", "w") as file:
    json.dump(embeddings_dict, file)



In [38]:
def text2vec(text):
    
    
    return vec

In [47]:
import torch
import json

def calculate_field_medians(data, text_fields, tokenizer, model):
    embeddings_dict = {}

    for field in text_fields:
        text = data.get(field, "")

        input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")

        with torch.no_grad():
            outputs = model(input_ids)
            embeddings = outputs.last_hidden_state[0]

        median_embedding = torch.median(embeddings, dim=0).values.unsqueeze(0)
        embeddings_list = median_embedding.tolist()

        embeddings_dict[field] = embeddings_list

    return embeddings_dict


with open('DP_Sample.json', 'r') as file:
    data = json.load(file)

text_fields = [
    "title",
    "description",
    "tags",  
    "document",
]

data_sources = data.get("data_sources", [])

for item in data_sources:
    text_fields.extend([item["title"], item["description"]])

field_embeddings = calculate_field_medians(data, text_fields, tokenizer, model)

with open("field_medians.json", "w") as file:
    json.dump(field_embeddings, file)
