In [1]:
pip install transformers torch pandas

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m620.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.3-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp312-cp312-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp312-cp312-macosx_10_12_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.23.3-py3-none-any.whl (401 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.7/401.7 kB[0m [31m6.2 MB/s[0m eta 

##### Vectorizing Data Using a BERT-Based Model

In [2]:
import json
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

def _mean_pooled_embedding(tokens, tokenizer, model):
    """Return one text's embedding: the mean of the model's last hidden states.

    Args:
        tokens: One pre-split text (presumably a list of word strings, as
            implied by ``is_split_into_words=True`` -- confirm against the
            input file).
        tokenizer: Callable tokenizer returning the model's keyword inputs.
        model: Callable model whose output exposes ``last_hidden_state``.

    Returns:
        A NumPy array holding the averaged hidden-state vector.
    """
    inputs = tokenizer(
        tokens,
        return_tensors='pt',
        is_split_into_words=True,
        padding=True,
        truncation=True,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over the sequence axis (dim=1) to get a sentence-level vector.
    sentence_embedding = torch.mean(outputs.last_hidden_state, dim=1)
    # One text is encoded per call, so drop only the leading batch axis; a bare
    # squeeze() would also collapse any other axis that happened to be size 1.
    return sentence_embedding.squeeze(0).numpy()


def bert_vectorized(input_file_path, output_file_path,
                    model_name='bert-base-uncased', tokenizer=None, model=None):
    """Embed every tokenized text in a JSON file with BERT and save them as CSV.

    Each entry of the JSON document is embedded independently; the resulting
    vectors become the rows of the CSV written to ``output_file_path``
    (no index column).

    Args:
        input_file_path: Path of a JSON file containing a sequence of
            tokenized texts.
        output_file_path: Destination path of the CSV file.
        model_name: Checkpoint name passed to ``from_pretrained`` when
            ``tokenizer`` and/or ``model`` are not supplied.
        tokenizer: Optional already-loaded tokenizer, so repeated calls can
            share one instance instead of reloading it.
        model: Optional already-loaded model. It is switched to evaluation
            mode by this function.

    Returns:
        None. The result is the CSV file written to ``output_file_path``.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when reading it.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        tokenized_texts = json.load(file)

    # Load the pretrained pieces only when the caller did not provide them.
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(model_name)
    if model is None:
        model = BertModel.from_pretrained(model_name)

    # Ensure the model is in evaluation mode.
    model.eval()

    # One embedding per tokenized text, in input order.
    embeddings = [
        _mean_pooled_embedding(tokens, tokenizer, model)
        for tokens in tokenized_texts
    ]

    # Build the frame once from the full list and save it without the index.
    embedding_frame = pd.DataFrame(embeddings)
    embedding_frame.to_csv(output_file_path, index=False)

# Example run: read the tokenized JSON input and write the BERT vectors to CSV.
input_file_path, output_file_path = 'tokenized_300.json', 'vectorized_bert_300.csv'
bert_vectorized(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)


  from .autonotebook import tqdm as notebook_tqdm
