### **Loading Libraries**

In [1]:
import torch
from transformers import BertTokenizer, BertModel
import logging
# Configure the root logger at INFO so library messages at that level are shown.
logging.basicConfig(level="INFO")

  from .autonotebook import tqdm as notebook_tqdm


### **Loading the Model**

In [3]:
# Checkpoint name passed to both the tokenizer and the model loader.
model_name = "bert-base-uncased"

# Load the tokenizer and the model from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Select CUDA when torch reports it as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


### **Comparison Function**

### **Looping Mechanism**

In [4]:
# Imports for data handling
import pandas as pd
import numpy as np
import os

# Read the sample training CSV into a DataFrame and print its first five rows as text.
df = pd.read_csv(filepath_or_buffer='multi_news/sample_train.csv')
print(df.head(n=5))

                                           documents  num_documents  \
0  ['A Ware Police patrol car passes the yellow h...              4   
1  ['Significance \n \n A large body of medical r...              5   
2  ['A Syrian warplane has destroyed a petrol sta...              4   
3  ['A political cartoon, published in a newspape...              7   
4  ['Megyn Kelly takes issue with \'NYT\' book re...              4   

                                             summary  
0  – A defrocked priest at the heart of Boston's ...  
1  – Women suffering from a heart attack seem to ...  
2  – A new UN analysis finds that at least 60,000...  
3  – The first government shutdown since the Clin...  
4  – Megyn Kelly's memoir is out next week, and s...  


In [4]:
# Toy inputs of different lengths used to exercise the encoding loop.
texts = ["Example text 1", "Another example text with a longer sequence.", "Short text"]

# Place the model on the selected device and switch it to evaluation mode.
model.to(device).eval()

# Each input is truncated or padded to exactly this many tokens (adjust as needed).
max_token_length = 20

# Maps each input string to the embeddings produced for it.
encoded_texts = dict()

for text in texts:
    # Tokenize to a fixed length, then move the resulting tensors to the model's device.
    tokenized_text = tokenizer(
        text,
        max_length=max_token_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # Forward pass with gradient tracking disabled.
    with torch.no_grad():
        outputs = model(**tokenized_text)

    # Keep the model's last hidden state as this text's embeddings.
    embeddings = outputs.last_hidden_state
    encoded_texts[text] = embeddings

In [5]:
# Show the text -> embeddings mapping as plain text (large tensors print in abbreviated form).
print(str(encoded_texts))

{'Example text 1': tensor([[[-0.3035,  0.0985, -0.1054,  ..., -0.3088,  0.1413,  0.6364],
         [-0.3649,  0.3467, -0.8662,  ..., -0.1882,  0.8155,  0.0166],
         [-0.0537,  0.2979,  0.3368,  ..., -0.3639,  0.1457,  0.4011],
         ...,
         [-0.5190, -0.5021,  0.1325,  ..., -0.0988,  0.2699,  0.1380],
         [-0.3855, -0.3380,  0.2266,  ..., -0.1350,  0.2796,  0.0815],
         [-0.5652, -0.4774,  0.0059,  ..., -0.0934,  0.2305,  0.0593]]],
       device='cuda:0'), 'Another example text with a longer sequence.': tensor([[[-0.3991, -0.2916, -0.1429,  ..., -0.4051,  0.2791,  0.7713],
         [-0.4444, -1.0624, -0.3955,  ..., -0.1943,  1.1572,  0.2617],
         [-0.7537,  0.1376, -0.3634,  ..., -0.6728,  0.3218,  0.1036],
         ...,
         [-0.0665, -0.2493,  0.0926,  ..., -0.2028,  0.1874,  0.2085],
         [-0.1050, -0.3938,  0.0872,  ..., -0.0827,  0.2565,  0.2451],
         [-0.7000, -1.0117, -0.4994,  ...,  0.4737,  0.4898,  0.1619]]],
       device='cuda:0'),