In [None]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

# Pick the compute device once: the GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using {device} device")

In [None]:
# Function to generate BERT embeddings
def generate_bert_embeddings(text_batch, model, tokenizer, max_length=512):
    """Embed a batch of texts as the model's hidden state at sequence position 0.

    Args:
        text_batch: Texts to embed; handed to ``tokenizer`` unchanged.
        model: Torch module that is called as ``model(**inputs)`` and whose output
            exposes ``last_hidden_state``. The tokenized inputs are moved to the
            device of its first parameter.
        tokenizer: Callable accepting the batch plus ``return_tensors``,
            ``padding``, ``truncation`` and ``max_length``; its result must
            support ``.to(device)`` and keyword-unpacking into ``model``.
        max_length: Token limit used for truncation. Defaults to 512, the value
            that used to be hard-coded.

    Returns:
        NumPy array built from ``last_hidden_state[:, 0, :]``, i.e. one row per
        input text (with a BERT tokenizer, position 0 is typically the ``[CLS]``
        token).
    """
    # Follow the model's own placement rather than a notebook-level global, so the
    # inputs can never land on a different device than the weights.
    model_device = next(model.parameters()).device
    inputs = tokenizer(
        text_batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(model_device)
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        outputs = model(**inputs)
    first_token_state = outputs.last_hidden_state[:, 0, :]
    return first_token_state.detach().cpu().numpy()


In [None]:
# Fetch the pre-trained 'bert-base-uncased' tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)  # place the weights on the selected device

# No fine-tuning happens here: the pre-trained weights are only used to produce
# embeddings, so switch the model to evaluation (inference) mode.
model.eval()


In [None]:
# Load your dataset
df = pd.read_csv(r'/path/to/your/dataset.csv')

# read_csv yields float NaN for empty `text` cells, which the tokenizer rejects.
# Drop those rows before splitting into columns so texts and labels stay aligned.
df_valid = df.dropna(subset=['text'])
n_dropped = len(df) - len(df_valid)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing text")

# Cast to str so numeric-looking text cells still reach the tokenizer as strings.
texts = df_valid['text'].astype(str).tolist()
labels = df_valid['label'].tolist()

In [None]:
batch_size = 16
embeddings = []
# Walk the texts in consecutive slices of `batch_size`; the last slice may be shorter.
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]
    batch_embeddings = generate_bert_embeddings(batch_texts, model, tokenizer)
    # Append the items of the returned batch one by one to the running list.
    embeddings += list(batch_embeddings)
    # Release cached GPU memory after every batch to avoid memory issues.
    torch.cuda.empty_cache()

In [None]:
# Build one table from the collected embeddings, append the labels as a trailing
# 'label' column, and write it out as CSV without the index.
embeddings_df = pd.DataFrame(embeddings).assign(label=labels)
embeddings_df.to_csv('/dataset/name.csv', index=False)