In [1]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

In [2]:
# Path of the source Excel workbook (edit this to point at the actual file)
url = 'Book.xlsx'
# Read only the worksheet named 'Sheet1' into a DataFrame
sheet1_data = pd.read_excel(io=url, sheet_name='Sheet1')

In [3]:
# Tokenizer and encoder are both loaded from the same pre-trained checkpoint,
# so the checkpoint name is written once and shared.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding of ``text`` as a flat NumPy array.

    The text is tokenized with the notebook-level ``tokenizer`` (truncation
    enabled, PyTorch tensors returned) and passed through the notebook-level
    ``model``. The final-layer hidden states are averaged over dimension 1
    (the token axis) and the result is flattened to one dimension.

    Args:
        text: The text to embed.

    Returns:
        numpy.ndarray: The flattened mean of ``last_hidden_state`` over dim 1.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable gradient tracking so no autograd graph is built
    # and retained for every call. The numeric output is unchanged.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()


In [4]:
# Embed every cell of the 'Input' column, one row at a time.
# Each cell is passed through str() first, so non-string values (including
# missing cells) are embedded via their string representation.
input_column = sheet1_data['Input']
bert_input_embeddings = input_column.apply(
    lambda text: get_bert_embedding(str(text))
)

In [5]:
# Build a table with one row per input and one column per embedding component.
# The width is read from the first embedding; columns are named embed_0, embed_1, ...
embedding_width = bert_input_embeddings.iloc[0].size
embedding_columns = [f'embed_{i}' for i in range(embedding_width)]
embeddings_df = pd.DataFrame(
    bert_input_embeddings.tolist(),
    columns=embedding_columns,
)

In [6]:
# Attach the original sheet's 'Output' values next to the embeddings
# (stored under the lowercase column name 'output').
embeddings_df['output'] = sheet1_data['Output']

# Write the combined table to a new workbook using the xlsxwriter engine;
# the context manager saves and closes the file on exit.
output_file = 'bert_embeddings_input_output_custom_columns.xlsx'
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
    embeddings_df.to_excel(
        writer,
        index=False,
        sheet_name='BERT Embeddings',
    )

# Confirm where the results were written
print(f'BERT embeddings saved to {output_file}')

BERT embeddings saved to bert_embeddings_input_output_custom_columns.xlsx
