In [4]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Check if a GPU is available and use it if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the Excel file
excel_file_path = r'C:\Users\SISTLA RAHUL\Desktop\MLPROJECT.xlsx'  # Update with your actual path
data = pd.read_excel(excel_file_path)

# Check the first few rows of the data
print(data.head())

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=False)
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to get BERT embeddings in batches
def get_bert_embeddings_batch(text_list, batch_size=8):
    all_embeddings = []
    
    # Process data in batches for efficiency
    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i+batch_size]
        
        # Tokenize the input texts in a batch
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        
        # Move inputs to the appropriate device (GPU or CPU)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        # Get the BERT embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Take the mean across all tokens for each text
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().tolist()  # Move to CPU and convert to list
        all_embeddings.extend(batch_embeddings)
    
    return all_embeddings

# Apply the function to the 'Equation' column in the dataset
equations = data['Equation'].astype(str).tolist()
embeddings = get_bert_embeddings_batch(equations, batch_size=16)  # Increase batch size for faster processing

# Convert the list of embeddings into a DataFrame (each embedding element goes into its own column)
embeddings_df = pd.DataFrame(embeddings, columns=[f'embedding_{i+1}' for i in range(len(embeddings[0]))])

# Concatenate the original data with the embeddings DataFrame
data_with_embeddings = pd.concat([data, embeddings_df], axis=1)

# Save the embeddings to a new Excel file
output_file_path = r'C:\Users\SISTLA RAHUL\Desktop\equation_with_embeddings.xlsx'  # Update with your output path
data_with_embeddings.to_excel(output_file_path, index=False)

# Check the first few rows with embeddings
print(data_with_embeddings.head())

print(f"Embeddings have been saved to {output_file_path}")


                                            Equation
0  C/O = 3/2\n C = 50\n 50/O = 3/2\n 100 = 3O\n O...
1  2. C/O = 3/2\n C = 60\n 60/O = 3/2\n 120 = 2O\...
2  3. C/O = 3/2\n C = 60\n 60/O = 3/2\n 120 = 4O\...
3  4. C/O = 3/2\n C = 70\n 70/O = 3/2\n 140 = 3O\...
4  5. C/O = 3/2\n C = 60\n 60/O = 3/2\n 120 = 5O\...
                                            Equation  embedding_1  \
0  C/O = 3/2\n C = 50\n 50/O = 3/2\n 100 = 3O\n O...     0.030083   
1  2. C/O = 3/2\n C = 60\n 60/O = 3/2\n 120 = 2O\...     0.299633   
2  3. C/O = 3/2\n C = 60\n 60/O = 3/2\n 120 = 4O\...     0.339740   
3  4. C/O = 3/2\n C = 70\n 70/O = 3/2\n 140 = 3O\...     0.281296   
4  5. C/O = 3/2\n C = 60\n 60/O = 3/2\n 120 = 5O\...     0.297376   

   embedding_2  embedding_3  embedding_4  embedding_5  embedding_6  \
0    -0.051380     0.595111    -0.092150     0.102375    -0.115757   
1    -0.176748     0.667872    -0.006421     0.106535    -0.224786   
2    -0.096238     0.754441    -0.024798     0.051905    