In [4]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.preprocessing import MinMaxScaler

# Load the input workbook into a DataFrame (hard-coded, machine-specific path)
file_path = r"C:\Users\SISTLA RAHUL\Desktop\MLPROJECT.xlsx"
data = pd.read_excel(file_path)

# Initialize the pretrained "gpt2" tokenizer and base model; both are used as
# module-level globals by get_gpt2_embedding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

# Reuse the EOS token as the pad token so that tokenizer calls which request
# padding (padding="max_length" in get_gpt2_embedding) do not raise an error
tokenizer.pad_token = tokenizer.eos_token

# Define a function to generate GPT-2 embeddings
def get_gpt2_embedding(text):
    """Return a mean-pooled GPT-2 embedding for ``text``.

    The text is tokenized (truncated/padded to 128 tokens), run through the
    module-level ``model`` without gradient tracking, and the final hidden
    states are averaged over the real (non-padding) tokens only.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding one value per hidden dimension of the model.
    """
    inputs = tokenizer(text, return_tensors="pt",
                       truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    # Pad positions still produce hidden states; a plain mean over all 128
    # positions would let them dominate the embedding of short inputs, so
    # weight every position by its attention-mask entry (1 = real, 0 = pad).
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    # clamp guards against division by zero when no real token is present
    token_count = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden_states * mask).sum(dim=1) / token_count

    # Drop the batch dimension for a single input, as the original did
    return pooled.squeeze(0).numpy()

# Generate one embedding per row and store it. Each row is serialised with
# str(row.values), so the text fed to the model is the array's string form.
data['GPT2_Embedding'] = data.apply(lambda row: get_gpt2_embedding(str(row.values)), axis=1)

# Expand the per-row vectors into a DataFrame with one column per dimension
embeddings_df = pd.DataFrame(data['GPT2_Embedding'].tolist())

# Scale every embedding dimension (column) to the range -1 to 1
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_embeddings = scaler.fit_transform(embeddings_df)

# Derive the column names from the actual embedding width instead of
# hard-coding 768, so a checkpoint with a different hidden size still works.
# Reusing data.index keeps the rows aligned in the concat below.
embedding_columns = [f'Embedding_{i+1}' for i in range(scaled_embeddings.shape[1])]
scaled_embeddings_df = pd.DataFrame(
    scaled_embeddings, columns=embedding_columns, index=data.index
)

# Concatenate the original DataFrame with the scaled embeddings
final_data = pd.concat([data, scaled_embeddings_df], axis=1)

# Save embeddings back to Excel
output_path = r"C:\Users\SISTLA RAHUL\Desktop\MLPROJECT_with_embeddings.xlsx"
final_data.to_excel(output_path, index=False)

print(f"Embeddings saved to {output_path}")




Embeddings saved to C:\Users\SISTLA RAHUL\Desktop\MLPROJECT_with_embeddings.xlsx
