# In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model
from sentence_transformers import SentenceTransformer

# Build sentence-embedding features for the 'Equation' column of an Excel
# sheet and write them, together with the 'output' column, to a new workbook.

# Read training data
training_data = pd.read_excel("/content/testing.xlsx")

# Initialize GPT-2 tokenizer and model.
# `output_hidden_states` is a model-config option, so it is passed to the
# model loader rather than to the tokenizer, where it has no effect on
# tokenization.
# NOTE(review): neither `tokenizer` nor `model` is used by the steps below;
# drop these two loads if nothing later in the file needs them.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained("gpt2", output_hidden_states=True)

# Initialize sentence transformer model
sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Preprocess 'Equation' column.
# Missing cells are filled first and every value is coerced to text, so the
# `.str` accessor works even when the column was read as all-numeric or
# all-empty, and non-string cells are not silently turned into NaN.
# Line breaks inside a cell are replaced literally (no regex) by a space.
training_data['Equation'] = (
    training_data['Equation']
    .fillna('')
    .astype(str)
    .str.replace('\n', ' ', regex=False)
)

# Keep only the rows that actually contain an equation; `.copy()` gives an
# independent frame so the column assignment below does not hit a view.
has_equation = training_data['Equation'] != ''
training_data = training_data[has_equation].copy()

# Get embeddings for all equations in one batched call instead of one model
# call per row. The guard keeps the empty case from reaching the encoder.
equations = training_data['Equation'].tolist()
embeddings = sentence_model.encode(equations) if equations else []
training_data['embeddings'] = list(embeddings)

# Create a DataFrame from the embeddings: one 'embed_<i>' column per dimension
math1 = pd.DataFrame(
    training_data['embeddings'].tolist(),
    index=training_data.index,
).add_prefix('embed_')

# Add the 'output' column (positional copy; rows line up with the filtered frame)
math1['output'] = training_data['output'].values

# Save the results to an Excel file
# NOTE(review): the file name mentions gpt2, but the embeddings written here
# come from the sentence-transformer model - confirm the name is intended.
training_output_file = "testing_gpt2_reordered.xlsx"
math1.to_excel(training_output_file, index=False)
print(math1)