In [7]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load the dataset from the CSV file
url = 'Q3.csv'
data = pd.read_csv(url)  # reads 'Q3.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'gpt2' tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Ensure the tokenizer uses padding tokens, as GPT-2 does not have one by default
# (the end-of-sequence token is reused as the pad token)
tokenizer.pad_token = tokenizer.eos_token

# Function to compute GPT-2 embeddings for text
def get_gpt2_embedding(text):
    """Return a mean-pooled GPT-2 embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
gpt2_input_embeddings = data['Input'].apply(
    lambda raw_text: get_gpt2_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(gpt2_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(gpt2_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'Q3_gpt2.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'GPT-2 embeddings saved to {output_file}')


GPT-2 embeddings saved to Q3_gpt2.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

# Load the dataset from the CSV file
url = 'Q3.csv'
data = pd.read_csv(url)  # reads 'Q3.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'roberta-base' tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Function to compute RoBERTa embeddings for text
def get_roberta_embedding(text):
    """Return a mean-pooled RoBERTa embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
roberta_input_embeddings = data['Input'].apply(
    lambda raw_text: get_roberta_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(roberta_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(roberta_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'Q3_roberta.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'RoBERTa embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load the dataset from the CSV file
url = 'Q3.csv'
data = pd.read_csv(url)  # reads 'Q3.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'bert-base-uncased' tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to compute BERT embeddings for text
def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
bert_input_embeddings = data['Input'].apply(
    lambda raw_text: get_bert_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(bert_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(bert_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'Q3_bert.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'BERT embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load the dataset from the CSV file
url = 'Q2.csv'
data = pd.read_csv(url)  # reads 'Q2.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'gpt2' tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Ensure the tokenizer uses padding tokens, as GPT-2 does not have one by default
# (the end-of-sequence token is reused as the pad token)
tokenizer.pad_token = tokenizer.eos_token

# Function to compute GPT-2 embeddings for text
def get_gpt2_embedding(text):
    """Return a mean-pooled GPT-2 embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
gpt2_input_embeddings = data['Input'].apply(
    lambda raw_text: get_gpt2_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(gpt2_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(gpt2_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'Q2_gpt2.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'GPT-2 embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

# Load the dataset from the CSV file
url = 'Q2.csv'
data = pd.read_csv(url)  # reads 'Q2.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'roberta-base' tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Function to compute RoBERTa embeddings for text
def get_roberta_embedding(text):
    """Return a mean-pooled RoBERTa embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
roberta_input_embeddings = data['Input'].apply(
    lambda raw_text: get_roberta_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(roberta_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(roberta_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'Q2_roberta.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'RoBERTa embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load the dataset from the CSV file
url = 'Q2.csv'
data = pd.read_csv(url)  # reads 'Q2.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'bert-base-uncased' tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to compute BERT embeddings for text
def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
bert_input_embeddings = data['Input'].apply(
    lambda raw_text: get_bert_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(bert_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(bert_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'Q2_bert.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'BERT embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load the dataset from the CSV file
url = 'Q1.csv'
data = pd.read_csv(url)  # reads 'Q1.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'gpt2' tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Ensure the tokenizer uses padding tokens, as GPT-2 does not have one by default
# (the end-of-sequence token is reused as the pad token)
tokenizer.pad_token = tokenizer.eos_token

# Function to compute GPT-2 embeddings for text
def get_gpt2_embedding(text):
    """Return a mean-pooled GPT-2 embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
gpt2_input_embeddings = data['Input'].apply(
    lambda raw_text: get_gpt2_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(gpt2_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(gpt2_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'Q1_gpt2.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'GPT-2 embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

# Load the dataset from the CSV file
url = 'Q1.csv'
data = pd.read_csv(url)  # reads 'Q1.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'roberta-base' tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Function to compute RoBERTa embeddings for text
def get_roberta_embedding(text):
    """Return a mean-pooled RoBERTa embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Extract RoBERTa embeddings for the 'Input' column (each value is coerced to str first)
roberta_input_embeddings = data['Input'].apply(lambda text: get_roberta_embedding(str(text)))

# Convert the list of embeddings into a DataFrame and name the columns;
# the column count is taken from the first embedding vector
embeddings_df = pd.DataFrame(
    roberta_input_embeddings.tolist(),
    columns=[f'embed_{i}' for i in range(roberta_input_embeddings.iloc[0].size)],
)

# Add the 'Output' column from the original DataFrame
embeddings_df['output'] = data['Output']

# Export embeddings to CSV.
# The target must be a model-specific file: writing to 'Q1.csv' would overwrite
# the input dataset this cell just read, and any later read of 'Q1.csv' would
# then find embedding columns instead of the 'Input' column.
output_file = 'Q1_roberta.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'RoBERTa embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load the dataset from the CSV file
url = 'Q1.csv'
data = pd.read_csv(url)  # reads 'Q1.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'bert-base-uncased' tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to compute BERT embeddings for text
def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
bert_input_embeddings = data['Input'].apply(
    lambda raw_text: get_bert_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(bert_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(bert_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'Q1_bert.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'BERT embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load the dataset from the CSV file
url = 'test_dataset_2.csv'
data = pd.read_csv(url)  # reads 'test_dataset_2.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'bert-base-uncased' tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to compute BERT embeddings for text
def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
bert_input_embeddings = data['Input'].apply(
    lambda raw_text: get_bert_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(bert_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(bert_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'bert_test_2.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'BERT embeddings saved to {output_file}')


In [None]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load the dataset from the CSV file
url = 'test_dataset_2.csv'
data = pd.read_csv(url)  # reads 'test_dataset_2.csv' (a relative file path, despite the name `url`)

# Load the pre-trained 'gpt2' tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Ensure the tokenizer uses padding tokens, as GPT-2 does not have one by default
# (the end-of-sequence token is reused as the pad token)
tokenizer.pad_token = tokenizer.eos_token

# Function to compute GPT-2 embeddings for text
def get_gpt2_embedding(text):
    """Return a mean-pooled GPT-2 embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model``, and the final-layer
    hidden states are averaged over the token axis.

    Args:
        text: Input handed straight to ``tokenizer``; the caller in this
            cell passes a single string.

    Returns:
        A flattened 1-D NumPy array holding the averaged hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept in memory for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average token embeddings for simplicity
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

# Embed every row of the 'Input' column (each value is coerced to str first)
gpt2_input_embeddings = data['Input'].apply(
    lambda raw_text: get_gpt2_embedding(str(raw_text))
)

# One column per embedding dimension (width taken from the first vector),
# with the original 'Output' column attached as 'output'
embeddings_df = pd.DataFrame(
    list(gpt2_input_embeddings),
    columns=[f'embed_{i}' for i in range(len(gpt2_input_embeddings.iloc[0]))],
).assign(output=data['Output'])

# Write the table to CSV without the index column
output_file = 'gpt2_test_2.csv'
embeddings_df.to_csv(output_file, index=False)

print(f'GPT-2 embeddings saved to {output_file}')
