In [None]:
import os
import pandas as pd

# Convert every tab-separated "<label>\t<text>" file in the input directory into a
# two-column CSV (label, text), keeping only rows whose label is in `allowed_labels`.

# Set the paths for the input directory containing text files and the output directory for saving DataFrames
input_directory = 'DivSumm/input_docs'
output_directory = 'input_docs/docs'

# Labels to keep; rows with any other label are dropped
allowed_labels = ['Hisp', 'White', 'AA']

# DataFrame.to_csv does not create missing parent directories, so make sure the target exists
os.makedirs(output_directory, exist_ok=True)

# Get a list of all text files in the input directory (sorted: os.listdir order is arbitrary)
file_list = sorted(name for name in os.listdir(input_directory) if name.endswith('.txt'))

# Process each text file and save the resulting DataFrame
for file in file_list:
    filename = os.path.splitext(file)[0]  # Remove the file extension
    input_file_path = os.path.join(input_directory, file)

    # Parse the file line by line. The file object gets its own name so it does not
    # clobber the loop variable `file`.
    rows = []
    skipped = 0
    with open(input_file_path, 'r', encoding='latin-1') as handle:
        for line in handle:
            # Split on the FIRST tab only, so tabs inside the text do not create extra columns
            fields = line.strip().split('\t', 1)
            if len(fields) == 2:
                rows.append(fields)
            elif fields[0]:
                # Non-blank line without a "<label>\t<text>" shape (e.g. a label with no text)
                skipped += 1

    # Passing the column names here keeps the frame well-formed even when `rows` is empty
    df = pd.DataFrame(rows, columns=['label', 'text'])

    # Filter rows based on the 'label' column values and renumber the index from 0
    df = df[df['label'].isin(allowed_labels)].reset_index(drop=True)

    # Construct the output file path
    output_file_path = os.path.join(output_directory, f'{filename}.csv')

    # Save the DataFrame as a CSV file
    df.to_csv(output_file_path, index=False)

    # Print the status
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in '{filename}'")
    print(f"Processed '{filename}' and saved as '{output_file_path}'")

In [None]:
import os
from transformers import BertModel, BertTokenizer
import pandas as pd
import torch

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

# Load the pretrained tokenizer and BERT encoder for the chosen checkpoint,
# placing the encoder on the selected device
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# For every CSV in the input directory (columns: label, text), compute the BERT pooled
# output for each text and write a CSV with the numeric label followed by one
# `embedding_<k>` column per embedding dimension.

# Set the paths for the input directory containing CSV files and the output directory for saving DataFrames
input_directory = 'input_docs/docs'
output_directory = 'input_docs/embedding'

# Number of texts sent through BERT per forward pass. Encoding a whole file in one
# batch makes memory use grow with file size; lower this if the device runs out of memory.
batch_size = 32

# DataFrame.to_csv does not create missing parent directories, so make sure the target exists
os.makedirs(output_directory, exist_ok=True)

# Get a list of all CSV files in the input directory (sorted: os.listdir order is arbitrary)
file_list = sorted(name for name in os.listdir(input_directory) if name.endswith('.csv'))

# Process each CSV file and save the resulting DataFrame
for file in file_list:
    filename = os.path.splitext(file)[0]  # Remove the file extension
    # Construct the input file path
    input_file_path = os.path.join(input_directory, file)

    # Read the CSV file into a DataFrame
    input_data = pd.read_csv(input_file_path)

    # The tokenizer cannot encode an empty list, so a header-only file is skipped explicitly
    if input_data.empty:
        print(f"Skipped '{filename}': no rows to embed")
        continue

    # Replace labels with numeric values
    input_data['label'] = input_data['label'].replace({'White': 0, 'Hisp': 1, 'AA': 2})

    # read_csv turns empty cells into NaN and may parse purely numeric text as numbers;
    # the tokenizer only accepts strings. Rows are kept (not dropped) so that output row i
    # still corresponds to input row i.
    texts = input_data['text'].fillna('').astype(str).tolist()

    # Obtain BERT embeddings batch by batch
    batch_embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # Each batch is padded to its own longest sequence; padded positions are masked
            # out, so results should match whole-file encoding up to floating-point rounding
            tokenized_inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors='pt',
            ).to(device)
            outputs = model(**tokenized_inputs)
            # Move each batch to the CPU right away so device memory is released between batches
            batch_embeddings.append(outputs.pooler_output.cpu())
    embeddings = torch.cat(batch_embeddings, dim=0)

    # Create the DataFrame in one shot instead of one column at a time.
    # Widening to float64 keeps the written values identical to what `.tolist()` produced.
    embedding_columns = [f'embedding_{i+1}' for i in range(embeddings.shape[1])]
    df = pd.DataFrame(embeddings.double().numpy(), columns=embedding_columns)
    df.insert(0, 'label', input_data['label'].to_numpy())

    # Construct the output file path
    output_file_path = os.path.join(output_directory, f'{filename}.csv')

    # Save the DataFrame as a CSV file
    df.to_csv(output_file_path, index=False)

    # Print the status
    print(f"Processed '{filename}' and saved as '{output_file_path}'")