In [1]:
import csv
import openai
import time
import random
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment before the key is read below.
load_dotenv()
# None when OPENAI_API_KEY is not set in the environment.
api_key = os.getenv('OPENAI_API_KEY')

# Initialize OpenAI client with API key (shared by every embedding request)
client = openai.OpenAI(api_key=api_key)

# Input CSV file; main() embeds the text found in its first column
input_csv = 'ADDRESSmegalist.csv'

# Output CSV file; main() overwrites it with the sampled rows plus embeddings
output_csv = 'smalloutput.csv'

# Define the batch size for embedding requests (texts sent per API call)
BATCH_SIZE = 20

# Function to embed a batch of texts using OpenAI's embedding model
def embed_text_batch(texts):
    """Return one embedding vector per text, in the same order as *texts*.

    Args:
        texts: A single string or an iterable of strings to embed with the
            "text-embedding-3-small" model. A bare string is treated as a
            batch of one.

    Returns:
        A list of embedding vectors, one per input text. An empty input
        yields an empty list without contacting the API.
    """
    # A bare string is one text, not an iterable of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # The embeddings endpoint rejects an empty input, so short-circuit here.
    if not texts:
        return []

    response = client.embeddings.create(
        input=texts,  # The texts to be embedded
        model="text-embedding-3-small"  # The model used for embedding
    )
    # Each returned item carries the index of the input it belongs to; sort on
    # it so the output lines up with *texts* regardless of response order.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

# Main function to process the CSV file and write results to a new CSV file
def main(sample_size=1000):
    """Embed a random sample of addresses and write them to the output CSV.

    Reads every row of ``input_csv``, draws up to *sample_size* rows at
    random, embeds the first column of each in batches of ``BATCH_SIZE`` and
    writes each sampled row followed by its embedding to ``output_csv``.
    Prints the elapsed processing time when done.

    Args:
        sample_size: Maximum number of rows to embed. Defaults to 1000.

    NOTE(review): every input row is treated as data. If the input file has
    a header row it can end up in the sample -- confirm against the data.
    NOTE(review): the output header assumes exactly two input columns --
    adjust it if the input has a different shape.
    """
    start_time = time.perf_counter()  # Monotonic clock for measuring durations

    # Read the CSV file. Blank lines come back from csv.reader as empty lists
    # (row[0] would raise IndexError), and the embeddings API rejects empty
    # strings, so rows without a usable address are skipped up front.
    with open(input_csv, newline='') as csvfile:
        rows = [row for row in csv.reader(csvfile) if row and row[0].strip()]

    # Draw the random sample directly instead of shuffling the whole list.
    sample = random.sample(rows, min(sample_size, len(rows)))

    # Embed the addresses (first column) one batch at a time.
    all_embeddings = []
    for start in range(0, len(sample), BATCH_SIZE):
        batch = [row[0] for row in sample[start:start + BATCH_SIZE]]
        all_embeddings.extend(embed_text_batch(batch))

    # Write each sampled row with its embedding appended as the last column.
    with open(output_csv, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Address", "Other Column", "Embedding"])  # Write the header (adjust columns as needed)
        csv_writer.writerows(
            row + [embedding] for row, embedding in zip(sample, all_embeddings)
        )

    elapsed = time.perf_counter() - start_time  # End benchmarking
    print(f"Processing time: {elapsed} seconds")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()  # Execute the main function


Processing time: 22.43201446533203 seconds
