In [2]:
import pandas as pd

# Load the clustered-conversations table, then discard every column that holds
# no data at all (all entries NaN), reporting which ones were removed.
df = pd.read_csv("selected_conversations_with_topics_embedded_clustered.csv")
all_na_mask = df.isna().all()  # one boolean per column: True when every entry is NaN
dropped_columns = df.columns[all_na_mask].tolist()
if dropped_columns:
    print(f"Dropping columns with all NA values: {dropped_columns}")
df = df.dropna(axis=1, how="all")

Dropping columns with all NA values: ['Subcluster_Name', 'Subcluster_Description', 'Subcluster_Description_Embedding', 'Cluster', 'Conversation_Embedding', 'L2_cluster_description_embedding']


In [3]:
from openai import AsyncOpenAI
import os
import asyncio
import json
import logging
from tqdm import tqdm
from dotenv import load_dotenv

from openAI_topic_embedding import (
    BackoffConfig,
    _normalise_topics,
    _embed_batch,
    _create_client,
)
# Tunable settings for the embedding run.
model = "text-embedding-3-large"
batch_size = 128  # number of descriptions sent per embedding request
concurrency = 8  # upper bound on embedding requests in flight at once

# Read variables from a local .env file into the process environment.
# This must happen before the client is created below.
load_dotenv()

# Timestamped, INFO-level log lines for the rest of the notebook.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)

# Client object passed to `_embed_batch` for every request made in this cell.
client = _create_client()
# Caps concurrent requests; max(1, ...) guarantees at least one slot even if
# `concurrency` is set to 0 or a negative value.
semaphore = asyncio.Semaphore(max(1, concurrency))
# Default settings object forwarded to `_embed_batch`
# (presumably retry/backoff parameters -- see openAI_topic_embedding.py).
config = BackoffConfig()

# We then want to take df["L2_cluster_description"] and embed it using the same logic in `openAI_topic_embedding.py`.
raw_descriptions = df["L2_cluster_description"].fillna("").astype(str).tolist() # Converting all NaN values to empty strings (normalizing data, basically, before insertion).
descriptions = [topic.strip() for topic in raw_descriptions if topic and topic.strip()] # Gets rid of NaN values and strips whitespace from non-NaN values.
print(f"Found {len(descriptions)} unique L2 cluster descriptions to embed.")
print(f"Here's the first 5: {descriptions[:5]}")

# Create batches of size `batch_size`. Number of batches is `len(descriptions) // batch_size`.
batches = [
    descriptions[i : i + batch_size]
    for i in range(0, len(descriptions), batch_size)
]

# Maps each description to its generated embedding.
embeddings: dict[str, list[float]] = {}

with tqdm(
    total=len(descriptions),
    desc="Embedding L2 cluster descriptions",
    unit="description",
    ncols=100,
    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
) as progress:
    async def process_batch(batch_idx: int, batch: list[str]) -> None:
        async with semaphore:
            try:
                vectors = await _embed_batch(client, batch, model, config)
                for description, vector in zip(batch, vectors):
                    embeddings[description] = vector
                progress.update(len(batch))
                progress.set_postfix(
                    {
                        "batch": f"{batch_idx + 1}/{len(batches)}",
                        "size": len(batch),
                    }
                )
            except Exception as exc:
                logging.error(f"Batch {batch_idx + 1} failed: {exc}")
                raise

    tasks = [
        asyncio.create_task(process_batch(idx, batch))
        for idx, batch in enumerate(batches)
    ]
    await asyncio.gather(*tasks)

print("Writing embeddings to DataFrame...")
serialised: list[str] = []
with tqdm(
    total=len(descriptions),
    desc="Serializing embeddings",
    unit="row",
    ncols=100,
    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
) as pbar:
    for description in descriptions:
        key = description.strip()
        vector = embeddings.get(key) if key else None
        serialised.append(json.dumps(vector) if vector else "null")
        pbar.update(1)

print(f"Done serializing embeddings. The first 5 are: {serialised[:5]}")

OpenAI API key loaded
Found 1000 unique L2 cluster descriptions to embed.
Here's the first 5: ['This cluster represents user-generated content focused on speculative storytelling and character development across various fictional universes. It encompasses creative explorations of alternative scenarios, character relationships, and world-building elements that diverge from established narratives. The content spans multiple genres and media formats, often involving detailed descriptions of hypothetical situations, character transformations, and immersive fictional scenarios.', "This cluster represents comprehensive user interactions focused on evaluating and leveraging AI assistant capabilities across diverse domains. Users systematically test the AI's boundaries through ethical challenges, technical queries, creative tasks, and domain-specific requests while the assistant maintains consistent operational standards. The interactions demonstrate users' exploration of the AI's knowledge br

Embedding L2 cluster descriptions:   0%|                          | 0/1000 [00:00<?, ?description/s]2025-11-05 23:05:38,006 INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-11-05 23:05:38,067 INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-11-05 23:05:38,083 INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding L2 cluster descriptions:  13%|█▉             | 128/1000 [00:00<00:05, 168.84description/s]2025-11-05 23:05:38,190 INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-11-05 23:05:38,257 INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding L2 cluster descriptions:  23%|███▍           | 232/1000 [00:00<00:02, 300.16description/s]2025-11-05 23:05:38,409 INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding L2 cluster descriptions:  36%|█████▍         | 360/1000 [00:01<00:01, 45

Writing embeddings to DataFrame...


Serializing embeddings: 100%|█████████████████████████████████| 1000/1000 [00:01<00:00, 664.49row/s]

Done serializing embeddings. The first 5 are: ['[-0.01923833228647709, 0.01746535860002041, -0.038258932530879974, -0.029689554125070572, 0.0022764841560274363, 0.013631688430905342, 0.0021054076496511698, -0.00954918097704649, 0.02491496317088604, 0.01870955154299736, 0.058010492473840714, 0.01620561257004738, -0.017791958525776863, -0.028507569804787636, 0.012675215490162373, -0.009518075734376907, -0.006870277691632509, 0.01730983331799507, 0.0185695793479681, -0.028492016717791557, 0.031664710491895676, 0.010902239941060543, -0.05676630139350891, -0.005622196476906538, 0.029767315834760666, 0.01517137698829174, -0.0024456167593598366, -0.012410824187099934, -0.06208522617816925, 0.021291250362992287, 0.017232071608304977, -0.01592567004263401, 0.06364046782255173, -0.02552150748670101, -0.012488585896790028, -0.02972065843641758, 0.003481796011328697, 0.04643949866294861, 0.023561902344226837, -0.05502443015575409, 0.04239587485790253, -0.0043702274560928345, -0.0008286519441753626




In [5]:
# Attach the serialised embeddings as a new column, show the first row's value
# as a sanity check, then persist the table.
embedding_column = "L2_cluster_description_embedding"
output_path = "selected_conversations_with_topics_embedded_clustered.csv"

df[embedding_column] = serialised
print(df[embedding_column].iloc[0])

# NOTE: this overwrites the same CSV that was loaded at the top of the notebook.
df.to_csv(output_path, index=False)
print("Done!")

[-0.01923833228647709, 0.01746535860002041, -0.038258932530879974, -0.029689554125070572, 0.0022764841560274363, 0.013631688430905342, 0.0021054076496511698, -0.00954918097704649, 0.02491496317088604, 0.01870955154299736, 0.058010492473840714, 0.01620561257004738, -0.017791958525776863, -0.028507569804787636, 0.012675215490162373, -0.009518075734376907, -0.006870277691632509, 0.01730983331799507, 0.0185695793479681, -0.028492016717791557, 0.031664710491895676, 0.010902239941060543, -0.05676630139350891, -0.005622196476906538, 0.029767315834760666, 0.01517137698829174, -0.0024456167593598366, -0.012410824187099934, -0.06208522617816925, 0.021291250362992287, 0.017232071608304977, -0.01592567004263401, 0.06364046782255173, -0.02552150748670101, -0.012488585896790028, -0.02972065843641758, 0.003481796011328697, 0.04643949866294861, 0.023561902344226837, -0.05502443015575409, 0.04239587485790253, -0.0043702274560928345, -0.0008286519441753626, 0.02188224345445633, -0.003858942072838545, -0