In [4]:
from pinecone import Pinecone
import os
from dotenv import load_dotenv

load_dotenv()

pc_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pc_api_key)
pc_index_url = os.environ.get("PINECONE_INDEX_URL")
pc_index_name = "openclio"
pc_index = pc.Index(name=pc_index_name, host=pc_index_url) # Initializing the Pinecone index with host.

pc.describe_index("openclio") # Describing the Pinecone index, to ensure that it was loaded in correctly.

# Clear all existing vectors from the index to start fresh

print("Clearing all existing vectors from Pinecone index...")
pc_index.delete(delete_all=True)
print("All vectors cleared. Starting fresh.")

Clearing all existing vectors from Pinecone index...
All vectors cleared. Starting fresh.


In [5]:
import pandas as pd

# Load in the CSV `selected_conversations_with_topics_embedded_clustered.csv` as a dataframe.
df = pd.read_csv("selected_conversations_with_topics_embedded_clustered.csv")

# Pre-processing the entire df to replace all NaNs with empty strings.
df = df.fillna("")

# Write a basic test/log to ensure that no NaNs are present in the df.
print(df.isna().sum())

Model                               0
Conversation                        0
Language                            0
Toxic                               0
State                               0
Country                             0
Hour of Day                         0
Topic_Embedding                     0
Topic                               0
L0_cluster_id                       0
L0_cluster_label                    0
L0_cluster_description              0
L0_cluster_description_embedding    0
L1_cluster_id                       0
L1_cluster_label                    0
L1_cluster_description              0
L1_cluster_description_embedding    0
L2_cluster_id                       0
L2_cluster_label                    0
L2_cluster_description              0
L2_cluster_description_embedding    0
dtype: int64


In [6]:
## Processing L2 clusters (highest-level clusters extracted from the trace data). 
import ast
from tqdm import tqdm

# First, calculate trace counts for each L2 cluster
unique_l2_clusters = (
    df.groupby(["L2_cluster_id", "L2_cluster_label", "L2_cluster_description", "L2_cluster_description_embedding"])
    .size()
    .reset_index(name="L2_cluster_trace_count")
)

# Printing the first few unique clusters.
print(unique_l2_clusters.head())

# Initializing L2 cluster metadata before upserting.
for idx, row in tqdm(unique_l2_clusters.iterrows(), total=len(unique_l2_clusters), desc="Upserting L2 clusters"):
    l2_cluster_metadata = {
        "type": "l2_cluster",
        "name": row["L2_cluster_label"],
        "description": row["L2_cluster_description"],
        "trace_count": row["L2_cluster_trace_count"],
    }
    values = ast.literal_eval(row["L2_cluster_description_embedding"])
    vectors = [{
        "id": f"l2_cluster_{idx}",
        "values": values,
        "metadata": l2_cluster_metadata,
    }]
    upsert_response = pc_index.upsert(vectors=vectors)
    assert upsert_response["upserted_count"] == 1 # Ensure that the upsert was successful.

print("Successfully upserted L2 clusters.") # If we've reached this point, then the L2 clusters have been upserted successfully.

   L2_cluster_id                         L2_cluster_label  \
0              0  Technical Problem-Solving Methodologies   
1              1          Fictional Narrative Exploration   
2              2                    AI Capability Testing   
3              3     Technical Problem-Solving Assistance   
4              4              Vocal Technique Comparisons   

                              L2_cluster_description  \
0  This cluster represents systematic approaches ...   
1  This cluster represents user-generated content...   
2  This cluster represents comprehensive user int...   
3  This cluster represents a comprehensive collec...   
4  This cluster encompasses user inquiries that s...   

                    L2_cluster_description_embedding  L2_cluster_trace_count  
0  [-0.00941457785665989, -0.0065292189829051495,...                      54  
1  [-0.01923833228647709, 0.01746535860002041, -0...                     193  
2  [-0.00421322463080287, 0.01365547627210617, -0...       

Upserting L2 clusters: 100%|██████████| 5/5 [00:01<00:00,  4.01it/s]

Successfully upserted L2 clusters.





In [7]:
# Next, we need to upsert the L1 clusters (lower-level clusters than L2, but still pretty high).
from tqdm import tqdm

unique_l1_clusters = (
    df.groupby(["L1_cluster_id", "L1_cluster_label", "L1_cluster_description", "L1_cluster_description_embedding", "L2_cluster_id"])
    .size()
    .reset_index(name="L1_cluster_trace_count")
)
print(unique_l1_clusters.head())

for idx, row in tqdm(unique_l1_clusters.iterrows(), total=len(unique_l1_clusters), desc="Upserting L1 clusters"):
    l1_cluster_metadata = {
        "type": "l1_cluster",
        "L2_cluster_id": row["L2_cluster_id"], # Need to store the L2 cluster ID for the L1 cluster, so that this can be returned in tool calls later on.
        "name": row["L1_cluster_label"],
        "description": row["L1_cluster_description"],
        "trace_count": row["L1_cluster_trace_count"],
    }
    values = ast.literal_eval(row["L1_cluster_description_embedding"])
    vectors = [{
        "id": f"l1_cluster_{idx}",
        "values": values,
        "metadata": l1_cluster_metadata,
    }]
    upsert_response = pc_index.upsert(vectors=vectors)
    print(f"Upsert response: {upsert_response}")
    assert upsert_response["upserted_count"] == 1 # Ensure that the upsert was successful.

print("Successfully upserted L1 clusters.") # If we've reached this point, then the L1 clusters have been upserted successfully.

   L1_cluster_id                          L1_cluster_label  \
0              0  Methodological and Analytical Frameworks   
1              1             AI Assistant Boundary Testing   
2              2                   Broad Knowledge Testing   
3              3              TrueNAS Storage Optimization   
4              4     Game of Thrones speculative scenarios   

                              L1_cluster_description  \
0  This cluster encompasses discussions centered ...   
1  This cluster represents diverse user interacti...   
2  This cluster represents user interactions that...   
3  This cluster encompasses comprehensive discuss...   
4  This cluster encompasses user-generated conten...   

                    L1_cluster_description_embedding  L2_cluster_id  \
0  [-0.0216191615909338, 0.013525794260203838, -0...              0   
1  [-0.008321586064994335, -0.0033742745872586966...              2   
2  [-0.0021743522956967354, 0.019716961309313774,...              2   
3  [-0

Upserting L1 clusters:   0%|          | 0/25 [00:00<?, ?it/s]

Upsert response: {'upserted_count': 1}


Upserting L1 clusters:   8%|▊         | 2/25 [00:00<00:02,  8.56it/s]

Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  16%|█▌        | 4/25 [00:00<00:01, 10.57it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  24%|██▍       | 6/25 [00:00<00:01, 11.49it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  32%|███▏      | 8/25 [00:00<00:01, 11.83it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  40%|████      | 10/25 [00:00<00:01, 11.02it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  48%|████▊     | 12/25 [00:01<00:01, 10.72it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  56%|█████▌    | 14/25 [00:01<00:01, 10.06it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  64%|██████▍   | 16/25 [00:01<00:00, 10.08it/s]

Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  72%|███████▏  | 18/25 [00:01<00:00, 10.73it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  80%|████████  | 20/25 [00:01<00:00, 10.95it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters:  88%|████████▊ | 22/25 [00:02<00:00, 11.50it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L1 clusters: 100%|██████████| 25/25 [00:02<00:00, 10.72it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}
Successfully upserted L1 clusters.





In [8]:
# Next, we need to upsert the L0 clusters (lowest-level clusters, which are a level above the topics/trace descriptions).
from tqdm import tqdm

unique_l0_clusters = (
    df.groupby(["L0_cluster_id", "L0_cluster_label", "L0_cluster_description", "L0_cluster_description_embedding", "L1_cluster_id", "L2_cluster_id"])
    .size()
    .reset_index(name="L0_cluster_trace_count")
)
print(unique_l0_clusters.head())

for idx, row in tqdm(unique_l0_clusters.iterrows(), total=len(unique_l0_clusters), desc="Upserting L0 clusters"):
    l0_cluster_metadata = {
        "type": "l0_cluster",
        "L1_cluster_id": row["L1_cluster_id"], # Need to store the L1 cluster ID for the L0 cluster, so that this can be returned in tool calls later on.
        "L2_cluster_id": row["L2_cluster_id"], # Need to store the L2 cluster ID for the L0 cluster, so that this can be returned in tool calls later on.
        "name": row["L0_cluster_label"],
        "description": row["L0_cluster_description"],
        "trace_count": row["L0_cluster_trace_count"],
    }
    values = ast.literal_eval(row["L0_cluster_description_embedding"])
    vectors = [{
        "id": f"l0_cluster_{idx}",
        "values": values,
        "metadata": l0_cluster_metadata,
    }]
    upsert_response = pc_index.upsert(vectors=vectors) 
    print(f"Upsert response: {upsert_response}")
    assert upsert_response["upserted_count"] == 1 # Ensure that the upsert was successful.

print("Successfully upserted L0 clusters.") # If we've reached this point, then the L0 clusters have been upserted successfully.

   L0_cluster_id              L0_cluster_label  \
0              0  Programming Error Resolution   
1              1     AI Assistant Interactions   
2              2  Fictional Character Reunions   
3              3  Diverse Information Requests   
4              4  Diverse Information Requests   

                              L0_cluster_description  \
0  This cluster represents a collection of progra...   
1  This cluster represents diverse user interacti...   
2  This cluster represents requests for creative ...   
3  This cluster represents a wide range of user q...   
4  This cluster represents a wide range of user q...   

                    L0_cluster_description_embedding  L1_cluster_id  \
0  [-0.001186206005513668, 0.008096595294773579, ...             10   
1  [-0.0019425011705607176, 0.006542277056723833,...              1   
2  [-0.02054133079946041, 0.017430299893021584, -...              8   
3  [-0.0035605276934802532, 0.011465399526059628,...              2   
4  [0.0

Upserting L0 clusters:   4%|▍         | 2/50 [00:00<00:04, 10.81it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:   8%|▊         | 4/50 [00:00<00:07,  6.55it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  12%|█▏        | 6/50 [00:00<00:06,  6.94it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  16%|█▌        | 8/50 [00:01<00:06,  6.89it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  20%|██        | 10/50 [00:01<00:05,  7.81it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  24%|██▍       | 12/50 [00:01<00:04,  8.72it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  28%|██▊       | 14/50 [00:01<00:03,  9.01it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  32%|███▏      | 16/50 [00:01<00:03,  9.35it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  36%|███▌      | 18/50 [00:02<00:03,  8.81it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  42%|████▏     | 21/50 [00:02<00:03,  7.32it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  46%|████▌     | 23/50 [00:02<00:03,  7.78it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  50%|█████     | 25/50 [00:03<00:03,  8.31it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  54%|█████▍    | 27/50 [00:03<00:02,  8.15it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  58%|█████▊    | 29/50 [00:03<00:02,  8.44it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  62%|██████▏   | 31/50 [00:03<00:02,  8.63it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  66%|██████▌   | 33/50 [00:04<00:02,  8.47it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  70%|███████   | 35/50 [00:04<00:01,  8.68it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  76%|███████▌  | 38/50 [00:04<00:01,  9.73it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  80%|████████  | 40/50 [00:04<00:00, 10.61it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  86%|████████▌ | 43/50 [00:05<00:00,  8.71it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  90%|█████████ | 45/50 [00:05<00:00,  9.57it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters:  98%|█████████▊| 49/50 [00:05<00:00, 10.17it/s]

Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}
Upsert response: {'upserted_count': 1}


Upserting L0 clusters: 100%|██████████| 50/50 [00:05<00:00,  8.66it/s]

Upsert response: {'upserted_count': 1}
Successfully upserted L0 clusters.





In [9]:
unique_topics = df.drop_duplicates(subset=["Topic"]).reset_index(drop=True)
print(unique_topics[["Topic"]].head())
print(len(unique_topics["Topic"]))

                                               Topic
0  The user requested a personalized reality-shif...
1  The user asked about the highest risk factor f...
2  The user asked for a personalized reality-shif...
3  The user asked for help choosing a reality-shi...
4  The user asked the assistant to write terms an...
995


In [10]:
# Extracting chunk description
import ast
all_messages = []   
for i, row in tqdm(
    unique_topics.iterrows(), total=len(unique_topics), desc="Upserting topics"
):

    # Remove 'defaultdict' wrapper if present and safely convert to dict
    raw_description = row["Conversation"]
    # For each conversation, extract the messages as a flat list: user, assistant, user, assistant, ...
    # Handle the defaultdict(<class ...>, {...}) wrapper if present

    messages = []

    if isinstance(raw_description, str) and raw_description.startswith("defaultdict"):
        # Removes 'defaultdict(<class ...>, {...})' wrapper
        start = raw_description.find("{")
        end = raw_description.rfind("}")
        if start != -1 and end != -1:
            dict_str = raw_description[start:end+1]
            try:
                conversation_dict = ast.literal_eval(dict_str)
            except Exception as e:
                print(f"Failed to parse dict from defaultdict string: {e}")
                conversation_dict = {}
        else:
            conversation_dict = {}
    # If not a defaultdict wrapper, try to eval string or use as-is
    elif isinstance(raw_description, str):
        try:
            conversation_dict = ast.literal_eval(raw_description)
        except Exception:
            conversation_dict = {}
    elif isinstance(raw_description, dict):
        conversation_dict = raw_description
    else:
        conversation_dict = {}

    # Now extract the turns as messages in order.
    # conversation_dict should be a mapping of turn_number -> {'user': ..., 'assistant': ...}
    if isinstance(conversation_dict, dict):
        for turn_id in sorted(conversation_dict.keys()):
            turn = conversation_dict[turn_id]
            if isinstance(turn, dict):
                if "user" in turn:
                    messages.append(turn["user"])
                if "assistant" in turn:
                    messages.append(turn["assistant"])

    #print(messages)
    all_messages.append(messages)


len(all_messages[182])

Upserting topics: 100%|██████████| 995/995 [00:00<00:00, 7813.24it/s]


36

In [11]:
# Extracting chunk description
import ast

for i, row in tqdm(unique_topics.iterrows(), total=len(unique_topics), desc="Upserting topics"):

    # Remove 'defaultdict' wrapper if present and safely convert to dict
    raw_description = row["Conversation"]
    if isinstance(raw_description, str) and raw_description.startswith("defaultdict"):
        # Removes 'defaultdict(<class ...>, {...})' wrapper
        start = raw_description.find("{")
        end = raw_description.rfind("}")
        if start != -1 and end != -1:
            dict_str = raw_description[start:end+1]
            try:
                description = ast.literal_eval(dict_str)
            except Exception as e:
                print(f"Failed to parse dict from defaultdict string: {e}")
                description = dict_str  # fallback to raw string if parsing fails
        else:
            description = raw_description
    else:
        description = raw_description

Upserting topics: 100%|██████████| 995/995 [00:00<00:00, 8467.43it/s]


In [12]:
import ast
import json
from tqdm import tqdm

# Helper function to calculate metadata size in bytes
def calculate_metadata_size(metadata):
    """Calculate the size of metadata when serialized to JSON."""
    return len(json.dumps(metadata).encode('utf-8'))

# Helper function to chunk trace messages to fit within size limit
def chunk_trace_messages(trace_messages, max_size_bytes, base_metadata_without_trace, description):
    """
    Split trace_messages into chunks that fit within the size limit.
    Returns a list of trace chunks, each as a list of messages.
    """
    # Helper to create complete metadata for size testing
    def create_test_metadata(trace_chunk, total_chunks, chunk_idx):
        """Create complete metadata for size calculation."""
        metadata = base_metadata_without_trace.copy()
        metadata["description"] = description
        metadata["trace"] = trace_chunk
        metadata["total_trace_chunks"] = total_chunks
        metadata["trace_chunk_idx"] = chunk_idx
        return metadata
    
    # Estimate total chunks (we'll refine this, but use conservative estimate)
    # Start by checking if full trace fits with single chunk
    test_metadata = create_test_metadata(trace_messages, 1, 0)
    full_trace_size = calculate_metadata_size(test_metadata)
    
    safety_buffer = 1000  # extra buffer for JSON encoding variations and future total_chunks changes
    
    if full_trace_size <= max_size_bytes - safety_buffer:
        return [trace_messages]  # Full trace fits, return as single chunk
    
    # Need to chunk the trace
    # We'll estimate total chunks conservatively - assume we might need many chunks
    # Use a reasonable estimate: assume average chunk size based on first few messages
    estimated_total_chunks = max(2, len(trace_messages) // 10)  # rough estimate
    
    trace_chunks = []
    current_chunk = []
    
    for message in trace_messages:
        # Test adding this message to current chunk
        test_chunk = current_chunk + [message]
        # Use current chunk index for testing
        test_metadata = create_test_metadata(test_chunk, estimated_total_chunks, len(trace_chunks))
        test_size = calculate_metadata_size(test_metadata)
        
        if test_size <= max_size_bytes - safety_buffer:
            # Message fits, add to current chunk
            current_chunk = test_chunk
        else:
            # Message doesn't fit, save current chunk and start new one
            if current_chunk:
                trace_chunks.append(current_chunk)
            current_chunk = [message]
    
    # Add final chunk
    if current_chunk:
        trace_chunks.append(current_chunk)
    
    # Final verification: recalculate with actual total_chunks
    actual_total_chunks = len(trace_chunks)
    final_chunks = []
    for idx, chunk in enumerate(trace_chunks):
        verify_metadata = create_test_metadata(chunk, actual_total_chunks, idx)
        verify_size = calculate_metadata_size(verify_metadata)
        if verify_size > max_size_bytes:
            # This shouldn't happen, but if it does, we need to split further
            print(f"Warning: Chunk {idx} still exceeds limit ({verify_size} bytes), splitting further...")
            # Split this chunk in half (simple fallback)
            mid = len(chunk) // 2
            if mid > 0:
                final_chunks.append(chunk[:mid])
                final_chunks.append(chunk[mid:])
            else:
                final_chunks.append(chunk)
        else:
            final_chunks.append(chunk)
    
    return final_chunks if final_chunks else [trace_messages]

# Actually upsert to Pinecone with the trace as a list of strings
PINECONE_METADATA_LIMIT = 40960  # 40KB limit

for i, row in tqdm(unique_topics.iterrows(), total=len(unique_topics), desc="Upserting topics"):
    # Get the trace as produced above
    trace_messages = all_messages[i]

    # Base metadata without trace (we'll add trace chunks separately)
    base_metadata_without_trace = {
        "type": "topic",
        "L0_cluster_id": row["L0_cluster_id"],
        "L1_cluster_id": row["L1_cluster_id"],
        "L2_cluster_id": row["L2_cluster_id"],
        "model": row["Model"],
        "toxic": row["Toxic"],
        "state": row["State"],
        "country": row["Country"],
        "hour": row["Hour of Day"],
    }
    
    description = row["Topic"]
    values = ast.literal_eval(row["Topic_Embedding"])
    
    # Chunk trace messages if necessary
    trace_chunks = chunk_trace_messages(
        trace_messages, 
        PINECONE_METADATA_LIMIT, 
        base_metadata_without_trace,
        description
    )
    
    total_trace_chunks = len(trace_chunks)
    
    # Create a vector for each trace chunk
    for trace_chunk_idx, trace_chunk in enumerate(trace_chunks):
        chunk_metadata = base_metadata_without_trace.copy()
        chunk_metadata["description"] = description
        chunk_metadata["trace"] = trace_chunk        
        # Use trace_chunk_idx as the chunk_idx in the vector ID
        vector_id = f"topic_{i}_{trace_chunk_idx}"
        
        vectors = [{
            "id": vector_id,
            "values": values,
            "metadata": chunk_metadata,
        }]
        
        # Verify the metadata size before upserting
        metadata_size = calculate_metadata_size(chunk_metadata)
        if metadata_size > PINECONE_METADATA_LIMIT:
            print(f"Warning: Vector {vector_id} metadata size ({metadata_size} bytes) exceeds limit!")
        
        upsert_response = pc_index.upsert(vectors=vectors)
        assert upsert_response["upserted_count"] == 1

print("Successfully upserted topics.") # If we've reached this point, then the topics have been upserted successfully.

Upserting topics: 100%|██████████| 995/995 [01:53<00:00,  8.74it/s]

Successfully upserted topics.



