In [1]:
from pinecone import Pinecone
import os

# Connection settings: the index name is fixed, while the API key and index host
# are read from the environment so that no credentials live in the notebook.
pc_index_name = "openclio"
pc_api_key = os.environ.get("PINECONE_API_KEY")
pc_index_url = os.environ.get("PINECONE_INDEX_URL")

# Client first, then a handle on the index addressed by both name and host.
pc = Pinecone(api_key=pc_api_key)
pc_index = pc.Index(name=pc_index_name, host=pc_index_url)

# Request the index description as a connectivity check. The return value is not
# used or displayed here; presumably the call raises if the index is unreachable
# (TODO confirm against the Pinecone client docs).
pc.describe_index(pc_index_name)

# DESTRUCTIVE: every run of this cell wipes all vectors currently stored in the
# index so that the upserts in the following cells start from an empty index.
print("Clearing all existing vectors from Pinecone index...")
pc_index.delete(delete_all=True)
print("All vectors cleared. Starting fresh.")

  from .autonotebook import tqdm as notebook_tqdm


Clearing all existing vectors from Pinecone index...
All vectors cleared. Starting fresh.


In [2]:
import pandas as pd

# Read `selected_conversations_with_topics_embedded_clustered.csv` and, in the same
# step, replace every missing value in the whole frame with an empty string.
df = pd.read_csv("selected_conversations_with_topics_embedded_clustered.csv").fillna("")

# Sanity check: per-column count of remaining missing values.
print(df.isna().sum())

Model                               0
Conversation                        0
Language                            0
Toxic                               0
State                               0
Country                             0
Hour of Day                         0
Topic_Embedding                     0
Topic                               0
L0_cluster_id                       0
L0_cluster_label                    0
L0_cluster_description              0
L0_cluster_description_embedding    0
L1_cluster_id                       0
L1_cluster_label                    0
L1_cluster_description              0
L1_cluster_description_embedding    0
L2_cluster_id                       0
L2_cluster_label                    0
L2_cluster_description              0
L2_cluster_description_embedding    0
dtype: int64


In [3]:
## L2 clusters: the highest-level clusters extracted from the trace data.
import ast
from tqdm import tqdm

# One row per distinct L2 cluster; the size of each group is the number of
# dataframe rows (traces) assigned to that cluster.
l2_key_columns = [
    "L2_cluster_id",
    "L2_cluster_label",
    "L2_cluster_description",
    "L2_cluster_description_embedding",
]
l2_trace_counts = df.groupby(l2_key_columns).size()
unique_l2_clusters = l2_trace_counts.reset_index(name="L2_cluster_trace_count")

# Preview the first few unique clusters before writing anything to the index.
print(unique_l2_clusters.head())

# Upsert one vector per L2 cluster; the assert stops the loop at the first
# response that does not report exactly one upserted vector.
l2_progress = tqdm(unique_l2_clusters.iterrows(), total=len(unique_l2_clusters), desc="Upserting L2 clusters")
for _, row in l2_progress:
    # The embedding column holds a list literal as text, so parse it back into a list.
    values = ast.literal_eval(row["L2_cluster_description_embedding"])
    l2_cluster_metadata = dict(
        type="l2_cluster",
        name=row["L2_cluster_label"],
        description=row["L2_cluster_description"],
        trace_count=row["L2_cluster_trace_count"],
    )
    vectors = [dict(id=str(row["L2_cluster_id"]), values=values, metadata=l2_cluster_metadata)]
    upsert_response = pc_index.upsert(vectors=vectors)
    assert upsert_response["upserted_count"] == 1

# Reaching this line means every upsert above passed its assertion.
print("Successfully upserted L2 clusters.")

   L2_cluster_id                         L2_cluster_label  \
0              0  Technical Problem-Solving Methodologies   
1              1          Fictional Narrative Exploration   
2              2                    AI Capability Testing   
3              3     Technical Problem-Solving Assistance   
4              4              Vocal Technique Comparisons   

                              L2_cluster_description  \
0  This cluster represents systematic approaches ...   
1  This cluster represents user-generated content...   
2  This cluster represents comprehensive user int...   
3  This cluster represents a comprehensive collec...   
4  This cluster encompasses user inquiries that s...   

                    L2_cluster_description_embedding  L2_cluster_trace_count  
0  [-0.00941457785665989, -0.0065292189829051495,...                      54  
1  [-0.01923833228647709, 0.01746535860002041, -0...                     193  
2  [-0.00421322463080287, 0.01365547627210617, -0...       

Upserting L2 clusters: 100%|██████████| 5/5 [00:00<00:00,  5.71it/s]

Successfully upserted L2 clusters.





In [None]:
# L1 clusters: one level below L2 in the hierarchy, but still fairly high-level.
# `ast` is not imported in this cell; it comes from the earlier L2 cell.
from tqdm import tqdm

# One row per distinct L1 cluster (together with its parent L2 id); the group size
# is the number of dataframe rows (traces) that fall into that cluster.
l1_key_columns = [
    "L1_cluster_id",
    "L1_cluster_label",
    "L1_cluster_description",
    "L1_cluster_description_embedding",
    "L2_cluster_id",
]
l1_trace_counts = df.groupby(l1_key_columns).size()
unique_l1_clusters = l1_trace_counts.reset_index(name="L1_cluster_trace_count")
print(unique_l1_clusters.head())

# NOTE(review): the vector id is the bare cluster id as a string, and the other
# cluster levels build their ids the same way. If id ranges overlap across levels,
# these upserts would presumably overwrite each other in the index - confirm that
# cluster ids are unique across levels.
l1_progress = tqdm(unique_l1_clusters.iterrows(), total=len(unique_l1_clusters), desc="Upserting L1 clusters")
for _, row in l1_progress:
    # The embedding column holds a list literal as text, so parse it back into a list.
    values = ast.literal_eval(row["L1_cluster_description_embedding"])
    l1_cluster_metadata = dict(
        type="l1_cluster",
        # Parent L2 id is kept in the metadata so it can be returned in tool calls later on.
        L2_cluster_id=row["L2_cluster_id"],
        name=row["L1_cluster_label"],
        description=row["L1_cluster_description"],
        trace_count=row["L1_cluster_trace_count"],
    )
    vectors = [dict(id=str(row["L1_cluster_id"]), values=values, metadata=l1_cluster_metadata)]
    upsert_response = pc_index.upsert(vectors=vectors)
    print(f"Upsert response: {upsert_response}")
    # Stop at the first response that does not report exactly one upserted vector.
    assert upsert_response["upserted_count"] == 1

# Reaching this line means every upsert above passed its assertion.
print("Successfully upserted L1 clusters.")

In [None]:
# L0 clusters: the lowest-level clusters, one level above the topics/trace descriptions.
# `ast` is not imported in this cell; it comes from the earlier L2 cell.
from tqdm import tqdm

# One row per distinct L0 cluster (together with its parent L1 and L2 ids); the group
# size is the number of dataframe rows (traces) that fall into that cluster.
l0_key_columns = [
    "L0_cluster_id",
    "L0_cluster_label",
    "L0_cluster_description",
    "L0_cluster_description_embedding",
    "L1_cluster_id",
    "L2_cluster_id",
]
l0_trace_counts = df.groupby(l0_key_columns).size()
unique_l0_clusters = l0_trace_counts.reset_index(name="L0_cluster_trace_count")
print(unique_l0_clusters.head())

# NOTE(review): the vector id is the bare cluster id as a string, and the other
# cluster levels build their ids the same way. If id ranges overlap across levels,
# these upserts would presumably overwrite each other in the index - confirm that
# cluster ids are unique across levels.
l0_progress = tqdm(unique_l0_clusters.iterrows(), total=len(unique_l0_clusters), desc="Upserting L0 clusters")
for _, row in l0_progress:
    # The embedding column holds a list literal as text, so parse it back into a list.
    values = ast.literal_eval(row["L0_cluster_description_embedding"])
    l0_cluster_metadata = dict(
        type="l0_cluster",
        # Parent L1 and L2 ids are kept in the metadata so they can be returned in tool calls later on.
        L1_cluster_id=row["L1_cluster_id"],
        L2_cluster_id=row["L2_cluster_id"],
        name=row["L0_cluster_label"],
        description=row["L0_cluster_description"],
        trace_count=row["L0_cluster_trace_count"],
    )
    vectors = [dict(id=str(row["L0_cluster_id"]), values=values, metadata=l0_cluster_metadata)]
    upsert_response = pc_index.upsert(vectors=vectors)
    print(f"Upsert response: {upsert_response}")
    # Stop at the first response that does not report exactly one upserted vector.
    assert upsert_response["upserted_count"] == 1

# Reaching this line means every upsert above passed its assertion.
print("Successfully upserted L0 clusters.")

In [8]:
# Extracting chunk description
# Exploration cell: parses the "Conversation" field of the FIRST unique topic only
# (the loop breaks after one iteration) and prints it. Nothing is upserted here.
import ast
from tqdm import tqdm

unique_topics = df.drop_duplicates(subset=["Topic"]).reset_index(drop=True)
print(unique_topics[["Topic"]].head())

for i, row in tqdm(unique_topics.iterrows(), total=len(unique_topics), desc="Upserting topics"):

    # The "Conversation" text appears to be the repr of a defaultdict, i.e.
    # 'defaultdict(<class ...>, {...})'. Only the inner {...} literal can be parsed by
    # ast.literal_eval, so strip the wrapper first. Any value that cannot be parsed is
    # kept as the raw string.
    raw_description = row["Conversation"]
    description = raw_description
    if isinstance(raw_description, str) and raw_description.startswith("defaultdict"):
        start = raw_description.find("{")
        end = raw_description.rfind("}")
        if start != -1 and end != -1:
            dict_str = raw_description[start:end+1]
            try:
                description = ast.literal_eval(dict_str)
            except Exception as e:
                print(f"Failed to parse dict from defaultdict string: {e}")
                description = dict_str  # fallback to raw string if parsing fails
    print(description)

    # Only index into the result when parsing actually produced a non-empty dict.
    # Every fallback path above leaves `description` as a plain string, and calling
    # .keys() on it would raise AttributeError (an empty dict would raise StopIteration).
    if isinstance(description, dict) and description:
        key = next(iter(description.keys()))
        print(description[key]['user'])
        print(description[key]['assistant'])
    else:
        print("Conversation could not be parsed into a non-empty dict; skipping user/assistant preview.")
    break


                                               Topic
0  The user requested a personalized reality-shif...
1  The user asked about the highest risk factor f...
2  The user asked for a personalized reality-shif...
3  The user asked for help choosing a reality-shi...
4  The user asked the assistant to write terms an...


Upserting topics:   0%|          | 0/995 [00:00<?, ?it/s]

{101001: {'user': 'Hey there! Are you familiar with reality shifting? So, I’m refining a foolproof method for reality shifting and want to pick a destination. Want to help me? I’m thinking something pretty personalized. There are a few things that are required of my destination. 1. The quest. I have to have a clear overarching goal in my reality, and don’t make it too crazy. It should be more along the lines of “save the president’s daughter” or “escape this weird wacky sinister place” NOT “get an artifact that literally controls reality”. Seriously, don’t make me fetch an artifact, or fetch anything. Instead, make me DO something. 2. Babes. I need pretty girls. 3. The entry. I need to get to lose consciousness in order to begin my journey in my desired reality, preferably by having it knocked out by one of the aforementioned babes. 4. Action. It needs to be cool. 5. Unconsciousness. Myself and the babes need to pass out in this place, preferably by being knocked out in some way or fai




In [None]:
import ast
import json
from tqdm import tqdm

def chunk_text(text: str, max_bytes: int = 20000) -> list[str]:
    """Split text into chunks that don't exceed max_bytes when encoded as UTF-8.

    Chunks are always cut on character boundaries, so no character is ever dropped
    or corrupted: ``"".join(chunk_text(text, n)) == text`` for every text that needs
    splitting.

    Args:
        text: The string to split.
        max_bytes: Maximum UTF-8 encoded size of each returned chunk.

    Returns:
        ``[text]`` unchanged when it already fits in ``max_bytes``; otherwise the
        consecutive pieces of ``text``, each at most ``max_bytes`` bytes when
        encoded as UTF-8.

    Raises:
        ValueError: If ``text`` has to be split but ``max_bytes`` is too small to
            hold the next character (this includes zero and negative budgets for
            non-empty text).
    """
    text_bytes = text.encode('utf-8')
    total = len(text_bytes)
    if total <= max_bytes:
        return [text]

    chunks = []
    start = 0
    while start < total:
        end = min(start + max_bytes, total)
        # UTF-8 continuation bytes look like 0b10xxxxxx. If the cut would land on one,
        # move it back to the start of that character so the character goes into the
        # next chunk intact instead of being split (and lost) across two chunks.
        while start < end < total and (text_bytes[end] & 0xC0) == 0x80:
            end -= 1
        if end <= start:
            raise ValueError(
                f"max_bytes={max_bytes} is too small to hold the character at byte offset {start}"
            )
        chunks.append(text_bytes[start:end].decode('utf-8'))
        start = end

    return chunks

# One row per distinct topic text; the fresh 0..N-1 index is used below to build vector ids.
unique_topics = df.drop_duplicates(subset=["Topic"]).reset_index(drop=True)
print(unique_topics[["Topic"]].head())

# 20KB is the maximum metadata payload that we want to store in one chunk on Pinecone.
MAX_METADATA_BYTES = 20000
# Safety margin subtracted from the budget on top of the measured base metadata size.
METADATA_BUFFER_BYTES = 100

# Row indices whose fixed metadata alone uses up the whole budget. These rows are
# reported at the end instead of being dropped without any message.
skipped_topic_rows = []

for i, row in tqdm(unique_topics.iterrows(), total=len(unique_topics), desc="Upserting topics"):
    # Base metadata without description. Note that "trace" carries the full
    # conversation text, so this part can be large on its own.
    base_metadata = {
        "type": "topic",
        "trace": row["Conversation"],
        "L0_cluster_id": row["L0_cluster_id"],
        "L1_cluster_id": row["L1_cluster_id"],
        "L2_cluster_id": row["L2_cluster_id"],
        "model": row["Model"],
        "toxic": row["Toxic"],
        "state": row["State"],
        "country": row["Country"],
        "hour": row["Hour of Day"],
    }

    # Calculate base metadata size (without description), measured as UTF-8 encoded JSON.
    base_metadata_bytes = len(json.dumps(base_metadata).encode('utf-8'))

    # Whatever is left of the budget is available for the description text.
    max_description_bytes = MAX_METADATA_BYTES - base_metadata_bytes - METADATA_BUFFER_BYTES

    # With no positive byte budget there is nothing chunk_text can sensibly produce,
    # so record the row and skip it explicitly.
    if max_description_bytes <= 0:
        skipped_topic_rows.append(i)
        continue

    # Split description into chunks that each fit in the remaining budget.
    description_chunks = chunk_text(row["Topic"], max_description_bytes)

    # Get the embedding (same for all chunks of the same topic); it is stored as a
    # list literal in text form.
    values = ast.literal_eval(row["Topic_Embedding"])

    # Upsert each chunk as its own vector, identified by topic row and chunk position.
    for chunk_idx, chunk in enumerate(description_chunks):
        chunk_metadata = base_metadata.copy()
        chunk_metadata["description"] = chunk

        vectors = [{
            "id": f"topic_{i}_{chunk_idx}",
            "values": values,  # Same embedding for all chunks
            "metadata": chunk_metadata,
        }]

        upsert_response = pc_index.upsert(vectors=vectors)
        assert upsert_response["upserted_count"] == 1

if skipped_topic_rows:
    print(
        f"WARNING: skipped {len(skipped_topic_rows)} topic(s) whose base metadata leaves no room "
        f"for a description within the {MAX_METADATA_BYTES}-byte budget. "
        f"Row indices in unique_topics: {skipped_topic_rows}"
    )

# Reaching this line means every attempted upsert above passed its assertion.
print("Successfully upserted topics.")