In [1]:
# %%
import os
import nest_asyncio
import pandas as pd
import asyncio
import json
from typing import List, Sequence, Dict, Any

import openai
from toolhouse import Toolhouse
from llama_index.core.tools import FunctionTool, BaseTool
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import NodeWithScore
from llama_index.core.workflow import Workflow, step, Context, StartEvent, StopEvent
from llama_index.core import (
    ServiceContext, SimpleDirectoryReader, Document, StorageContext, Prompt, GPTVectorStoreIndex,
    VectorStoreIndex, SummaryIndex
)
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.types import ChatMessage, MessageRole
from pydantic import BaseModel, Field

from pinecone import Pinecone, ServerlessSpec

from llama_parse import LlamaParse

from IPython.display import Markdown, display

from llama_index.agent.openai import OpenAIAgent

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings

# Patch asyncio through nest_asyncio so that an event loop can be started while
# another one is already running (the situation inside a Jupyter kernel).
nest_asyncio.apply()

  from tqdm.autonotebook import tqdm


In [2]:
# NOTE(review): ServerlessSpec and PineconeVectorStore are already imported in
# the first cell; these repeats are harmless but redundant.
from pinecone import ServerlessSpec

from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.tools import QueryEngineTool
from dotenv import load_dotenv
# Load the variables defined in a local .env file into the process environment
# so that the os.getenv(...) lookups in the following cell can see them.
load_dotenv()

True

In [3]:
# Read the service credentials from the environment. Each name ends up as None
# when the corresponding variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
toolhouse_api_key = os.environ.get("TOOLHOUSE_API_KEY")
tavily_api_key = os.environ.get("TAVILY_API_KEY")

In [4]:
# Store the chat model on llama_index's shared Settings object; the agent cell
# reads it back through Settings.llm.
# temperature=0 is presumably chosen to keep answers as repeatable as possible.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0)

In [5]:
# System prompt for the reconciliation agent: one instruction per entry, each
# terminated by a newline in the final string.
system_prompt_agent = "".join(
    instruction + "\n"
    for instruction in (
        "You are an AI assistant that helps reconcile inconsistent car data values in a dataset.",
        "For each value, analyze the column-level context provided.",
        "If different terms refer to the same entity (e.g., 'Chevy' and 'Chevrolet'), correct the value to the standard form.",
        "In this dataset, 'Chevy' and 'Chevrolet' refer to the same entity, so standardize all 'Chevy' values to 'Chevrolet'.",
        "Ensure the corrected value is consistent across all rows.",
        "Provide only the corrected value in JSON format without additional text.",
    )
)


In [6]:
# %%
# Build the agent that is later asked to reconcile column values.
# No tools are passed, so the only steering it receives is the system prompt.
agent = OpenAIAgent.from_tools(
    llm=Settings.llm,  # the model stored on Settings in the earlier cell
    verbose=True,  # presumably the source of the "Added user message to memory" log lines
    system_prompt=system_prompt_agent,
    memory=None  # NOTE(review): presumably falls back to the library's default memory — confirm
    
)

In [7]:
def get_unique_values_by_index(dataset: pd.DataFrame, column_index: int):
    """Return the distinct values of the column at position ``column_index``.

    The column is selected by integer position rather than by label, and the
    result is whatever ``Series.unique`` produces for that column.
    """
    selected_column = dataset.iloc[:, column_index]
    return selected_column.unique()

In [8]:
def update_column_with_mapping(df, col_idx, mapping):
    """Return a copy of ``df`` whose column at position ``col_idx`` is remapped.

    Args:
        df: Source DataFrame. It is left unmodified; the work happens on a copy.
        col_idx: Integer position of the column to rewrite.
        mapping: Dict of ``{current_value: replacement_value}``.

    Returns:
        A new DataFrame in which every value found in ``mapping`` is replaced
        by its target, and every value absent from ``mapping`` is kept as is.
    """
    updated = df.copy()
    column = updated.iloc[:, col_idx]
    # Series.map(dict) turns every value that is missing from the dict into
    # NaN, which would silently erase data whenever the mapping is incomplete.
    # Looking each value up with itself as the default keeps unmapped values.
    updated.iloc[:, col_idx] = column.map(lambda value: mapping.get(value, value))
    return updated

In [9]:
# %%
# Read the car CSV and keep only its first 20 rows as the working sample.
# NOTE(review): the variable name and the banner below mention null values,
# but nothing in this cell selects or checks for nulls — confirm the intent.
dataset_with_nulls = pd.read_csv('car_data_with_synonyms.csv').head(20)
print("Dataset with null values:")
# Preview the first rows (head() with its default row count).
print(dataset_with_nulls.head())


Dataset with null values:
   id          brand              model  model_year    milage      fuel_type  \
0   0           MINI      Cooper S Base      2007.0  213000.0       Gasoline   
1   1        Lincoln              LS V8      2003.0  143250.0       Gasoline   
2   2      Chevrolet  Silverado 2500 LT      2002.0  136731.0  E85 Flex Fuel   
3   3        Genesis   G90 5.0 Ultimate      2017.0   19500.0       Gasoline   
4   4  Mercedes-Benz        Metris Base      2021.0    7388.0       Gasoline   

                                              engine  \
0       172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel   
1       252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel   
2  320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...   
3       420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel   
4       208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col  \
0                             A/T  Yellow    Gray   
1                             A/T  Silver   Beige 

In [16]:
import json
import re


def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Handles an opening fence with or without a language tag (```` ``` ```` or
    ```` ```json ````), optional surrounding whitespace, and a missing closing
    fence. Text that does not start with a fence is returned stripped of
    surrounding whitespace only.
    """
    stripped = text.strip()
    fenced = re.match(r"^```[\w-]*\s*(.*?)\s*(?:```)?$", stripped, re.DOTALL)
    return fenced.group(1) if fenced else stripped


def enhance_values(agent, dataset: pd.DataFrame, column_index: int):
    """Ask the agent for a de-duplication mapping and apply it to one column.

    Args:
        agent: Object exposing ``reset()`` and ``chat(query)``; the value
            returned by ``chat`` must carry the reply text in ``.response``.
        dataset: DataFrame holding the column to reconcile.
        column_index: Integer position of that column.

    Returns:
        The DataFrame returned by ``update_column_with_mapping`` when the
        agent's reply could be parsed and applied; otherwise ``dataset`` as it
        was passed in. Failures are printed, not raised.
    """
    uniq = get_unique_values_by_index(dataset, column_index)

    # Render the distinct values as a quoted, comma-separated list for the prompt.
    mk_str = ', '.join(f"'{value}'" for value in uniq)

    query_str = (
        f"I have a dataset of [{mk_str}]. Could you spot the same instances and build a map to remove duplicates? "
        "Return only JSON mapping, no other text."
    )

    # Start from a clean conversation so earlier columns cannot influence this one.
    agent.reset()

    try:
        result = agent.chat(query_str).response

        # Kept for debugging: shows exactly what the model sent back.
        print(f"Raw agent response: {result}")

        # str.strip("```json") would treat its argument as a *set of characters*
        # and only ran when the reply started exactly with "```json"; remove the
        # fence structurally instead.
        mapping = json.loads(_strip_code_fence(result))
        if not isinstance(mapping, dict):
            print(
                "Error parsing the agent response: expected a JSON object, "
                f"got {type(mapping).__name__}"
            )
            return dataset
        print(f"Updating column '{column_index}' with mapping: {mapping}")

        # The model may omit values it considers already canonical. Map every
        # observed value explicitly (to itself when absent) so that applying
        # the mapping can never blank out an existing value.
        complete_mapping = {value: mapping.get(value, value) for value in uniq}
        dataset = update_column_with_mapping(dataset, column_index, complete_mapping)

    except json.JSONDecodeError as e:
        print(f"Error parsing the agent response: {e}")

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back the input.
        print(f"An unexpected error occurred: {e}")

    return dataset


In [17]:
# Reconcile the column at position 1, which is 'brand' in the preview printed
# by the loading cell.
completed_dataset = enhance_values(agent, dataset_with_nulls, column_index=1)

Added user message to memory: I have a dataset of ['MINI', 'Lincoln', 'Chevrolet', 'Genesis', 'Mercedes-Benz', 'Audi', 'Ford', 'BMW', 'Tesla', 'Cadillac', 'Land', 'GMC', 'Toyota', 'Hyundai', 'Volvo', 'Volkswagen', 'Buick', 'Rivian', 'RAM', 'Hummer', 'Alfa']. Could you spot the same instances and build a map to remove duplicates? Return only JSON mapping, no other text.
Raw agent response: ```json
{
    "MINI": "MINI",
    "Lincoln": "Lincoln",
    "Chevrolet": "Chevrolet",
    "Genesis": "Genesis",
    "Mercedes-Benz": "Mercedes-Benz",
    "Audi": "Audi",
    "Ford": "Ford",
    "BMW": "BMW",
    "Tesla": "Tesla",
    "Cadillac": "Cadillac",
    "Land": "Land",
    "GMC": "GMC",
    "Toyota": "Toyota",
    "Hyundai": "Hyundai",
    "Volvo": "Volvo",
    "Volkswagen": "Volkswagen",
    "Buick": "Buick",
    "Rivian": "Rivian",
    "RAM": "RAM",
    "Hummer": "Hummer",
    "Alfa": "Alfa"
}
```
Updating column '1' with mapping: {'MINI': 'MINI', 'Lincoln': 'Lincoln', 'Chevrolet': 'Chevrole

In [18]:
# Write the reconciled data to disk (index=False keeps the row index out of
# the file), then preview the first rows.
completed_dataset.to_csv('completed_car_data2.csv', index=False)
print("Completed dataset:")
print(completed_dataset.head())

Completed dataset:
   id          brand              model  model_year    milage      fuel_type  \
0   0           MINI      Cooper S Base      2007.0  213000.0       Gasoline   
1   1        Lincoln              LS V8      2003.0  143250.0       Gasoline   
2   2      Chevrolet  Silverado 2500 LT      2002.0  136731.0  E85 Flex Fuel   
3   3        Genesis   G90 5.0 Ultimate      2017.0   19500.0       Gasoline   
4   4  Mercedes-Benz        Metris Base      2021.0    7388.0       Gasoline   

                                              engine  \
0       172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel   
1       252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel   
2  320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...   
3       420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel   
4       208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col  \
0                             A/T  Yellow    Gray   
1                             A/T  Silver   Beige   
2   