In [None]:
# Install python-dotenv into the kernel's environment; it provides the
# `load_dotenv` helper this notebook uses to read API keys from a .env file.
# NOTE(review): only python-dotenv is installed here; the other packages this
# notebook imports are presumably already available in the environment — confirm.
%pip install python-dotenv

In [None]:
# %%
import os
import nest_asyncio
import pandas as pd
import asyncio
import json
from typing import List, Sequence, Dict, Any

import openai
from toolhouse import Toolhouse
from llama_index.core.tools import FunctionTool, BaseTool
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import NodeWithScore
from llama_index.core.workflow import Workflow, step, Context, StartEvent, StopEvent
from llama_index.core import (
    ServiceContext, SimpleDirectoryReader, Document, StorageContext, Prompt, GPTVectorStoreIndex,
    VectorStoreIndex, SummaryIndex
)
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.types import ChatMessage, MessageRole
from pydantic import BaseModel, Field

from pinecone import Pinecone, ServerlessSpec

from llama_parse import LlamaParse

from IPython.display import Markdown, display

from llama_index.agent.openai import OpenAIAgent

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings

# Apply nest_asyncio to allow nested event loops (useful in Jupyter notebooks).
# Must run before any cell that starts its own asyncio work from inside the notebook.
nest_asyncio.apply()

In [3]:
# %%
# Load API keys from the environment. `load_dotenv()` first reads a local .env
# file (if one exists) so the keys can live outside the notebook.
from dotenv import load_dotenv
load_dotenv()

# os.getenv returns None for an unset variable instead of raising, so the
# notebook keeps running; anything missing is reported explicitly below rather
# than surfacing later as an opaque authentication error.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
toolhouse_api_key = os.getenv("TOOLHOUSE_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")

_missing_api_keys = [
    env_name
    for env_name, env_value in (
        ("OPENAI_API_KEY", openai_api_key),
        ("PINECONE_API_KEY", pinecone_api_key),
        ("TOOLHOUSE_API_KEY", toolhouse_api_key),
        ("TAVILY_API_KEY", tavily_api_key),
    )
    if not env_value
]
if _missing_api_keys:
    print(f"Warning: missing or empty environment variables: {', '.join(_missing_api_keys)}")

# Configure OpenAI
openai.api_key = openai_api_key



In [4]:
# Set the LLM on LlamaIndex's shared `Settings` object; the agent cell further
# down reads it back via `Settings.llm`.
# temperature=0 is presumably chosen to keep answers as repeatable as possible.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0)

In [None]:
# Connect to Pinecone and wrap the index as a LlamaIndex vector store.
# `Pinecone`, `ServerlessSpec` and `PineconeVectorStore` are already imported in
# the imports cell at the top of the notebook, so they are not re-imported here.
# NOTE(review): this cell never creates the index (ServerlessSpec is imported
# but unused), so `index_name` presumably has to exist already — confirm.
pc = Pinecone(api_key=pinecone_api_key)
index_name = "llamaindex-ragathon-demo-index-v4700"

pinecone_index = pc.Index(index_name)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Collect the PDFs to ingest. Sorting makes the load order reproducible
# (os.listdir returns entries in arbitrary order), and the suffix check is
# case-insensitive so files such as "Manual.PDF" are not silently skipped.
pdf_directory = "./car_pdfs"
pdf_files = sorted(
    os.path.join(pdf_directory, f)
    for f in os.listdir(pdf_directory)
    if f.lower().endswith('.pdf')
)
if not pdf_files:
    # Fail here with a clear message instead of deeper inside the reader.
    raise ValueError(f"No PDF files found in {pdf_directory!r}; nothing to index.")

# Parse every PDF with LlamaParse, requesting markdown output
# (result_type options: "markdown", "text").
parser = LlamaParse(result_type="markdown")
file_extractor = {".pdf": parser}
pdf_documents = SimpleDirectoryReader(input_files=pdf_files, file_extractor=file_extractor).load_data()
print(f"Number of PDF documents loaded: {len(pdf_documents)}")

# Index the parsed documents into the Pinecone-backed store and expose a query engine.
# NOTE(review): every run of this cell re-parses and re-ingests all PDFs.
index = VectorStoreIndex.from_documents(pdf_documents, storage_context=storage_context)
query_engine = index.as_query_engine()

In [6]:
from llama_index.core.tools import QueryEngineTool

In [7]:
# Expose the PDF-backed query engine to the agent as a named tool.
rag_tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    description="Use this RAG engine tool to search the ingested PDFs for missing car attributes.",
    name="car_data_batch_1",
)

In [8]:
from typing import Dict, Any
from llama_index.tools.tavily_research import TavilyToolSpec

def web_search_tavily(col: str, context_data: Dict[str, Any]) -> str:
    """
    Look up a missing car attribute on the web with the Tavily search tool.

    Args:
        col: Name of the column whose value is missing.
        context_data: Known (non-null) values from the same row; they are
            embedded verbatim in the search query.

    Returns:
        The text of the top search hit, or the literal string
        "Information not found." when the search raises, returns no hits,
        or the top hit carries no text.
    """
    fallback = "Information not found."

    try:
        # Create a query string that describes the missing information and car context
        query = f"Find the '{col}' of a car with the following details: {context_data}. Return only the '{col}'."

        # Initialize Tavily tool and ask for the single best hit
        tavily_tool = TavilyToolSpec(api_key=tavily_api_key)
        search_result = tavily_tool.search(query, max_results=1)
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller carry on.
        print(f"Error in Tavily search: {e}")
        return fallback

    if not search_result:
        return fallback

    # The search yields document objects, not strings; return the hit's text
    # so the function honours its `-> str` contract.
    top_hit = search_result[0]
    text = getattr(top_hit, "text", None)
    if text is None:
        text = str(top_hit)
    text = str(text).strip()
    return text or fallback


In [9]:
# Wrap web_search_tavily as an agent tool.
# Adjacent string literals are concatenated verbatim, so the first sentence
# ends with a space to keep the description from reading "attributes.Requires".
web_search_tool = FunctionTool.from_defaults(
    fn=web_search_tavily,
    name="web_search",
    description=(
        "Use this tool to perform a web search to assist you in the filling of missing car attributes. "
        "Requires 'col' (the column name) and 'context_data' (non-null data from the row) as parameters."
    ),
)

In [10]:
# %%
# Few-shot examples: three fully populated rows, stored as one dict per row.
# They are interpolated into the agent's system prompt and into each per-cell
# prompt to show the expected format of every column.
_example_fields = (
    'id', 'brand', 'model', 'model_year', 'milage', 'fuel_type',
    'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'price',
)
_example_rows = (
    (
        0, 'MINI', 'Cooper S Base', 2007, 213000, 'Gasoline',
        '172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel', 'A/T',
        'Yellow', 'Gray', 'None reported', 4200,
    ),
    (
        1, 'Lincoln', 'LS V8', 2002, 143250, 'Gasoline',
        '252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel', 'A/T',
        'Silver', 'Beige', 'At least 1 accident or damage reported', 4999,
    ),
    (
        2, 'Chevrolet', 'Silverado 2500 LT', 2002, 136731, 'E85 Flex Fuel',
        '320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capability', 'A/T',
        'Blue', 'Gray', 'None reported', 13900,
    ),
)
# zip keeps the field order, so each dict has the same key order as before.
examples = [dict(zip(_example_fields, row_values)) for row_values in _example_rows]

In [11]:
# %%
# System prompt for the agent. Each fragment ends with a space (or newline)
# because adjacent string literals are joined with nothing in between; without
# it the text reads "...as defined.Provide only...".
system_prompt_agent = (
    "You are an AI assistant that helps fill in missing car data in a dataset. "
    "For each missing value, analyze the row-level context provided. "
    "Determine which tool to find the missing value and invoke the necessary tool and pass the parameters as defined. "
    "Provide only the requested value without additional text. "
    "Provide the answer following the format in the examples below.\n\n"
    f"Here are some examples:\n{examples}"
)

In [12]:
# %%
# Build the agent with both lookup tools (PDF RAG and web search), the shared
# LLM from Settings, and the system prompt; verbose=True echoes tool calls.
agent = OpenAIAgent.from_tools(
    tools=[rag_tool, web_search_tool],
    system_prompt=system_prompt_agent,
    llm=Settings.llm,
    memory=None,
    verbose=True,
)

In [None]:
# %%
# Read the CSV that contains the gaps to fill, then preview its first rows.
dataset_with_nulls = pd.read_csv('null20.csv')
print("Dataset with null values:", dataset_with_nulls.head(), sep="\n")


In [14]:

def complete_null_values(agent, dataset: pd.DataFrame, demo_max=20, few_shot_examples=None):
    """Fill null cells in the first `demo_max` rows by asking the agent for each value.

    For every null cell the agent's chat history is reset, a prompt is built
    from the row's non-null values, and the agent's answer is written back
    into `dataset` as stripped text.

    Args:
        agent: Object exposing `reset()` and `chat(prompt)`; `str()` of the
            chat result is taken as the answer text.
        dataset: Frame to complete. It is modified in place.
        demo_max: Number of leading rows to process.
        few_shot_examples: Example rows embedded in the prompt. Defaults to
            the notebook-level `examples` list when omitted.

    Returns:
        The same `dataset` object, with the processed null cells filled.
    """
    if few_shot_examples is None:
        # Fall back to the notebook-level examples defined in an earlier cell.
        few_shot_examples = examples

    for i, row in dataset.head(demo_max).iterrows():
        # Non-null values of this row, used as context for every gap in it.
        # `row` is a snapshot, so answers filled below are not fed back in.
        context_data = row.dropna().to_dict()

        for col in dataset.columns:
            if not pd.isnull(row[col]):
                continue

            # Reset the agent's chat history before each new query
            agent.reset()

            # NOTE(review): the prompt says both tools take 'col' and
            # 'context_data'; the RAG tool is a QueryEngineTool, which
            # presumably takes a single query string instead — confirm.
            prompt = (
                f"Given the following examples: {few_shot_examples}, "
                f"and using the information available, "
                f"provide a clear and concise {col} value for the car using the following details from the current row: {context_data}. "
                f"Use the row context and the tools available to you to infer the correct {col}. "
                f"Both tools require the 'col' parameter which is the following string: {col} and the 'context_data' parameter which is the following dictionary: {context_data}. "
                f"Do not return 'I don't know' unless absolutely necessary. If information is missing, make a well-reasoned guess based on the context provided. "
                f"Return only the {col} value and nothing else. Ensure it follows the format seen in the examples provided."
            )

            print("Full prompt:", prompt)

            # Call the agent synchronously
            result = agent.chat(prompt)

            # Store the answer text, not the response object itself.
            answer = str(result).strip()

            # A column that held NaN may be numeric; cast it to object first
            # so writing a string does not rely on pandas' deprecated
            # implicit upcasting.
            if dataset[col].dtype != object:
                dataset[col] = dataset[col].astype(object)
            dataset.at[i, col] = answer

            print(f"Filling column '{col}' for row {i} with: {answer}")
    return dataset

In [None]:
# complete_null_values writes into the frame it is given, so pass a copy:
# `dataset_with_nulls` keeps its original gaps and this cell gives the same
# result when re-run.
completed_dataset = complete_null_values(agent, dataset_with_nulls.copy())

In [None]:
# Write the filled-in dataset to disk (without the index column), then preview it.
completed_dataset.to_csv('completed_car_data.csv', index=False)
print("Completed dataset:", completed_dataset.head(), sep="\n")