In [None]:
# %%
import os
import nest_asyncio
import pandas as pd
import asyncio
import json
from typing import List, Sequence, Dict, Any

import openai
from toolhouse import Toolhouse
from llama_index.core.tools import FunctionTool, BaseTool
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import NodeWithScore
from llama_index.core.workflow import Workflow, step, Context, StartEvent, StopEvent
from llama_index.core import (
    ServiceContext, SimpleDirectoryReader, Document, StorageContext, Prompt, GPTVectorStoreIndex,
    VectorStoreIndex, SummaryIndex
)
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.types import ChatMessage, MessageRole
from pydantic import BaseModel, Field

from pinecone import Pinecone, ServerlessSpec

from llama_parse import LlamaParse

from IPython.display import Markdown, display

from llama_index.agent.openai import OpenAIAgent

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings

# Patch asyncio with nest_asyncio so event loops can be nested (useful in Jupyter notebooks).
# Must run before any cell that drives async code from this kernel.
nest_asyncio.apply()

In [None]:
from pinecone import ServerlessSpec

from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.tools import QueryEngineTool
from dotenv import load_dotenv
# Load environment variables via python-dotenv before the os.getenv lookups in the next cell.
# NOTE(review): presumably the API keys live in a local .env file — confirm.
load_dotenv()

In [3]:
# Read the service credentials from the process environment.
# Each name is bound to None when its variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
toolhouse_api_key = os.environ.get("TOOLHOUSE_API_KEY")
tavily_api_key = os.environ.get("TAVILY_API_KEY")

In [4]:
# Register the LLM on the shared Settings object; the agent cell below reads it back
# through Settings.llm.
Settings.llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.2,
)


In [5]:
# System prompt for the column-augmentation agent.
# Adjacent string literals are concatenated verbatim, so every fragment except the
# last must end with a space; otherwise sentences run together (e.g. "etc.)Only").
system_prompt_third_agent = (
    "You are an AI assistant that helps augment car datasets by suggesting additional columns and filling them out based on the context provided. "
    "Given the following car dataset, suggest 3 new columns relevant to the type of data and generate plausible values (e.g. Car body type, horsepower, country of manufacturer, is the vehicle a luxury vehicle (Yes/No), etc.). "
    "Only choose new columns to add that you can provide truthful values for grounded in the context and your general knowledge. "
    "Avoid adding columns where the rows would contain many None or NaN values. "
    "For each row, return the suggestions and values in JSON format, with the column names as keys and lists of values corresponding to each row. "
    "Ensure that the number of rows in the newly produced column matches the number of rows in the existing columns."
)

In [6]:
# Build the column-augmentation agent: no tools are passed, only the shared LLM
# and the system prompt defined above.
third_agent = OpenAIAgent.from_tools(
    system_prompt=system_prompt_third_agent,
    llm=Settings.llm,
    memory=None,  # NOTE(review): presumably falls back to the agent's default memory — confirm
    verbose=True,
)

In [None]:
# %%
# Load the input CSV and preview its first rows under a heading.
dataset_with_nulls = pd.read_csv('completed_car_data2.csv')
print("Dataset with null values:", dataset_with_nulls.head(), sep="\n")

In [8]:
def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) from ``text``.

    Text that is not fenced is returned with only its outer whitespace trimmed.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop the optional language tag that follows the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def suggest_and_fill_columns_by_row(agent, dataset: pd.DataFrame) -> pd.DataFrame:
    """Ask ``agent`` for additional columns one row at a time and add them to ``dataset``.

    For every row the agent is reset, queried with the row serialised as JSON
    (prefixed with the module-level ``system_prompt_third_agent``), and its reply
    is parsed as a JSON object mapping new column names to values.

    Args:
        agent: Object exposing ``reset()`` and ``chat(str)``; the value returned
            by ``chat`` must have a ``response`` attribute holding the reply text.
        dataset: Frame to augment. Any index is accepted; rows are matched to the
            generated values by position, not by index label.

    Returns:
        The same ``dataset`` object, mutated in place: one column is added (or
        overwritten, if the name already exists) per key the agent returned.
        Rows whose reply could not be parsed, or that lacked a given key, hold
        ``None`` in that column.

    Notes:
        Failures for a single row are printed and skipped (best effort); they do
        not abort the run.
    """
    row_count = len(dataset)
    # Maps new column name -> list of per-row values, indexed by row position.
    new_column_data = {}

    # enumerate() supplies the positional offset; the iterrows() label is kept
    # only for log messages, because labels need not be 0..n-1.
    for position, (index, row) in enumerate(dataset.iterrows()):
        row_str = row.to_json()

        query_str = (
            f"{system_prompt_third_agent}\n"
            f"Given the following car dataset row:\n{row_str}\n"
            "Please suggest new columns and generate plausible values for this row. "
            "Return only the new columns and values in JSON format."
        )

        agent.reset()  # Reset the agent to avoid residual memory effects

        try:
            result = agent.chat(query_str).response

            # Debugging the raw response
            print(f"Raw agent response for row {index}: {result}")

            row_new_columns = json.loads(_strip_code_fence(result))

            if not isinstance(row_new_columns, dict):
                print(
                    f"Error parsing the agent response for row {index}: "
                    f"expected a JSON object, got {type(row_new_columns).__name__}"
                )
                continue

            for column, value in row_new_columns.items():
                if column not in new_column_data:
                    # Pre-fill with None so rows without a value stay empty.
                    new_column_data[column] = [None] * row_count

                # The prompt asks for lists of values; unwrap a single-element list
                # to its scalar and store anything else unchanged.
                if isinstance(value, list) and len(value) == 1:
                    value = value[0]
                new_column_data[column][position] = value

        except json.JSONDecodeError as e:
            print(f"Error parsing the agent response for row {index}: {e}")

        except Exception as e:
            # Deliberate best effort: one failing row must not abort the whole run.
            print(f"An unexpected error occurred for row {index}: {e}")

    # Add the new columns to the dataset (list assignment is positional).
    for column, values in new_column_data.items():
        dataset[column] = values

    return dataset

In [None]:
# suggest_and_fill_columns_by_row adds columns to the frame it receives, so pass a
# copy: the frame loaded from disk stays untouched and re-running this cell does
# not feed previously generated columns back to the agent.
completed_dataset_with_additional_columns = suggest_and_fill_columns_by_row(
    third_agent, dataset_with_nulls.copy()
)


In [None]:
# Persist the augmented frame (without the index column), then preview its first rows.
completed_dataset_with_additional_columns.to_csv(
    'completed_car_data_with_additional_columns.csv',
    index=False,
)
print(
    "Completed dataset with additional columns:",
    completed_dataset_with_additional_columns.head(),
    sep="\n",
)