In [None]:
!source ag/bin/activate

In [None]:
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.tools import Tool, StructuredTool
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.memory import ConversationSummaryBufferMemory
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
import requests
from bs4 import BeautifulSoup
import pandas_gbq
import os
import tempfile
import fitz  # PyMuPDF
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse
import logging
import pandas as pd
from typing import List

# --- Environment Setup ---
# Prefer a key that is already exported in the environment so a real
# credential is never overwritten by the placeholder (and never has to be
# committed to source). The placeholder is used only when the variable is
# unset or empty.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "your_openai_api_key"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
# Chat model name used by both the agent LLM and the code-generation tool.
model = "gpt-4o-mini"

# --- Pydantic Models for Tool Input ---
class ReadFileRequest(BaseModel):
    # Argument schema attached to the `read_data_file` tool.
    # file_path: location of the data file to load.
    file_path: str = Field(description="The path to the data file.")

class GenerateAnalysisCodeRequest(BaseModel):
    # Argument schema attached to the `generate_data_analysis_code` tool.
    # file_path: location of the data file the generated code should work on.
    file_path: str = Field(description="The path to the data file.")
    # analysis_objective: free-text goal that is interpolated into the
    # code-generation request.
    analysis_objective: str = Field(description="The objective of the request (e.g., answer).")

class ExecuteCodeRequest(BaseModel):
    # Argument schema attached to the `execute_python_code` tool.
    # code: Python source text to be executed.
    code: str = Field(description="The Python code to execute.")

class AnalyzeResultsRequest(BaseModel):
    # Schema describing the inputs of a result-analysis step.
    # NOTE(review): no tool in the visible code references this model.
    # code_output: text captured from running the code.
    code_output: str = Field(description="The output from executing the code.")
    # plot_files: paths of files produced by the run.
    plot_files: List[str] = Field(description="List of paths to the generated useful data.")

class CreateReportRequest(BaseModel):
    # Schema describing the inputs of a report-creation step.
    # NOTE(review): no tool in the visible code references this model.
    # analysis: textual analysis to include in the report.
    analysis: str = Field(description="The analysis of the results.")
    # plot_files: paths of generated files to include in the report.
    plot_files: List[str] = Field(description="List of paths to the generated data files.")

# --- Helper Functions ---
def read_data_file(file_path: str) -> str:
    """Read a CSV or Excel file and return a textual summary of its contents.

    The summary contains the ``DataFrame.info()`` report followed by the
    first five rows rendered with ``to_string()``.

    Args:
        file_path: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file. The
            extension is matched case-insensitively.

    Returns:
        A description string on success. If the extension is not supported or
        reading fails, a string starting with ``"Error"`` is returned instead;
        errors are returned rather than raised.
    """
    import io  # function-scope import keeps this block self-contained

    lowered_path = file_path.lower()
    try:
        if lowered_path.endswith(".csv"):
            df = pd.read_csv(file_path)
        elif lowered_path.endswith((".xlsx", ".xls")):
            df = pd.read_excel(file_path)
        else:
            return f"Error: Unsupported file format: {file_path}"

        # DataFrame.info() writes its report to a stream and returns None, so
        # capture it in a buffer instead of interpolating the return value
        # (which would put the literal text "None" into the summary).
        info_buffer = io.StringIO()
        df.info(buf=info_buffer)

        return (
            f"Successfully read data from {file_path}. "
            f"DataFrame info:\n{info_buffer.getvalue()}\n\n"
            f"First 5 rows:\n{df.head().to_string()}"
        )
    except Exception as e:
        # Any failure is reported as text so the caller always gets a string.
        return f"Error reading file: {e}"

def execute_python_code(code: str) -> tuple[str, List[str]]:
    """Run Python source in a child interpreter and report what it produced.

    The code is written to a temporary ``.py`` file, executed with the same
    interpreter that is running this process, and the temporary file is
    removed afterwards.

    Args:
        code: Python source text to execute.

    Returns:
        A ``(message, plot_files)`` tuple. On success ``message`` starts with
        ``"Code executed successfully. Output:"`` followed by the captured
        stdout (and any stderr text, labelled as warnings), and ``plot_files``
        lists image files (``.png``/``.jpg``/``.jpeg``) found in the current
        working directory. On failure ``message`` starts with ``"Error"`` and
        ``plot_files`` is empty. Errors are returned rather than raised.
    """
    import subprocess
    import sys

    script_path = None
    try:
        # UTF-8 is what the interpreter assumes for source files, so write the
        # script with that encoding regardless of the platform default.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as tmp_file:
            tmp_file.write(code)
            script_path = tmp_file.name

        # NOTE(security): this runs caller-supplied code with the privileges of
        # the current process; nothing here sandboxes it.
        # sys.executable guarantees the child uses the same interpreter (and
        # therefore the same installed packages) as this process.
        completed = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            check=False,
        )

        # Decide success by exit status: libraries often write warnings to
        # stderr on a successful run, and a failing run may leave it empty.
        if completed.returncode != 0:
            details = completed.stderr or f"Process exited with code {completed.returncode}"
            return f"Error during code execution:\n{details}", []

        # Basic heuristic: report image files present in the working directory.
        # Only the top level is scanned and pre-existing images are included.
        plot_files = sorted(
            name
            for name in os.listdir()
            if name.lower().endswith((".png", ".jpg", ".jpeg"))
        )

        message = f"Code executed successfully. Output:\n{completed.stdout}"
        if completed.stderr:
            message += f"\nWarnings (stderr):\n{completed.stderr}"
        return message, plot_files
    except Exception as e:
        return f"Error executing code: {e}", []
    finally:
        if script_path and os.path.exists(script_path):
            os.remove(script_path)

# --- Tools ---
def _generate_analysis_code(file_path, analysis_objective):
    """Ask a newly created ChatOpenAI model for analysis code and return its reply."""
    instruction = f"Generate Python code for {analysis_objective} on the data in '{file_path}'. Include saving relevant plots to files in a directory named 'eda_plots'."
    return ChatOpenAI(model=model).predict(instruction)


# Tools made available to the agent: inspect a file, generate code, run code.
tools = [
    # Summarises a CSV/Excel file via read_data_file.
    Tool(
        args_schema=ReadFileRequest,
        name="read_data_file",
        description="Useful for reading a data file (CSV or Excel) and getting information about its columns and data types. Input should be the file path.",
        func=read_data_file,
    ),
    # Produces analysis code from a file path and an objective.
    StructuredTool(
        args_schema=GenerateAnalysisCodeRequest,
        name="generate_data_analysis_code",
        description="Generates Python code for data analysis ops based on the file and the objective. Input should be the file path and the objective (e.g., 'data analysis').",
        func=_generate_analysis_code,
    ),
    # Runs Python source via execute_python_code.
    Tool(
        args_schema=ExecuteCodeRequest,
        name="execute_python_code",
        description="Executes Python code and returns the output and paths to any generated data files. Input is the Python code.",
        func=execute_python_code,
    ),
]


# --- Initialize Agent ---
# Chat model used by the agent, created with the module-level `model` name and
# max_tokens=8000.
llm = ChatOpenAI(model=model, max_tokens=8000)
# Conversation memory stored under the "chat_history" key; it is given the
# same `llm` instance (presumably to summarise older turns — confirm against
# the LangChain docs).
memory = ConversationSummaryBufferMemory(llm=llm, memory_key="chat_history")

# Construct the prompt with HumanMessagePromptTemplate
# The template text contains three placeholders: {tool_descriptions}, {input}
# and {agent_scratchpad}.
# NOTE(review): `prompt` is assembled here but is not passed to
# `initialize_agent` in the visible code, so it appears to have no effect on
# the agent — confirm whether it should be wired in or removed.
prompt_messages = [
    HumanMessagePromptTemplate.from_template("""
You are an expert data analyst and Python programmer specializing in high-performance computing. Your task is to:
- Generate Python code to maximize hardware capabilites, leveraging all available CPU cores, GPU acceleration (if applicable), and parallel processing for large dataset handling.
- Incorporate tools such as threading, and hardware-accelerated libraries like `NumPy`, `Pandas`, or `Dask` to ensure the code scales efficiently for large datasets.
- Leverage multiprocessing and gpu(concurrent.futures, multiprocessing, scikit-learn, pyspark, numpy arrays)  libraries to speed up workflow.

Available tools:
{tool_descriptions}

Use the following format:

Input: the input to the tool
Thought: I need to use a tool to help me answer the question.
Action: the name of the tool to use.
Action Input: the input to the tool.
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer.
Final Answer: the final answer to the original input question

Your goal is to ensure the generated code is optimized for large-scale data processing and hardware acceleration wherever applicable.

Begin!

Question: {input}
{agent_scratchpad}""")
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

# Initialize the agent
# Builds the executor from the LLM, the tool list and the conversation memory.
# NOTE(review): the `prompt` object constructed earlier in this file is not
# among these arguments.
agent_executor = initialize_agent(
    llm=llm,
    tools=tools,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,  # Choose an appropriate agent type
    verbose=True,
    memory=memory,
    # A string (not a bool) is supplied here; presumably it is fed back to the
    # model when its output cannot be parsed — confirm against LangChain docs.
    handle_parsing_errors="Check your output format and try again.",
    # NOTE(review): 10000 is a very high ceiling on agent steps; confirm this
    # is intended, since each step may involve an LLM call.
    max_iterations=10000
)

# --- Example Usage ---
# Absolute path of the input CSV; it is interpolated into the task prompt below.
file_path = "/home/jovyan/Autogen/normalized_data.csv"  # Change this to a data file that exists on your machine

# Task description handed to the agent. This is an f-string whose only
# placeholder is {file_path} on the final line; everything else is sent
# verbatim.
prompt_text = f""" 
Objective:
Develop a comprehensive, end-to-end data analysis pipeline for a given dataset. The solution should load, clean, and process the data robustly (e.g., handling type conversion issues), and produce a detailed textual analysis.

Instructions:

    Data Ingestion & Preprocessing:
        Load the dataset from a CSV (or similar structured) file.
        Identify columns that may contain JSON-like strings and automatically convert these into structured (e.g., dictionary) objects. Ensure that non-standard formats (such as single-quoted JSON) are handled gracefully using appropriate parsing methods (e.g., ast.literal_eval or json.loads with preprocessing).
        Automatically detect and resolve common data type issues (for example, converting numeric fields safely using pd.to_numeric with error handling, parsing datetime fields, etc.), so that errors such as "string to float" conversion issues are avoided.
    Performance & Scalability:
        Utilize threading to distribute the load across all available CPU cores (24 physical, 48 logical with Hyper-Threading) where it can effectively speed up data processing tasks.
        Where applicable and beneficial (for example, heavy computations or large-scale data processing), leverage GPU acceleration (e.g., an RTX 2080 with 8GB VRAM) to improve performance.
    Deliverables:
        Provide complete, well-documented code that performs all of the above tasks.
        The output should include both the generated visualizations (charts, graphs, etc.) and a detailed textual analysis explaining the reasoning and insights derived from the data.

Task: Itendify any hidden connections visulize the data in time domian plots ect.

Hardware Available:

    CPU: 24 cores (48 logical cores with Hyper-Threading)
    GPU: RTX 2080 with 8GB VRAM
    

Additional Notes:

    The code must be written to be resilient against common data issues, such as inconsistent formatting or unexpected data types.
    Ensure that any potential errors (e.g., conversion issues) are handled gracefully.
    The solution should be modular enough to be adapted easily to different datasets with similar challenges.
    any plots generated use tight_layout() to generate dynamic sizes plots to fit your labels
    use spaCy for word similariy and sematic search when interacting or searching for message objects. 

The path of the dataset is '{file_path}'"""

# Run the agent on the task prompt and print whatever it returns.
response = agent_executor.run(prompt_text)
print(response)