<a href="https://colab.research.google.com/github/SAHIL9581/Agentic_model_main/blob/main/updated2_agentic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
# Cell 1: Installation
!pip install -q pandas google-generativeai langchain-google-genai langchain-core langchain-experimental

In [26]:
# Cell 2: Imports & API Key Setup

import pandas as pd
import io
import getpass
import os
from IPython.display import display
from google.colab import files

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.tools import tool

import warnings
warnings.filterwarnings("ignore")

# --- Authentication ---
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

In [27]:
# Cell 3: Tool Definition (Corrected)

# This global dictionary will hold the content of uploaded files.
UPLOADED_FILES_CONTENT = {}
# This global variable will hold the active DataFrame for our new tool.
ACTIVE_DF = None

# --- Tool 1: For the original agentic search ---
class FileInspectionArgs(BaseModel):
    """Input schema for the file_inspector tool."""
    filename: str = Field(description="The name of the file to inspect from the list of uploaded files.")

@tool(args_schema=FileInspectionArgs)
def file_inspector(filename: str) -> str:
    """  <--- THIS DOCSTRING WAS MISSING
    Reads and returns the first 5 lines of a specified uploaded file (CSV or TXT).
    This helps in understanding the file's structure and content, especially for identifying columns.
    """
    if filename not in UPLOADED_FILES_CONTENT: return f"Error: File '{filename}' not found."
    content_bytes = UPLOADED_FILES_CONTENT[filename]
    try: content_str = content_bytes.decode('utf-8')
    except UnicodeDecodeError: content_str = content_bytes.decode('latin1')
    try:
        df = pd.read_csv(io.StringIO(content_str), on_bad_lines='skip', sep=None, engine='python', nrows=5)
        return f"Successfully read the first 5 rows of '{filename}':\n\n{df.to_string()}"
    except Exception as e:
        first_lines = "\n".join(content_str.splitlines()[:5])
        return f"Could not parse '{filename}' as a CSV, but here are the first 5 lines:\n\n{first_lines}"

# --- Tool 2: The NEW Code Interpreter for deep analysis ---
class CodeInterpreterArgs(BaseModel):
    """Input schema for the python_data_analyst tool."""
    pandas_command: str = Field(description="A single-line Python command using the 'df' variable to query the loaded pandas DataFrame.")

@tool(args_schema=CodeInterpreterArgs)
def python_data_analyst(pandas_command: str) -> str:
    """
    Executes a pandas command on the loaded DataFrame 'df' and returns the result.
    Use this for any questions that require filtering, calculating, or specific data lookups.
    Example: To find unique operators, the command would be "df['Operator'].unique()"
    """
    global ACTIVE_DF
    if ACTIVE_DF is None:
        return "Error: No DataFrame is currently loaded. Please load a file first."

    df = ACTIVE_DF # Use the globally loaded DataFrame

    try:
        print(f"⚙️ Executing command: {pandas_command}")
        local_scope = {}
        exec(f"result = {pandas_command}", {'df': df}, local_scope)
        result = local_scope.get('result', "Command executed, but no result was returned.")
        return f"Command execution successful. Result:\n{str(result)}"
    except Exception as e:
        return f"Error executing command: {e}. Please check your pandas command syntax."

In [28]:
# Cell 4: LLM Initialization

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest",
    temperature=0, # Low temperature for more predictable, factual analysis
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [30]:
UPLOADED_FILES_CONTENT = {}  # Global variable to store uploaded file contents

# Define a tool to inspect file content
@tool
def file_inspector(filename: str) -> str:
    """Inspects the first 10 lines of a file to determine its content."""
    global UPLOADED_FILES_CONTENT
    if filename not in UPLOADED_FILES_CONTENT:
        return f"File '{filename}' not found."
    try:
        content_bytes = UPLOADED_FILES_CONTENT[filename]
        content_str = content_bytes.decode('utf-8', errors='ignore')
        return "\n".join(content_str.splitlines()[:10])
    except Exception as e:
        return f"Error reading file '{filename}': {e}"

# Cell 5: Core Application Functions

def chat_with_dataframe(df: pd.DataFrame, filename: str):
    """Handles the interactive Q&A for a single, chosen dataframe."""
    print("\n" + "="*50)
    print(f"🔬 Now analyzing '{filename}' in detail.")
    print(f"Shape of the data: {df.shape}")
    print("\nFirst 5 rows:")
    display(df.head())

    max_rows_to_send = min(100, len(df))
    data_sample = df.head(max_rows_to_send).to_string() # index=True can be helpful

    print("\nYou can now ask detailed questions about this specific dataset.")
    print("Type 'back' to return to the main menu.")

    while True:
        question = input(f"\n💬 Enter your question about '{filename}' (or 'back'): ")
        if question.lower() == 'back':
            break

        prompt = f"""
        You are a helpful data analyst. Be concise but thorough.

        You are analyzing the file named '{filename}'.
        Here is a sample of the data (first {max_rows_to_send} rows):

        {data_sample}

        Question: {question}

        Provide your analysis based on this data. If the question can't be answered
        from the data, explain why and suggest what additional data would be needed.
        """
        print("\n🔍 Analyzing...")
        response = llm.invoke(prompt)
        print("\n🤖 Analysis Results:\n")
        print(response.content)

def analyze_folder():
    """
    Handles uploading a folder of files, using an LLM agent to identify relevant
    files, and then allowing the user to select one for detailed analysis.
    """
    global UPLOADED_FILES_CONTENT
    print("\nPlease upload all your CSV and/or TXT files:")
    uploaded = files.upload()

    if not uploaded:
        print("No files were uploaded. Returning to main menu.")
        return

    UPLOADED_FILES_CONTENT = uploaded
    filenames = list(uploaded.keys())
    print(f"\n✅ Successfully uploaded {len(filenames)} files: {', '.join(filenames)}")

    print("\n🤖 Asking AI agent to identify files with map data (lat, long, depth)...")

    llm_with_tools = llm.bind_tools([file_inspector])

    prompt = f"""
    You are a data analyst agent. Your task is to identify which of the following files are most likely to contain geographical map data.
    You are specifically looking for columns that contain latitude, longitude, lat, long, or depth information.

    To do this, you MUST use the `file_inspector` tool to examine the first few lines of each file.
    After inspecting the files, state which file or files are the most relevant and explain your reasoning.

    Here are the available files: {filenames}
    """

    agent_response = llm_with_tools.invoke(prompt)

    print("\n🤖 Agent's Recommendation:\n")
    print(agent_response.content)

    print("\n" + "="*50)
    print("📋 Previews of all uploaded files (first 5 lines):")
    all_dfs = {}
    for filename in filenames:
        print(f"\n--- {filename} ---")
        try:
            content_bytes = UPLOADED_FILES_CONTENT[filename]
            try:
                df = pd.read_csv(io.StringIO(content_bytes.decode('utf-8')), on_bad_lines='skip', sep=None, engine='python')
            except UnicodeDecodeError:
                df = pd.read_csv(io.StringIO(content_bytes.decode('latin1')), on_bad_lines='skip', sep=None, engine='python')

            all_dfs[filename] = df
            display(df.head())
        except Exception as e:
            all_dfs[filename] = None
            print(f"Could not display '{filename}' as a table. Error: {e}")
            print("Showing raw text instead:")
            print(content_bytes[:200].decode('utf-8', errors='ignore') + "...")
    print("\n" + "="*50)

    while True:
        choice = input("\nEnter the name of the file you want to analyze in detail (or 'back'): ").strip()
        if choice.lower() == 'back':
            break
        if choice in all_dfs and all_dfs[choice] is not None:
            chat_with_dataframe(all_dfs[choice], choice)
            break
        elif choice in all_dfs and all_dfs[choice] is None:
            print(f"❌ Cannot analyze '{choice}' as it could not be parsed into a table.")
        else:
            print("❌ Invalid filename. Please enter one of the uploaded file names.")

def analyze_multiple_files():
    """
    Allows the user to upload multiple files and ask questions that can be
    answered by combining information from all uploaded files.
    """
    global UPLOADED_FILES_CONTENT
    print("\nPlease upload all your CSV and/or TXT files:")
    uploaded = files.upload()

    if not uploaded:
        print("No files were uploaded. Returning to main menu.")
        return

    UPLOADED_FILES_CONTENT = uploaded
    filenames = list(uploaded.keys())
    print(f"\n✅ Successfully uploaded {len(filenames)} files: {', '.join(filenames)}")

    all_dfs = {}
    for filename in filenames:
        try:
            content_bytes = UPLOADED_FILES_CONTENT[filename]
            try:
                df = pd.read_csv(io.StringIO(content_bytes.decode('utf-8')), on_bad_lines='skip', sep=None, engine='python')
            except UnicodeDecodeError:
                df = pd.read_csv(io.StringIO(content_bytes.decode('latin1')), on_bad_lines='skip', sep=None, engine='python')
            all_dfs[filename] = df
        except Exception as e:
            all_dfs[filename] = None
            print(f"Could not parse '{filename}'. Error: {e}")

    print("\nYou can now ask questions that can be answered by combining data from all uploaded files.")
    print("Type 'back' to return to the main menu.")

    while True:
        question = input("\n💬 Enter your question (or 'back'): ").strip()
        if question.lower() == 'back':
            break

        # Construct the prompt with information about all files
        prompt = "You are a data analyst.  You have access to the following dataframes:\n\n"
        for filename, df in all_dfs.items():
            if df is not None:
                prompt += f"--- {filename} ---\n"
                prompt += df.head(5).to_string() + "\n\n"
            else:
                prompt += f"--- {filename} --- (Could not be parsed as a dataframe)\n\n"

        prompt += f"Question: {question}\n"
        prompt += "Provide a concise and thorough answer based on the available data. If the question cannot be answered, explain why."

        print("\n🔍 Analyzing...")
        response = llm.invoke(prompt)
        print("\n🤖 Analysis Results:\n")
        print(response.content)

def main():
    """The main entry point of the application."""
    while True:
        print("\n===== Gemini AI Assistant =====")
        print("1. Analyze a Folder of Data Files (CSV/TXT)")
        print("2. Analyze Multiple Files and Combine Data")
        print("3. General Chat with Gemini")
        print("4. Exit")

        choice = input("Select an option (1-4): ").strip()

        if choice == "1":
            analyze_folder()
        elif choice == "2":
            analyze_multiple_files()
        elif choice == "3":
            question = input("\n💬 Enter your question: ").strip()
            if not question: continue
            print("\n🤖 Thinking...")
            response = llm.invoke(question)
            print("\nResponse:\n")
            print(response.content)
        elif choice == "4":
            print("\nGoodbye!")
            break
        else:
            print("Invalid choice. Please try again.")

if __name__ == "__main__":
    # This is needed for the files.upload() function to work in environments like Google Colab
    try:
        from google.colab import files
    except ImportError:
        print("Not running in Google Colab.  files.upload() will not work.")
        files = None  # Set files to None to avoid errors if not in Colab

    main()


===== Gemini AI Assistant =====
1. Analyze a Folder of Data Files (CSV/TXT)
2. Analyze Multiple Files and Combine Data
3. General Chat with Gemini
4. Exit
Select an option (1-4): 2

Please upload all your CSV and/or TXT files:


Saving Well data.CSV to Well data (2).CSV
Saving Formation tops.TXT to Formation tops (2).TXT

✅ Successfully uploaded 2 files: Well data (2).CSV, Formation tops (2).TXT

You can now ask questions that can be answered by combining data from all uploaded files.
Type 'back' to return to the main menu.

💬 Enter your question (or 'back'): what is this data about

🔍 Analyzing...

🤖 Analysis Results:

The data consists of two files related to oil and gas wells.

* **`Well data (2).CSV`**: This file contains information about individual wells, including well identifiers (UWI, Well Number), location data (latitude, longitude, coordinates), operational details (operator, well status, completion date, abandonment date), production data (cumulative oil, gas, water), and formation information (formation at total depth, producing formation).

* **`Formation tops (2).TXT`**: This file appears to contain formation top data for at least one well (UWI 42001004370000).  It lists formation names, a sourc

Saving Well data.CSV to Well data (3).CSV
Saving Formation tops.TXT to Formation tops (3).TXT

✅ Successfully uploaded 2 files: Well data (3).CSV, Formation tops (3).TXT

🤖 Asking AI agent to identify files with map data (lat, long, depth)...

🤖 Agent's Recommendation:



📋 Previews of all uploaded files (first 5 lines):

--- Well data (3).CSV ---


Unnamed: 0,WSN,UWI (APINum),Well Number,Well Name,Well Label,Sym Code,Operator,Hist Oper,Lease Name,Lease Nbr,...,WELL\r CUMWTR,WELL\r WHIPSTOCK,WELL\r COMP_DATE,WELL\r SPUD_DATE,WELL\r PERMIT_DATE,WELL\r ABAND_DATE,WELL\r REPORT_DATE,WELL\r RPT_DATE,WELL\r LAST_ACT_DATE,WELL\r TD_DATE
0,2492,42001004370000,6-B,,42001000000000.0,PLUGGAS,YOUNG MARSHALL R DRL,ROESER & PENDLETON,L C BILLETT,6-B,...,,,11/16/1943,08/09/1943,07/30/1943,11/08/1961,04/01/1977,,06/30/2022,
1,2493,42001004380000,5-B,,42001000000000.0,TA-OIL,ROESER & PENDLETON,ROESER & PENDLETON,BILLETT L R,5-B,...,,,08/25/1936,08/04/1936,07/25/1936,,12/29/1992,,06/30/2022,
2,2706,42001300900000,SWD-1,,42001300000000.0,SWDOP,ENR OPERATING LLC,FAULCONER VERNON E,WATHEN BEN H,SWD-1,...,,,10/17/1969,10/10/1969,09/22/1969,,11/11/2009,,11/02/2023,
3,2712,42001300950000,1,,42001300000000.0,DRY,SPENCE RALPH,SPENCE RALPH,EZEM G SCARBOROUGH,1,...,,,11/27/1969,10/24/1969,10/14/1969,,12/01/1969,,04/14/2021,
4,2741,42001301360000,1,,42001300000000.0,DRY,FARISH W S,FARISH W S,O L ELLIS,1,...,,,02/15/1970,02/05/1970,01/26/1970,,03/01/1970,,02/24/2015,



--- Formation tops (3).TXT ---


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,<-----UWI(API)----->,<----------FM,NAME----------->,<-SOURCE->,<---MD,VALUE-->
42001004370000,,,,,,,605EGFDK,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,3930.0
42001004370000,,,,,,,602CRCSL,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,4653.0
42001004370000,,,,,,,602ADSGU,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,6748.0
42001004370000,,,,,,,602ADSGL,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,7450.0
42001004370000,,,,,,,602GLRSU,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,6128.0




Enter the name of the file you want to analyze in detail (or 'back'): Well data (3).CSV 

🔬 Now analyzing 'Well data (3).CSV' in detail.
Shape of the data: (15381, 47)

First 5 rows:


Unnamed: 0,WSN,UWI (APINum),Well Number,Well Name,Well Label,Sym Code,Operator,Hist Oper,Lease Name,Lease Nbr,...,WELL\r CUMWTR,WELL\r WHIPSTOCK,WELL\r COMP_DATE,WELL\r SPUD_DATE,WELL\r PERMIT_DATE,WELL\r ABAND_DATE,WELL\r REPORT_DATE,WELL\r RPT_DATE,WELL\r LAST_ACT_DATE,WELL\r TD_DATE
0,2492,42001004370000,6-B,,42001000000000.0,PLUGGAS,YOUNG MARSHALL R DRL,ROESER & PENDLETON,L C BILLETT,6-B,...,,,11/16/1943,08/09/1943,07/30/1943,11/08/1961,04/01/1977,,06/30/2022,
1,2493,42001004380000,5-B,,42001000000000.0,TA-OIL,ROESER & PENDLETON,ROESER & PENDLETON,BILLETT L R,5-B,...,,,08/25/1936,08/04/1936,07/25/1936,,12/29/1992,,06/30/2022,
2,2706,42001300900000,SWD-1,,42001300000000.0,SWDOP,ENR OPERATING LLC,FAULCONER VERNON E,WATHEN BEN H,SWD-1,...,,,10/17/1969,10/10/1969,09/22/1969,,11/11/2009,,11/02/2023,
3,2712,42001300950000,1,,42001300000000.0,DRY,SPENCE RALPH,SPENCE RALPH,EZEM G SCARBOROUGH,1,...,,,11/27/1969,10/24/1969,10/14/1969,,12/01/1969,,04/14/2021,
4,2741,42001301360000,1,,42001300000000.0,DRY,FARISH W S,FARISH W S,O L ELLIS,1,...,,,02/15/1970,02/05/1970,01/26/1970,,03/01/1970,,02/24/2015,



You can now ask detailed questions about this specific dataset.
Type 'back' to return to the main menu.

💬 Enter your question about 'Well data (3).CSV' (or 'back'): what is this data about

🔍 Analyzing...

🤖 Analysis Results:

This dataset contains information on oil and gas wells, primarily located in Anderson County, Texas.  The data includes well identification numbers (UWI, Well Number), location coordinates (surface and bottom),  operator information, lease details, formation data (Fm at TD, Prod Fm), well status ("Well Remarks" field indicating status like ABD-GW, OIL, D&A etc.), and various dates related to the well's lifecycle (completion, spud, permit, abandonment, report dates).  The data suggests a historical perspective, encompassing wells drilled over a significant time span.  Further analysis could reveal trends in drilling activity, production, and well types across different operators and formations.

💬 Enter your question about 'Well data (3).CSV' (or 'back'): now gi