In [37]:
import os
from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import numpy as np

import tempfile
import csv

import seaborn
import matplotlib.pyplot as plt

from langchain_groq import ChatGroq
 

True

In [38]:
# Resolve the Groq API key: prefer the environment (.env is loaded first),
# otherwise ask for it interactively.
from getpass import getpass  # local to this cell: hides the secret while it is typed

load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Store the answer in GROQ_API_KEY itself (not a throwaway variable) and
    # export it, so code that reads the environment variable can see it too.
    GROQ_API_KEY = getpass("Enter your GROQ API key: ").strip()
    if GROQ_API_KEY:
        os.environ["GROQ_API_KEY"] = GROQ_API_KEY

if GROQ_API_KEY:
    print("‚úÖ GROQ_API_KEY is set successfully.")

‚úÖ GROQ_API_KEY is set successfully.


In [39]:
def get_llm(model: str = "openai/gpt-oss-120b", temperature: float = 1.0) -> ChatGroq:
    """
    Build a ChatGroq client.

    Args:
        model: Model identifier forwarded to ChatGroq.
        temperature: Sampling temperature forwarded to ChatGroq.

    Returns:
        A ChatGroq instance constructed with exactly these two settings.
    """
    llm_settings = {"model": model, "temperature": temperature}
    return ChatGroq(**llm_settings)

In [44]:
def preprocess_and_save(file_path):
    """
    Load a CSV or Excel file into a DataFrame and normalise its column types.

    Steps:
      1. Read the file, treating 'NA', 'N/A' and 'missing' as missing values.
      2. In text columns, double every double-quote character. Missing values
         stay missing (they are not turned into the string 'nan').
      3. Parse every column whose name contains "date" with ``pd.to_datetime``;
         values that cannot be parsed become NaT.
      4. Convert the remaining text columns to numeric when all values parse;
         otherwise leave them as text.

    On success the frame is also published as the module-level ``df`` for
    backward compatibility with code that reads that global.

    Args:
        file_path: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file
            (extension matched case-insensitively).

    Returns:
        ``(DataFrame, None)`` on success, ``(None, error_message)`` on failure.
        Errors are reported through the return value, never raised.
    """
    global df
    na_markers = ['NA', 'N/A', 'missing']

    def _is_text(series):
        # Covers classic object columns as well as pandas' dedicated string dtypes.
        return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)

    try:
        suffix = os.path.splitext(str(file_path))[1].lower()
        if suffix == ".csv":
            frame = pd.read_csv(file_path, encoding='utf-8', na_values=na_markers)
        elif suffix in (".xlsx", ".xls"):
            frame = pd.read_excel(file_path, na_values=na_markers)
        else:
            return None, "Unsupported file format."

        # Clean text columns: double embedded quotes, but only on real values.
        # astype(str) alone would convert NaN into the literal text 'nan'.
        for col in [c for c in frame.columns if _is_text(frame[c])]:
            present = frame[col].notna()
            doubled = frame[col].astype(str).str.replace('"', '""', regex=False)
            frame[col] = doubled.where(present)

        # Convert dates and numerics.
        for col in frame.columns:
            # str(col): column labels are not guaranteed to be strings.
            if 'date' in str(col).lower():
                frame[col] = pd.to_datetime(frame[col], errors='coerce')
            elif _is_text(frame[col]):
                try:
                    frame[col] = pd.to_numeric(frame[col])
                except (ValueError, TypeError):
                    # Genuinely textual column: keep it unchanged.
                    pass

        # Publish only a fully processed frame, never a half-converted one.
        df = frame
        return frame, None
    except Exception as e:
        return None, str(e)



In [55]:
# System prompt for the code-generating model. It is a plain (non-f) string on
# purpose: nothing is interpolated here, so literal braces can be added later
# without being parsed as replacement fields.
prompt = """
You are an expert Python data analyst assistant.

You are provided with a pandas DataFrame called `df`. 
Your job is to **generate Python code** that directly answers the user's data-related question using pandas, numpy, matplotlib, seaborn, or built-in Python methods.

### Guidelines:
- Only return **pure Python code** (no text, no explanations).
- Always assign the **final answer** to a variable named `result`.
- **Do not modify `df` directly** ‚Äî if you need to transform or filter data, create a copy (e.g., `df_copy = df.copy()`).
- The code should be **ready to execute** in a Python environment.
- If a visualization or statistical summary is needed, generate it in code form (e.g., using `matplotlib`, `seaborn`, or `pandas` plotting functions).
- Never include print statements or comments.
- Output must be inside a **single Python code block**.
"""



In [None]:

from langchain.messages  import HumanMessage, AIMessage, SystemMessage


def _extract_code(reply):
    """Return the Python source from a model reply, without its Markdown fence.

    If the reply contains a ``` / ```python fenced block, the contents of the
    first such block are returned (an unterminated fence runs to the end of
    the reply). Otherwise the whole reply, stripped of surrounding whitespace,
    is returned.

    str.strip("```python") must not be used for this: it removes any of the
    characters `, p, y, t, h, o, n from both ends and can eat the code itself.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    fenced = re.search(
        r"```(?:python|py)?[ \t]*\r?\n(.*?)(?:```|\Z)",
        reply,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if fenced:
        return fenced.group(1).strip()
    return reply.strip()


def QA_on_data(file_path):
    """
    Run an interactive question-answering session over a data file.

    The file is loaded with ``preprocess_and_save``; each question typed by
    the user is sent to the LLM together with the running chat history, the
    returned code is executed against the loaded DataFrame, and the value the
    code assigns to ``result`` is printed. Typing ``exit`` ends the session.

    Args:
        file_path: Path of the data file handed to ``preprocess_and_save``.

    Returns:
        None. All feedback, including errors, is printed.
    """
    if not GROQ_API_KEY:
        print("‚ùå Please provide your Groq API key.")
        return

    df, err = preprocess_and_save(file_path)
    if err:
        # Stop here: on error `df` is None and there is nothing to analyse.
        print(f"‚ùå Error: {err}")
        return

    print("‚úÖ File successfully processed.")
    print("\nüìä Preview of first 5 rows:")
    print(df.head())

    try:
        model = get_llm()
    except Exception as e:
        print(f"‚ùå Exception occurred: {e}")
        return

    # list(...) gives the full column list; the Index repr may be abbreviated
    # for wide frames and carries dtype noise.
    chat_history = [SystemMessage(prompt + f"The available columns in `df` are:{list(df.columns)}")]

    while True:
        query = input("Give your prompt?")
        # Compare with the string "exit"; the bare name `exit` is a builtin
        # object and never equals user input.
        if query.strip().lower() == "exit":
            break
        if not query.strip():
            continue
        print(query)

        chat_history.append(HumanMessage(query))
        try:
            response = model.invoke(chat_history)
        except Exception as e:
            # Drop the unanswered question so the history stays consistent.
            chat_history.pop()
            print(f"‚ùå Exception occurred: {e}")
            continue
        chat_history.append(AIMessage(response.content))

        code_generated = _extract_code(response.content)
        print("\nü§ñ Generated Code:\n")
        print(code_generated)

        # SECURITY NOTE(review): this executes model-generated code with the
        # full privileges of the kernel. Only use with trusted models/data, or
        # move execution into a sandbox.
        # A single namespace is used for both globals and locals so functions,
        # lambdas and comprehensions in the generated code can see `df` and
        # the pre-imported libraries.
        namespace = {
            "df": df,
            "pd": pd,
            "np": np,
            "plt": plt,
            "sns": seaborn,
            "seaborn": seaborn,
        }
        try:
            exec(code_generated, namespace)
        except Exception as e:
            # A failing snippet should not end the whole session.
            print(f"‚ùå Exception occurred: {e}")
            continue

        result = namespace.get("result", "‚ö†Ô∏è No result generated.")
        print("\nüìà Final Result:\n")
        if isinstance(result, pd.DataFrame):
            print(result.to_string(index=False))
        else:
            print(result)



In [58]:
# Dataset passed to QA_on_data in the next cell.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run elsewhere unless this is edited; consider a path relative to the project.
DATA_FILE_PATH = r"D:\Agentic_AI\autonomous-data-analyst\Data_Analyst\data\datasets\insurance.csv"

In [None]:
# Start the interactive question-answering session on the configured dataset.
QA_on_data(DATA_FILE_PATH)

‚úÖ File successfully processed.

üìä Preview of first 5 rows:
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
can you summary of my df

ü§ñ Generated Code:


result = df.describe(include='all')


üìà Final Result:

        age  sex         bmi    children smoker    region      charges
1338.000000 1338 1338.000000 1338.000000   1338      1338  1338.000000
        NaN    2         NaN         NaN      2         4          NaN
        NaN male         NaN         NaN     no southeast          NaN
        NaN  676         NaN         NaN   1064       364          NaN
  39.207025  NaN   30.663397    1.094918    NaN       NaN 13270.422265
  14.049960  NaN   

None
