In [2]:
!pip install pandas numpy matplotlib seaborn python-magic pymupdf python-docx pillow pytesseract openpyxl plotly requests


Collecting python-magic
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: python-magic, python-docx, pytesseract, pymupdf
Successfully installed pymupdf-1.26.1 pytess

In [None]:
# Step 1: Define the environment and import necessary libraries

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mimetypes
import fitz  # PyMuPDF
import docx
from PIL import Image
import pytesseract
import io
import base64
import json
import requests
from dotenv import load_dotenv
import openpyxl
import streamlit as st
import plotly.express as px







In [None]:
# Define model ID and API key
import os

LLAMA_MODEL_ID = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# Prefer the API_KEY environment variable so the notebook also runs where no
# Streamlit secrets file exists; st.secrets is only consulted when the
# variable is unset or empty.
api_key = os.environ.get("API_KEY") or st.secrets["API_KEY"]
def parse_file(file_path):
    """
    Load a file and return its content together with a content-type tag.

    The loader is chosen from the file extension, compared case-insensitively.

    Parameters:
    - file_path: str or os.PathLike, path of the file to load

    Returns:
    - (content, content_type) tuple:
        * .csv / .xlsx        -> (pandas.DataFrame, "dataframe")
        * .txt / .docx / .pdf -> (str, "text")
        * .png / .jpg / .jpeg -> (str produced by pytesseract OCR, "text")

    Raises:
    - ValueError: if the extension is not one of the supported types
    """
    path = os.fspath(file_path)
    # Match on a lower-cased copy so "REPORT.CSV" or "scan.PDF" are handled
    # exactly like their lower-case spellings.
    lowered = path.lower()

    if lowered.endswith(".csv"):
        return pd.read_csv(path), "dataframe"
    if lowered.endswith(".xlsx"):
        return pd.read_excel(path), "dataframe"
    if lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as f:
            return f.read(), "text"
    if lowered.endswith(".docx"):
        document = docx.Document(path)
        return "\n".join(para.text for para in document.paragraphs), "text"
    if lowered.endswith(".pdf"):
        # The context manager closes the PDF handle once the text is extracted.
        with fitz.open(path) as pdf:
            return "".join(page.get_text() for page in pdf), "text"
    if lowered.endswith((".png", ".jpg", ".jpeg")):
        # Close the image file as soon as OCR has finished.
        with Image.open(path) as image:
            return pytesseract.image_to_string(image), "text"

    raise ValueError(f"Unsupported file type: {file_path}")


In [53]:
def ask_data_agent(user_question, parsed_content, content_type):
    """
    Build a prompt from the parsed content and send it to the LLM.

    Parameters:
    - user_question: question text placed in the prompt
    - parsed_content: DataFrame (for "dataframe") or str (for "text")
    - content_type: "dataframe" or "text"

    Returns:
    - whatever query_llama_agent returns for the built prompt

    Raises:
    - ValueError: for any other content_type
    """
    # Reject unknown content types before touching the content.
    if content_type not in ("dataframe", "text"):
        raise ValueError("Unsupported content type")

    if content_type == "dataframe":
        # Only the first 10 rows are shown to the model.
        table = parsed_content.head(10).to_markdown()
        prompt_lines = [
            "You are a smart data analyst.",
            "Here is a preview of the dataset (first 10 rows):",
            f"{table}",
            "",
            f"User question: {user_question}",
            "Answer in detail or provide Python code to perform the task.",
        ]
    else:
        # Only the first 2000 characters are shown to the model.
        excerpt = parsed_content[:2000]
        prompt_lines = [
            "You are a document analysis expert.",
            "Here is some document content:",
            '"""',
            f"{excerpt}",
            '"""',
            "",
            f"User question: {user_question}",
            "Answer in detail based on the content above.",
        ]

    return query_llama_agent("\n".join(prompt_lines))


In [40]:
# Step 4: Define main agent interaction logic

def ask_data_agent(user_question, parsed_content, content_type):
    """
    Answer a user's question about a parsed file by prompting the LLM agent.

    Parameters:
    - user_question: str, natural language question from the user
    - parsed_content: DataFrame or str, as produced when the file was parsed
    - content_type: "dataframe" or "text"

    Returns:
    - The value returned by query_llama_agent for the assembled prompt

    Raises:
    - ValueError: if content_type is neither "dataframe" nor "text"
    """
    # Guard clause: nothing is read from parsed_content for unknown types.
    if content_type not in ("dataframe", "text"):
        raise ValueError("Unsupported content type")

    if content_type == "dataframe":
        # Give the model a small markdown preview (first 10 rows) as context.
        table = parsed_content.head(10).to_markdown()
        prompt_lines = [
            "You are a smart data analyst.",
            "Here is a preview of the dataset (first 10 rows):",
            f"{table}",
            "",
            f"User question: {user_question}",
            "Answer in detail or provide instructions/code to perform the task.",
        ]
    else:
        # Truncate the document to its first 2000 characters.
        excerpt = parsed_content[:2000]
        prompt_lines = [
            "You are a document analysis expert.",
            "Here is some document content:",
            '"""',
            f"{excerpt}",
            '"""',
            "",
            f"User question: {user_question}",
            "Answer in detail based on the content above.",
        ]

    return query_llama_agent("\n".join(prompt_lines))


In [45]:
# Step 5: Implement LLM code execution inside Jupyter Notebook for plotting if applicable

import re
import matplotlib.pyplot as plt

def extract_and_run_code(response_text, df):
    """
    Run every ```python fenced block found in an LLM response against `df`.

    Parameters:
    - response_text: str, the model's reply; None or "" yields an empty result
    - df: object exposed to the executed code under the name `df`

    Returns:
    - list of (cleaned_code, report) tuples, one per code block, where report
      is the block's source followed by a success or error message
    """
    if not response_text:
        # Nothing to scan (e.g. the LLM call returned no text).
        return []

    import matplotlib.pyplot as plt
    import numpy as np

    code_blocks = re.findall(r"```python(.*?)```", response_text, re.DOTALL)
    results = []

    for number, code in enumerate(code_blocks, start=1):
        # 🧹 Clean model code: drop 'df = pd.read_csv(...)' / 'df = pd.read_excel(...)'
        # so the snippet works on the DataFrame that was passed in.
        cleaned_code = re.sub(
            r"df\s*=\s*pd\.read_(?:csv|excel)\(.*?\)", "", code.strip()
        )

        output = f"\n▶️ Code Block #{number}:\n{cleaned_code}\n"
        try:
            exec_globals = {"pd": pd, "plt": plt, "df": df, "np": np}
            # SECURITY NOTE: this executes model-generated code with full
            # interpreter privileges; only use with responses you trust.
            exec(cleaned_code, exec_globals)
            # Display only when the snippet really created a figure; asking for
            # the current figure unconditionally would create and show a blank one.
            if plt.get_fignums():
                plt.show()
            output += "✅ Executed successfully."
        except Exception as e:
            output += f"⚠️ Execution error: {e}"
        finally:
            # Release every figure so nothing leaks into the next block.
            plt.close("all")

        results.append((cleaned_code, output))

    return results

In [42]:
# Dataset to analyse, referenced by an absolute path under /content.
file_path = "/content/Delinquency_prediction_dataset.xlsx"
# parse_file returns the loaded content together with its content-type tag;
# for an .xlsx file that is a DataFrame tagged "dataframe".
parsed_data, data_type = parse_file(file_path)


Please upload the `Delinquency_prediction_dataset.xlsx` file to `/content/` before running the cell above, which loads it from that path.

In [47]:
# Ask the LLM a question about the loaded dataset and print its reply.
response = ask_data_agent("which age group has the most income", parsed_data, data_type)
print(response)
# Execute any ```python blocks found in the reply against the loaded data.
# The returned list of (code, report) tuples is echoed as the cell output
# because this call is the cell's last expression.
extract_and_run_code(response, parsed_data)

🧾 Raw API Response: {
  "id": "nxsdNcj-zqrih-94ec702748689c3c",
  "object": "text.completion",
  "created": 1749763708,
  "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
  "choices": [
    {
      "index": 0,
      "text": " \n\nTo determine which age group has the most income, we need to analyze the 'Income' column in relation to the 'Age_Group' column in the given dataset. \n\nHere's a step-by-step guide to achieve this:\n1. **Group the data by 'Age_Group'**: We will use the pandas library in Python to group the data based on the 'Age_Group' column. This will allow us to perform aggregation operations on the 'Income' column for each age group.\n\n2. **Calculate the total or average income for each age group**: We can either sum up the incomes or calculate the average income for each age group. To find which age group has the \"most income,\" we will calculate the average income, as summing incomes might be biased towards age groups with more customers.\n\n3. **Identify 



<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

[('import pandas as pd\n\n# Assuming \'df\' is your DataFrame\n# Group by \'Age_Group\' and calculate the average \'Income\'\naverage_income_by_age_group = df.groupby(\'Age_Group\')[\'Income\'].mean().reset_index()\n\n# Sort the result in descending order based on \'Income\'\naverage_income_by_age_group = average_income_by_age_group.sort_values(by=\'Income\', ascending=False)\n\n# The age group with the most income is the first row after sorting\nage_group_with_most_income = average_income_by_age_group.iloc[0][\'Age_Group\']\n\nprint(f"The age group with the most income is: {age_group_with_most_income}")\nprint(average_income_by_age_group)',
  '\n▶️ Code Block #1:\nimport pandas as pd\n\n# Assuming \'df\' is your DataFrame\n# Group by \'Age_Group\' and calculate the average \'Income\'\naverage_income_by_age_group = df.groupby(\'Age_Group\')[\'Income\'].mean().reset_index()\n\n# Sort the result in descending order based on \'Income\'\naverage_income_by_age_group = average_income_by_age_

<Figure size 640x480 with 0 Axes>