In [2]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [6]:
 pip install pdfplumber pytesseract

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadi

In [8]:
# Import all required libraries
import os
import requests
import json
import base64
from io import BytesIO
import pandas as pd
import docx
import pdfplumber
import pytesseract
from PIL import Image
import torch
from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
from IPython.display import display, Markdown
import gradio as gr
from dotenv import load_dotenv

# Together.ai API setup: load_dotenv() is called first so that a TOGETHER_API_KEY
# defined in a local .env file is visible through the environment lookup below.
load_dotenv()
MODEL_NAME = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# The OpenAI SDK client is pointed at Together's base URL instead of the default endpoint
client = OpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ.get("TOGETHER_API_KEY"),
)

# File reading functions
def read_docx(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Args:
        file_path: Path to the .docx document.

    Returns:
        The text of every entry in the document's ``paragraphs`` collection,
        joined with newlines.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)

def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Page texts are joined with a newline so the last line of one page does not
    run straight into the first line of the next. Pages for which
    ``extract_text()`` yields nothing (``None`` or an empty string) are skipped.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The newline-joined page texts, or an empty string when no page
        produced any text.
    """
    with pdfplumber.open(file_path) as pdf:
        # Extraction has to happen while the PDF is still open
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(text for text in page_texts if text)

def read_image(file_path):
    """Run OCR on an image file and return the recognised text.

    The image is opened in a ``with`` block so it is closed as soon as OCR
    finishes, rather than being left open until garbage collection.

    Args:
        file_path: Path to the image file.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)

def read_txt(file_path):
    """Read a UTF-8 text file and return its contents as a string.

    The ``utf-8-sig`` codec is used so that a leading byte-order mark is
    dropped instead of ending up in the text as a stray U+FEFF character.
    Files without a BOM decode exactly as they would with plain ``utf-8``.

    Args:
        file_path: Path to the text file.

    Returns:
        The decoded file contents.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()

def extract_data_from_file(file_path):
    """Load a file and wrap its contents in a ``{'type': ..., 'data': ...}`` dict.

    The reader is chosen from the lower-cased file extension.

    Args:
        file_path: Path to the file to load.

    Returns:
        ``{'type': 'dataframe', 'data': <result of pd.read_csv / pd.read_excel>}``
        for ``.csv`` / ``.xlsx``; ``{'type': 'text', 'data': <str>}`` for
        ``.txt``, ``.pdf``, ``.docx``, ``.jpg``, ``.jpeg`` and ``.png``;
        ``{'type': 'error', 'data': <message>}`` for an unsupported extension
        or when the selected reader raises.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Extension -> (result type, loader). The loaders are lambdas so a reader
    # is only looked up and invoked for the extension that was actually matched.
    loaders = {
        '.csv': ('dataframe', lambda path: pd.read_csv(path)),
        '.xlsx': ('dataframe', lambda path: pd.read_excel(path)),
        '.txt': ('text', lambda path: read_txt(path)),
        '.pdf': ('text', lambda path: read_pdf(path)),
        '.docx': ('text', lambda path: read_docx(path)),
        '.jpg': ('text', lambda path: read_image(path)),
        '.jpeg': ('text', lambda path: read_image(path)),
        '.png': ('text', lambda path: read_image(path)),
    }

    if suffix not in loaders:
        return {'type': 'error', 'data': f"Unsupported file type: {suffix}"}

    result_type, load = loaders[suffix]
    try:
        return {'type': result_type, 'data': load(file_path)}
    except Exception as exc:
        # Reader failures are reported to the caller as data, not raised
        return {'type': 'error', 'data': f"Error reading file: {str(exc)}"}

# System prompt: chat_with_agent() sends this as the first ("system") message of
# every chat completion. The text is runtime input to the model, so any change
# to its wording changes the assistant's behaviour.
system_message = """You are an expert data analyst assistant which analyses data,
provides answers to questions and generates visualizations. You will be given text
data extracted from files. Analyze the data and answer user questions.
Follow these rules:
1. Be concise but precise
2. For tabular data, always show sample rows
3. Explain technical terms
4. Suggest follow-up questions"""

# Visualization function
def generate_visualization(data_description):
    """Request one generated image for a data description and return its URL.

    Args:
        data_description: Text describing the data and the requested chart; it
            is embedded verbatim in the image prompt.

    Returns:
        The ``url`` attribute of the first item in the image response.
    """
    image_prompt = f"Create an accurate data visualization showing: {data_description}"
    request_options = {
        "model": "stability-ai/sdxl",
        "prompt": image_prompt,
        "size": "1024x1024",
        "quality": "standard",
        "n": 1,
    }
    first_image = client.images.generate(**request_options).data[0]
    return first_image.url

# Chat function
def chat_with_agent(messages, history=None):
    """Send a conversation to the chat model and return the reply text.

    The request is assembled as: the system prompt, then each
    ``(user, assistant)`` pair from ``history`` expanded into two messages,
    then ``messages`` as given.

    Args:
        messages: Iterable of ``{"role": ..., "content": ...}`` dicts for the
            current turn; it is not modified.
        history: Optional iterable of ``(user_text, assistant_text)`` pairs
            from earlier turns. ``None`` means no earlier turns.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    earlier_turns = [] if history is None else history

    replayed = [
        {"role": role, "content": text}
        for user_text, assistant_text in earlier_turns
        for role, text in (("user", user_text), ("assistant", assistant_text))
    ]
    conversation = [{"role": "system", "content": system_message}, *replayed, *messages]

    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.7,
        max_tokens=2000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Gradio UI setup
def analyze_data(file, question):
    """Gradio callback: answer a question about an uploaded file.

    The file is loaded with ``extract_data_from_file``; for tabular data only
    the first rows (``DataFrame.head()``) are sent to the model, for everything
    else the full extracted text is sent. When the question mentions
    "visualization" or "graph", an image URL from ``generate_visualization``
    is appended to the answer.

    Args:
        file: The uploaded file as delivered by the ``gr.File`` input. May be
            ``None`` (nothing uploaded), a path string, or an object exposing
            the path as ``.name``.
        question: The user's question; ``None`` is treated as an empty string.

    Returns:
        The model's answer (optionally followed by a visualization URL), the
        extraction error message, or a prompt to upload a file.
    """
    # Submitting the form without a file used to crash on `file.name`
    if file is None:
        return "Please upload a file before asking a question."

    # Works for both a plain path string and an object carrying the path in `.name`
    file_path = getattr(file, "name", file)
    question = question or ""

    # Extract data from file
    extracted = extract_data_from_file(file_path)

    if extracted['type'] == 'error':
        return extracted['data']

    if extracted['type'] == 'dataframe':
        # Only a sample of rows goes into the prompt
        data_str = extracted['data'].head().to_string()
    else:
        data_str = extracted['data']

    # Ask the question
    messages = [
        {"role": "user", "content": f"Data:\n{data_str}\n\nQuestion: {question}"}
    ]

    answer = chat_with_agent(messages)

    # Check if visualization is needed
    question_lower = question.lower()
    if "visualization" in question_lower or "graph" in question_lower:
        viz_url = generate_visualization(f"Data: {data_str}\nRequest: {question}")
        return f"{answer}\n\nVisualization: {viz_url}"
    return answer

# Create Gradio interface: two inputs (uploaded file, free-text question) feed
# analyze_data, whose string result is shown in a single text box.
file_input = gr.File(label="Upload your file")
question_input = gr.Textbox(label="Your question about the data")
results_output = gr.Textbox(label="Analysis results")

interface = gr.Interface(
    fn=analyze_data,
    inputs=[file_input, question_input],
    outputs=results_output,
    title="Data Analyst Agent",
    description="Upload a file (CSV, Excel, PDF, etc.) and ask questions about the data.",
)

# Launch the interface
interface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1ba80c650b1b78a913.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


