
# Data Analyst Agent

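This notebook lets you:
- Load several document types (`.csv`, `.xlsx`, `.pdf`, `.docx`, and `.png`/`.jpg`/`.jpeg` images)
- Extract their contents (tables via pandas, text via pdfplumber and python-docx, OCR via pytesseract)
- Ask questions about text documents using the Groq API
- Create bar, line, or scatter plots from tabular data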


In [1]:

# Install necessary libraries
!pip3 install pandas numpy matplotlib seaborn pdfplumber python-docx pillow pytesseract groq


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-macosx_11_0_arm64.whl.metadata (48 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Downloading lxml-5.3.1-cp312-cp312-macosx_10_9_universal2.whl.metadata (3.7 kB)
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.6/5.6 MB 260.9 kB/s eta 0:00:00
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl

In [2]:

# Import libraries
import os

import matplotlib.pyplot as plt
import pandas as pd
import pdfplumber
import pytesseract
import seaborn as sns
from docx import Document
from groq import Groq
from PIL import Image


In [3]:

# Initialize Groq client
# The API key is read from the GROQ_API_KEY environment variable so that no
# credential is stored in (or shared with) the notebook. Export it before
# starting the kernel.
# SECURITY: a key was previously hardcoded in this cell; treat that key as
# compromised and revoke/rotate it.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError("GROQ_API_KEY environment variable is not set")
client = Groq(api_key=_groq_api_key)


In [4]:

# Function to read CSV and Excel files
def read_tabular_file(file_path):
    """Load a CSV file or an Excel workbook into a pandas DataFrame.

    Args:
        file_path: Path to a ``.csv`` or ``.xlsx`` file (``str`` or path-like).
            The extension is matched case-insensitively.

    Returns:
        pandas.DataFrame: The table parsed by ``pd.read_csv`` or
        ``pd.read_excel``.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
    """
    # Match on a lower-cased copy so "REPORT.CSV" is accepted; the original
    # path is what gets opened, since the file system may be case-sensitive.
    suffix_probe = str(file_path).lower()
    if suffix_probe.endswith('.csv'):
        return pd.read_csv(file_path)
    if suffix_probe.endswith('.xlsx'):
        return pd.read_excel(file_path)
    raise ValueError("Unsupported file format")


In [5]:

# Function to extract text from PDF
def read_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        str: The page texts joined with newlines. Pages for which
        ``extract_text()`` returns nothing are skipped; the result is an empty
        string when no page yields text.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page.
    return '\n'.join(chunk for chunk in page_texts if chunk)


In [6]:

# Function to extract text from DOCX
def read_docx(file_path):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path of the ``.docx`` file, passed to ``Document``.

    Returns:
        str: The ``text`` of each entry in ``doc.paragraphs``, joined with
        newline characters.
    """
    document = Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return '\n'.join(paragraph_texts)


In [7]:

# Function to extract text from images
def read_image(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image, passed to ``Image.open``.

    Returns:
        str: Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # Use the image as a context manager so the underlying file handle is
    # closed as soon as OCR has finished, even if OCR raises.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)


In [8]:

# Function to handle file upload
def handle_file(file_path):
    """Dispatch a file to the reader that matches its extension.

    The extension is matched case-insensitively here; the path itself is
    handed to the reader unchanged. Tabular files are delegated to
    ``read_tabular_file``, which applies its own extension check.

    Args:
        file_path: Path of the file to load (``str`` or path-like).

    Returns:
        The result of ``read_tabular_file`` for ``.csv``/``.xlsx``, or of
        ``read_pdf`` / ``read_docx`` / ``read_image`` for ``.pdf`` /
        ``.docx`` / ``.png``, ``.jpg``, ``.jpeg`` respectively.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    suffix_probe = str(file_path).lower()
    # Ordered (extensions, reader) pairs; the first match wins.
    readers = (
        (('.csv', '.xlsx'), read_tabular_file),
        (('.pdf',), read_pdf),
        (('.docx',), read_docx),
        (('.png', '.jpg', '.jpeg'), read_image),
    )
    for extensions, reader in readers:
        if suffix_probe.endswith(extensions):
            return reader(file_path)
    raise ValueError("Unsupported file format")


In [9]:

# Function to ask questions using Groq
def ask_question(prompt, model="llama3-8b-8192", max_tokens=500):
    """Send a single-turn prompt to the Groq chat API and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Model identifier passed to the API. Defaults to the value this
            notebook has always used, so existing calls behave the same.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content


In [10]:

# Function to create a visualization
def create_visualization(data, plot_type, x_col=None, y_col=None):
    """Draw a bar, line, or scatter plot of two DataFrame columns.

    Args:
        data: The table to plot. Anything that is not a ``pandas.DataFrame``
            is rejected with a printed message.
        plot_type: ``'bar'``, ``'line'`` or ``'scatter'``; surrounding
            whitespace and letter case are ignored.
        x_col: Column name for the x axis, or ``None``.
        y_col: Column name for the y axis, or ``None``.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If ``x_col`` or ``y_col`` is given but is not a column of
            ``data``.
    """
    if not isinstance(data, pd.DataFrame):
        print("Invalid data for visualization")
        return

    plotters = {
        'bar': sns.barplot,
        'line': sns.lineplot,
        'scatter': sns.scatterplot,
    }
    kind = str(plot_type).strip().lower()
    if kind not in plotters:
        # Previously an unknown type produced an empty, titled figure.
        print(f"Unsupported plot type: {plot_type!r} (expected bar, line or scatter)")
        return

    # Validate before creating the figure so a typo does not leave a blank
    # figure behind.
    missing = [col for col in (x_col, y_col)
               if col is not None and col not in data.columns]
    if missing:
        raise ValueError(f"Column(s) not found in data: {missing}")

    _, ax = plt.subplots(figsize=(10, 6))
    plotters[kind](x=x_col, y=y_col, data=data, ax=ax)
    ax.set_title(f"{kind.capitalize()} plot of {y_col} vs {x_col}")
    plt.show()


In [13]:
import time

def main():
    """Interactive entry point: load one file, then plot it or answer questions.

    Prompts for a file path and loads it with ``handle_file``. When the result
    is a DataFrame, prompts for two columns and a plot type and calls
    ``create_visualization``. Otherwise the loaded text is used as context for
    questions sent through ``ask_question`` until the user types ``exit``.

    Any exception is reported with ``print`` rather than propagated, so a bad
    path or a failed API call does not leave a traceback in the notebook.
    """
    # Pasted or dragged paths often carry surrounding whitespace or quotes,
    # which would otherwise defeat the extension check.
    file_path = input("Enter the file path: ").strip().strip('"\'')
    try:
        data = handle_file(file_path)
        if isinstance(data, pd.DataFrame):
            print("\nAvailable columns:", list(data.columns))
            x_col = input("Enter x column for visualization: ")
            y_col = input("Enter y column for visualization: ")
            plot_type = input("Enter plot type (bar/line/scatter): ").strip().lower()
            create_visualization(data, plot_type, x_col, y_col)
            return

        # Keep answering until the user types 'exit', as the prompt promises.
        while True:
            query = input("\nAsk a question (or type 'exit' to quit): ").strip()
            if query.lower() == 'exit':
                break
            if not query:
                continue  # nothing to ask; prompt again
            # Measure how long the LLM call takes (this is timing only; no
            # timeout is enforced on the request).
            start_time = time.perf_counter()
            try:
                response = ask_question(f"Context: {data}\nQuestion: {query}")
                print("\nAnswer:", response)
            except Exception as e:
                print(f"Error during LLM call: {e}")
            finally:
                elapsed = time.perf_counter() - start_time
                print(f"\nResponse Time: {elapsed:.2f} seconds")
    except Exception as e:
        print("Error:", e)

In [15]:
# Start the interactive session; main() prompts for a file path on stdin.
main()


Error: Unsupported file format
