In [2]:
import base64
import io
import os
import re

import docx
import matplotlib.pyplot as plt
import pandas as pd
import pdfplumber
import pytesseract
import seaborn as sns
from PIL import Image
from together import Together

In [12]:
# Initialize the Together API client.
# The key is read from the TOGETHER_API_KEY environment variable so it never
# appears in the notebook; stop immediately with a clear message when it is
# missing instead of failing later on the first request.
_together_api_key = os.getenv("TOGETHER_API_KEY")
if not _together_api_key:
    raise RuntimeError(
        "Set the TOGETHER_API_KEY environment variable before running this notebook."
    )
together = Together(api_key=_together_api_key)

# Chat model used for every LLM request made by ask_llm().
MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

In [13]:
def extract_text_from_txt(file, encoding="utf-8-sig"):
    """Return the full contents of a plain-text file as a string.

    Args:
        file: An open file-like object. Binary handles are decoded with
            ``encoding``; text-mode handles (whose ``read()`` already returns
            ``str``) are returned unchanged.
        encoding: Codec used to decode bytes. The default ``"utf-8-sig"``
            decodes ordinary UTF-8 and additionally drops a leading byte-order
            mark, so the BOM does not end up as a stray ``\\ufeff`` character
            at the start of the text.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    raw = file.read()
    if isinstance(raw, str):
        # Text-mode handle: nothing left to decode.
        return raw
    return raw.decode(encoding)

def extract_text_from_docx(file):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file: Path or file-like object handed to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``document.paragraphs``, joined with
        newline characters.
    """
    document = docx.Document(file)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, with pages separated by newlines.

    Args:
        file: Path or file-like object handed to ``pdfplumber.open``.

    Returns:
        The extracted text of each page joined with ``"\\n"``. A page that
        yields no text contributes an empty line, so the page order is kept.
    """
    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # A page without a text layer (e.g. a scanned image) can come back
            # as None or "" from extract_text(); normalise both to "" so the
            # result is always a string and never raises on concatenation.
            page_texts.append(page.extract_text() or "")
    # Joining with a newline keeps the last word of one page from being glued
    # to the first word of the next.
    return "\n".join(page_texts)

def extract_text_from_image(file):
    """Run OCR on an image and return the recognised text.

    Args:
        file: Path or file-like object handed to ``Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the opened image.
    """
    return pytesseract.image_to_string(Image.open(file))

def load_tabular_file(file):
    """Load a CSV or Excel workbook into a DataFrame, chosen by file extension.

    Args:
        file: An open file object exposing ``.name``, or a plain path string.
            The extension is compared case-insensitively, so ``DATA.CSV`` and
            ``data.csv`` are treated alike.

    Returns:
        The result of ``pd.read_csv`` for ``.csv`` or ``pd.read_excel`` for
        ``.xlsx``.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
    """
    # File objects carry their path in .name; a bare path string has no such
    # attribute and is used as-is.
    name = str(getattr(file, "name", file)).lower()
    if name.endswith(".csv"):
        return pd.read_csv(file)
    if name.endswith(".xlsx"):
        return pd.read_excel(file)
    raise ValueError("Unsupported file type")



In [17]:

def ask_llm(prompt):
    """Send one user message to the configured chat model and return its reply.

    Args:
        prompt: Text placed in the ``content`` of a single ``"user"`` message.

    Returns:
        The ``message.content`` of the first choice in the completion returned
        by ``together.chat.completions.create`` for ``MODEL``.
    """
    user_message = {"role": "user", "content": prompt}
    completion = together.chat.completions.create(
        model=MODEL,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [18]:
def handle_text_qa(text, question):
    """Ask the LLM a question about a document.

    Args:
        text: Document text embedded verbatim in the prompt.
        question: Question appended after the document.

    Returns:
        The value returned by ``ask_llm`` for the assembled prompt.
    """
    document_part = f"The following is a document:\n{text}"
    question_part = f"Now answer this question: {question}"
    return ask_llm(f"{document_part}\n\n{question_part}")

def handle_dataframe_qa(df, question):
    """Ask the LLM a question about a dataset, using its first rows as context.

    Args:
        df: Object whose ``head(10).to_markdown()`` gives the table embedded
            in the prompt.
        question: Question appended after the table.

    Returns:
        The value returned by ``ask_llm`` for the assembled prompt.
    """
    first_rows = df.head(10)
    table_part = f"The following is a sample of a dataset:\n{first_rows.to_markdown()}"
    question_part = f"Now answer this question: {question}"
    return ask_llm(f"{table_part}\n\n{question_part}")


In [19]:
def generate_visual(df, command):
    """Draw the chart described by a short text command and show it.

    Recognised commands (matched case-insensitively, anywhere in the text):
        * ``bar chart of <column>``
        * ``histogram of <column>`` (``hist of <column>`` also works)
        * ``scatter plot of <x column> vs <y column>``

    Column names are taken from the command as written, with surrounding
    whitespace removed, so they must match the DataFrame's column names exactly.

    Args:
        df: DataFrame holding the column(s) named in the command.
        command: Text describing the chart to draw.

    Returns:
        None. If the command does not match any recognised form, prints
        ``Command not understood.`` and returns without drawing anything.

    Raises:
        KeyError: If a column named in the command is not in ``df``.
    """
    flags = re.IGNORECASE
    bar_match = re.search(r"\bbar(?:\s+chart)?\s+of\s+(.+)", command, flags)
    hist_match = re.search(r"\bhist(?:ogram)?\s+of\s+(.+)", command, flags)
    scatter_match = re.search(
        r"\bscatter(?:\s+plot)?\s+of\s+(.+?)\s+vs\.?\s+(.+)", command, flags
    )

    # Parse and validate everything before touching matplotlib, so a bad
    # command never leaves a half-drawn figure behind.
    if bar_match:
        kind, columns = "bar", [bar_match.group(1).strip()]
    elif hist_match:
        kind, columns = "hist", [hist_match.group(1).strip()]
    elif scatter_match:
        kind, columns = "scatter", [part.strip() for part in scatter_match.groups()]
    else:
        print("Command not understood.")
        return

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise KeyError(
            f"Column(s) not in DataFrame: {missing}; available: {list(df.columns)}"
        )

    if kind == "bar":
        (col,) = columns
        df[col].value_counts().plot(kind='bar')
        plt.title(f'Bar chart of {col}')
        plt.xlabel(col)
        plt.ylabel("Count")
    elif kind == "hist":
        (col,) = columns
        df[col].hist()
        plt.title(f'Histogram of {col}')
        plt.xlabel(col)
        plt.ylabel("Frequency")
    else:
        x_col, y_col = columns
        df.plot(kind='scatter', x=x_col, y=y_col)
        plt.title(f'Scatter plot of {x_col} vs {y_col}')
        plt.xlabel(x_col)
        plt.ylabel(y_col)
    plt.show()


In [21]:
# Choose the file to test
test_file_path = "cloudappExampleWordTemplate.jpg"  # Change to other files for different formats

# Route on the lower-cased extension so "scan.JPG" and "scan.jpg" are handled
# the same way.
file_name = os.path.basename(test_file_path)
file_ext = os.path.splitext(file_name)[1].lower()

with open(test_file_path, "rb") as f:
    if file_ext == ".txt":
        text = extract_text_from_txt(f)
        print("TEXT Extracted:\n", text[:500])
        print("LLM Response:\n", handle_text_qa(text, "Summarize this document"))

    elif file_ext == ".docx":
        text = extract_text_from_docx(f)
        print("DOCX Extracted:\n", text[:500])
        print("LLM Response:\n", handle_text_qa(text, "Give key insights from this document"))

    elif file_ext == ".doc":
        # extract_text_from_docx goes through python-docx, which is documented
        # as reading the .docx format only; report that instead of crashing.
        print("Unsupported file type: convert legacy .doc files to .docx first")

    elif file_ext == ".pdf":
        text = extract_text_from_pdf(f)
        print("PDF Extracted:\n", text[:500])
        print("LLM Response:\n", handle_text_qa(text, "Summarize this PDF"))

    elif file_ext in (".csv", ".xlsx"):
        df = load_tabular_file(f)
        print("DATAFRAME HEAD:\n", df.head())
        print("LLM Response:\n", handle_dataframe_qa(df, "What are the top trends in this dataset?"))
        generate_visual(df, "bar chart of Age")  # Change command as needed

    elif file_ext in (".jpg", ".jpeg", ".png"):
        text = extract_text_from_image(f)
        print("IMAGE TEXT:\n", text)
        print("LLM Response:\n", handle_text_qa(text, "What is written in this image?"))

    else:
        print("Unsupported file type")

IMAGE TEXT:
 ao

ra
ne

ge te One | Be | con Tet Symbol
Sales Invoice
tov ir: siicoumbre submited Date: Submittedste»
To Invoice Details
Call stats PO ate CustomerPODste>
Phone number: «Phonetumbers Po number: ccustomerPoNumber>
Point of Contact sPintafConta> PO Amount: SustomereOkmount»
Payment ers: Payment Terms
Invoice items:
a
Product Price | Quantity Total Price |
“sTablestartiProductsy«Product» | SePrice» | «Quantity» | SeTotalPriceneTableEnd:Products» |
Invoice Summary

iii ieeaica aati: | Kemetic


LLM Response:
 The document appears to be a template for a Sales Invoice. The visible text includes:

1. "Sales Invoice"
2. Various field labels:
   - "To"
   - "Invoice Details"
   - "Phone number:"
   - "Point of Contact"
   - "PO number:"
   - "PO Amount:"
   - "Payment Terms"
   - "Invoice items:" with a table that includes headers for:
     - "Product"
     - "Price"
     - "Quantity"
     - "Total Price"
3. "Invoice Summary"

The document seems to be a template with placeholder