In [1]:
import os
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv
from pypdf import PdfReader
import fitz  # PyMuPDF
import pytesseract  # OCR
from PIL import Image
import tabula  # For reading tables from PDFs
import camelot  # For structured table extraction
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI

# Load environment variables
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
if not openai_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")

# Initialize ChromaDB; the collection embeds query text with this function.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_key, model_name="text-embedding-ada-002"
)
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")
collection_name = "document_qa_collection"

# Reset the collection to avoid clutter.
# Depending on the chromadb release, list_collections() yields Collection
# objects or plain name strings. Normalise every entry to its name first:
# testing a string against Collection objects never matches, which would
# leave the stale collection (and its old chunks) in place.
existing_collection_names = {
    getattr(entry, "name", entry) for entry in chroma_client.list_collections()
}
if collection_name in existing_collection_names:
    chroma_client.delete_collection(name=collection_name)

collection = chroma_client.get_or_create_collection(
    name=collection_name, embedding_function=openai_ef
)

# Client used directly for embedding and chat-completion requests.
client = OpenAI(api_key=openai_key)

# Helper function to extract text from tables
def extract_tables_from_pdf(file_path):
    """Return the text of the tables found in a PDF.

    Camelot (stream flavor, all pages) is tried first. When it raises or its
    output is only whitespace, Tabula is used instead. Extractor errors are
    printed, not raised.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        Each table rendered with ``to_string(index=False, header=False)`` and
        followed by a newline, concatenated in extraction order. Returns ""
        when neither extractor produces anything.
    """
    # First choice: Camelot
    try:
        camelot_tables = camelot.read_pdf(file_path, pages="all", flavor="stream")
        camelot_text = "".join(
            found.df.to_string(index=False, header=False) + "\n"
            for found in camelot_tables
        )
        if camelot_text.strip():
            return camelot_text
    except Exception as e:
        print(f"[Camelot Error] {file_path}: {e}")

    # Fallback: Tabula. Its result is returned as-is, even when empty.
    try:
        tabula_frames = tabula.read_pdf(file_path, pages="all", multiple_tables=True)
        return "".join(
            frame.to_string(index=False, header=False) + "\n"
            for frame in tabula_frames
        )
    except Exception as e:
        print(f"[Tabula Error] {file_path}: {e}")
    return ""

# General text extraction function
def extract_text_from_pdf(file_path):
    """Extract the plain text of a PDF, trying PyPDF first and PyMuPDF second.

    Each extractor accumulates into its own buffer. Sharing one buffer would
    append the PyMuPDF text to whatever PyPDF produced before failing (or to
    its whitespace-only output), duplicating or corrupting the result.

    Extractor errors are printed, not raised.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The PyPDF text when PyPDF finishes with non-blank output; otherwise
        the PyMuPDF text when that is non-blank; otherwise whatever PyPDF
        managed to read (possibly "").
    """
    # Try extracting text using PyPDF
    pypdf_pages = []
    try:
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() may return None for a page; treat it as empty.
            pypdf_pages.append(page.extract_text() or "")
        pypdf_text = "".join(pypdf_pages)
        if pypdf_text.strip():
            return pypdf_text
    except Exception as e:
        print(f"[PyPDF Error] {file_path}: {e}")

    # Fallback to PyMuPDF, starting from a clean buffer
    mupdf_pages = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                mupdf_pages.append(page.get_text())
    except Exception as e:
        print(f"[PyMuPDF Error] {file_path}: {e}")

    mupdf_text = "".join(mupdf_pages)
    if mupdf_text.strip():
        return mupdf_text

    # Last resort: whatever PyPDF read before it failed or came up blank.
    return "".join(pypdf_pages)


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [2]:
# Function to process documents
def load_and_process_document(file_path):
    """Build the document record for one PDF.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        A dict with "id" (the file's base name) and "text" (the extracted
        body text, a newline, then the extracted table text).
    """
    print(f"Processing file: {file_path}")
    # Body text first, then table text, separated by a single newline.
    sections = [extract_text_from_pdf(file_path), extract_tables_from_pdf(file_path)]
    return {"id": os.path.basename(file_path), "text": "\n".join(sections)}

# Function to split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split text into chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The text to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Function to generate embeddings
def generate_and_store_embeddings(doc, batch_size=100):
    """Chunk a document, embed the chunks, and upsert them into the collection.

    Chunks are sent to the embeddings endpoint and upserted in batches. This
    replaces one request plus one upsert per chunk, which costs thousands of
    round-trips for a long filing. Chunk ids are unchanged:
    ``"<doc id>_chunk<i>"``, where ``i`` is the chunk's position in the whole
    document.

    Args:
        doc: Mapping with "id" and "text" keys.
        batch_size: Maximum number of chunks per embeddings request / upsert.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    chunks = chunk_text(doc["text"])
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        response = client.embeddings.create(
            input=batch, model="text-embedding-ada-002"
        )
        # Order by the index the API reports so vector i pairs with batch[i].
        ordered = sorted(response.data, key=lambda item: item.index)
        collection.upsert(
            ids=[f"{doc['id']}_chunk{start + offset}" for offset in range(len(batch))],
            documents=batch,
            embeddings=[item.embedding for item in ordered],
        )

# Process all PDFs in parallel
directory_path = "../pdfs1"
# Sorted so the processing order is reproducible; the extension check is
# case-insensitive so files named *.PDF are not skipped.
pdf_files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if f.lower().endswith(".pdf")
)

with ThreadPoolExecutor() as executor:
    documents = list(executor.map(load_and_process_document, pdf_files))

with ThreadPoolExecutor() as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved. Drain the iterator so a failed embedding/upsert stops the
    # cell instead of being dropped before the success message below.
    for _ in executor.map(generate_and_store_embeddings, documents):
        pass

print("Embeddings created for all documents!")


Processing file: ../pdfs1\3M_2015_10K.pdf


  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generat

Embeddings created for all documents!


In [3]:
def query_documents(question, n_results=5):
    """Retrieve the chunks the collection returns for a question, without duplicates.

    Args:
        question: Query text passed to the collection as the only query.
        n_results: Number of results requested from the collection.

    Returns:
        The returned chunk texts for the question, in the order the collection
        returned them, with later duplicates removed.
    """
    results = collection.query(query_texts=[question], n_results=n_results)
    # dict.fromkeys drops duplicates but keeps first-seen order. A set would
    # scramble the returned ranking and vary between runs.
    return list(dict.fromkeys(results["documents"][0]))

def generate_response(question, relevant_chunks):
    """Ask the chat model to answer a question from the retrieved chunks.

    Args:
        question: The user's question.
        relevant_chunks: Iterable of chunk strings used as context; they are
            joined with blank lines.

    Returns:
        The message content of the first choice in the chat completion.
    """
    instructions = (
        "You are a financial assistant. Use the following context to answer the question. "
        "If the answer is unclear, say 'I don't know.'\n\n"
    )
    context = "\n\n".join(relevant_chunks)
    system_prompt = instructions + f"Context:\n{context}\n\nQuestion: {question}"

    # The question is sent twice: inside the system prompt and as the user turn.
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo", messages=conversation
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [4]:
# Query example: retrieve the closest chunks, then answer from them
question = "Who is the senior vice president in Supply Chain in 3M?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
The Senior Vice President in Supply Chain for 3M is Paul A. Keel.


In [14]:
# Query example: retrieve the closest chunks, then answer from them
question = "What role is currently hold by Ashish Khandpur?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
Ashish Khandpur currently holds the position of Senior Vice President, Research and Development, and Chief Technology Officer.


In [5]:
# Query example: retrieve the closest chunks, then answer from them
question = "Where is the corporate research laboratories and division laboratories of 3M located?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
The corporate research laboratories and certain division laboratories of 3M are located in St. Paul, Minnesota.


In [6]:
# Query example: retrieve the closest chunks, then answer from them
question = "How much money as net sales (in USD) did 3M make in Consumer Segments in the year 2015?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
In the year 2015, 3M made net sales of $4.8 billion in the Consumer Segments.


In [15]:
# Query example: retrieve the closest chunks, then answer from them
question = "How much money as operating income (in USD) did 3M make in the industrial segment in the year 2015?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
Based on the provided financial information, the operating income for the Industrial segment in 2015 is not directly specified in the context. The operating income numbers are given for the Health Care and Consumer segments in 2015, but not for the Industrial segment. I don't have the specific operating income amount for the Industrial segment in 2015.


In [8]:
# Query example: retrieve the closest chunks, then answer from them
question = "How much money (in USD) does 3M expects to contribute to its global defined benefit pension and postretirement plans in 2016?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
3M expects to contribute approximately $100 million to $200 million of cash to its global defined benefit pension and postretirement plans in 2016.


In [9]:
# Query example: retrieve the closest chunks, then answer from them
question = "What was the value of \"sales cost\" as a percentage of net sales in 2015?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
In 2015, the cost of sales as a percentage of net sales was 50.9%.


In [10]:
# Query example: retrieve the closest chunks, then answer from them
question = "What was the value of \"selling, general and administrative expenses\" as a percentage of net sales in 2015?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
The selling, general and administrative expenses (SG&A) as a percentage of net sales in 2015 was 20.4%.


In [11]:
# Query example: retrieve the closest chunks, then answer from them
question = "What are the operating expenses (in terms of percentage of net sales) for 3M in the year 2015 also mention the total operating income value (in terms of percentage of net sales) during that same year. "
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")



Generated Answer:
Based on the information provided, in 2015, the operating expenses for 3M were $30,274 million, and the net sales were $31,821 million. To calculate the operating expenses as a percentage of net sales:

Operating Expenses (2015): $30,274 million
Net Sales (2015): $31,821 million

Operating Expenses as a percentage of Net Sales in 2015:
= (Operating Expenses / Net Sales) x 100
= ($30,274 million / $31,821 million) x 100
= 95.09%

Regarding the total operating income value (in terms of percentage of net sales) for 2015:
We do not have a direct mention of the total operating income value for 2015 as a percentage of net sales in the provided information.


In [12]:
# Query example: retrieve the closest chunks, then answer from them
question = "What was the value of effective tax rate (in terms of percentage of pre-tax income) for 3M in the year 2015? "
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")


Generated Answer:
The effective tax rate for 3M in the year 2015 was 29.1 percent.


In [13]:
# Query example: retrieve the closest chunks, then answer from them
question = " What is the value of net income attributable to Noncontrolling interests (in USD) for 3M in the year 2015? "
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print(f"\nGenerated Answer:\n{answer}")


Generated Answer:
The value of net income attributable to Noncontrolling interests for 3M in the year 2015 is $8 million.
