In [27]:
import os
from multiprocessing import Pool, cpu_count
from dotenv import load_dotenv
import fitz  # PyMuPDF
import pymupdf4llm  # PyMuPDF helper for Markdown
from langchain.text_splitter import MarkdownTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI

# Load environment variables (the API key is read from the process
# environment after python-dotenv has loaded any .env file).
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
if not openai_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")

# Initialize ChromaDB with an OpenAI-backed embedding function so that
# text queries against the collection are embedded with the same model
# that is used when chunks are stored.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_key, model_name="text-embedding-ada-002"
)
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")
collection_name = "document_qa_collection"

# Reset the collection to avoid clutter.
# Depending on the installed chromadb version, list_collections() yields
# either Collection objects or plain name strings; comparing the name string
# directly against Collection objects is always False, which would silently
# skip the reset. Normalise every entry to its name before testing.
existing_collection_names = {
    getattr(entry, "name", entry) for entry in chroma_client.list_collections()
}
if collection_name in existing_collection_names:
    chroma_client.delete_collection(name=collection_name)

collection = chroma_client.get_or_create_collection(
    name=collection_name, embedding_function=openai_ef
)

# OpenAI client used for explicit embedding and chat-completion calls.
client = OpenAI(api_key=openai_key)

# Helper function to extract text and tables using PyMuPDF Markdown
def extract_text_from_pdf(file_path):
    """Return the Markdown rendering of the PDF at ``file_path``.

    The conversion is delegated to ``pymupdf4llm.to_markdown``. Any exception
    raised during conversion is reported on stdout and an empty string is
    returned instead, so a single bad file does not abort the caller.
    """
    markdown = ""
    try:
        markdown = pymupdf4llm.to_markdown(file_path)
    except Exception as err:
        # Best-effort: log the failure and fall through to the empty default.
        print(f"[PyMuPDF Markdown Error] {file_path}: {err}")
    return markdown

# Function to split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split Markdown ``text`` into chunks with ``MarkdownTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter's constructor; the splitter's ``split_text`` result is returned.
    """
    return MarkdownTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)


In [None]:
# Function to process and chunk a single document
def process_document(file_path):
    """Extract and chunk one PDF.

    Prints a progress line, converts the file to Markdown via
    ``extract_text_from_pdf`` and splits it with ``chunk_text``.

    Returns a dict with two keys: ``"id"`` (the base name of ``file_path``)
    and ``"chunks"`` (the value returned by ``chunk_text``).
    """
    print(f"Processing: {file_path}")
    chunks = chunk_text(extract_text_from_pdf(file_path))
    return dict(id=os.path.basename(file_path), chunks=chunks)

# Function to generate embeddings for chunks and store them in ChromaDB
def generate_and_store_embeddings(doc, batch_size=100):
    """Embed a document's chunks and upsert them into the Chroma collection.

    Chunks are sent to the embeddings endpoint in batches rather than one
    request per chunk, and each batch is written with a single ``upsert``.
    The stored ids are unchanged: ``"<doc id>_chunk<i>"`` where ``i`` is the
    chunk's position in ``doc["chunks"]``.

    Args:
        doc: dict with ``"id"`` (used as the id prefix) and ``"chunks"``
            (a sequence of text chunks).
        batch_size: maximum number of chunks per embeddings request / upsert.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    chunks = list(doc["chunks"])
    # An empty document produces no requests at all (range below is empty).
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        response = client.embeddings.create(input=batch, model="text-embedding-ada-002")
        # Each returned item carries the index of the input it belongs to;
        # sort on it so vectors line up with `batch` regardless of the order
        # in which the response lists them.
        ordered = sorted(response.data, key=lambda item: item.index)
        collection.upsert(
            ids=[f"{doc['id']}_chunk{start + offset}" for offset in range(len(batch))],
            documents=batch,
            embeddings=[item.embedding for item in ordered],
        )

# Process all PDF files with multiprocessing
def process_all_pdfs(directory_path):
    """Extract, chunk, embed and store every PDF found in ``directory_path``.

    Files are matched on a case-insensitive ``.pdf`` suffix and processed in
    sorted order so repeated runs behave the same way. Extraction/chunking is
    fanned out to a process pool when more than one worker is useful;
    embedding storage then runs sequentially in the calling process.

    Args:
        directory_path: directory to scan (non-recursive) for PDF files.

    Returns:
        None. Progress is reported on stdout. Returns early, without creating
        a pool, when the directory contains no PDFs.
    """
    pdf_files = sorted(
        os.path.join(directory_path, name)
        for name in os.listdir(directory_path)
        if name.lower().endswith(".pdf")
    )
    print(f"Found {len(pdf_files)} PDF files.")
    if not pdf_files:
        return

    # Never start more workers than there are files; with a single worker the
    # pool would only add start-up and pickling overhead.
    workers = min(cpu_count(), len(pdf_files))
    if workers > 1:
        with Pool(workers) as pool:
            documents = pool.map(process_document, pdf_files)
    else:
        documents = [process_document(path) for path in pdf_files]

    # Store embeddings from this process only. Pool workers would each use
    # their own copy of the module-level `client` / `collection`, i.e. several
    # processes writing to the same persistent store at once.
    # NOTE(review): Chroma's persistent client is reportedly not process-safe
    # -- confirm against the documentation for the installed version.
    for doc in documents:
        generate_and_store_embeddings(doc)

    print("Embeddings created for all documents!")

# Directory containing the PDFs
directory_path = "../pdfs1"

# process_all_pdfs starts a multiprocessing pool. Keep the call behind a
# __main__ guard so that worker processes which re-import this module (the
# "spawn"/"forkserver" start methods) do not re-run the pipeline themselves.
# Inside a notebook kernel __name__ is "__main__", so the cell still runs.
if __name__ == "__main__":
    process_all_pdfs(directory_path)


Found 9 PDF files.


In [3]:
# Function to query ChromaDB for relevant chunks
def query_documents(question, n_results=5):
    """Return the distinct chunks retrieved from the collection for ``question``.

    Args:
        question: query text passed to ``collection.query``.
        n_results: number of results requested from the collection.

    Returns:
        A list of chunk texts with duplicates removed, in the order the
        collection returned them (first occurrence wins).
    """
    results = collection.query(query_texts=[question], n_results=n_results)
    # One query text was sent, so the matches are in the first result list.
    ranked_chunks = results["documents"][0]
    # dict.fromkeys deduplicates while keeping insertion order; a set would
    # scramble the retrieval order and make the prompt context vary per run.
    return list(dict.fromkeys(ranked_chunks))

# Function to generate a response from the retrieved chunks
def generate_response(question, relevant_chunks, model="gpt-3.5-turbo"):
    """Answer ``question`` with a chat model, grounded in ``relevant_chunks``.

    The chunks are joined with blank lines into a context section of the
    system prompt; the question is sent both inside that prompt and as the
    user message.

    Args:
        question: the user's question.
        relevant_chunks: iterable of context strings (e.g. the output of
            ``query_documents``).
        model: chat-completions model name; defaults to the model this
            notebook has always used.

    Returns:
        The content of the first choice's message in the response.
    """
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are a financial assistant. Use the following context to answer the question. "
        "If the answer is unclear, say 'I don't know.'\n\n"
        f"Context:\n{context}\n\nQuestion: {question}"
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": question},
        ],
    )
    return response.choices[0].message.content


In [16]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "What were 3M's net sales in 2015, and how did they compare to 2014 across different business segments?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
3M's net sales in 2015 were $30,274 million. Comparing this to 2014:
- Industrial segment: Net sales decreased from $7,686 million in 2014 to $7,578 million in 2015.
- Safety and Graphics segment: There was a decrease in net sales from $7,712 million in 2014 to $7,298 million in 2015.
- Health Care segment: Net sales decreased from $7,303 million in 2014 to $6,447 million in 2015.
- Electronics and Energy segment: Net sales were not explicitly provided in the context given.


In [17]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "What is the breakdown of 3M’s geographic sales performance in 2015, and how did the foreign currency exchange rate impact these results?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
In 2015, 3M's geographic sales performance breakdown based on organic local-currency sales growth was as follows:
- The United States saw a decline of 2.1 percent.
- Latin America/Canada experienced growth of 1.5 percent.
- Asia Pacific had growth of 0.9 percent.
- EMEA (Europe, Middle East, and Africa) had growth of 0.8 percent.

The foreign currency exchange rate negatively impacted 3M's sales performance in 2015. Foreign currency translation reduced sales by 6.8 percent, which was significant and had a noticeable effect on the overall results.


In [18]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "What were the main drivers of sales growth in the Health Care business segment, and how did the acquisitions of Ivera Medical Corp. and Treo Solutions contribute?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
The main drivers of sales growth in the Health Care business segment were organic local-currency sales growth in developing markets and sales growth in geographic regions such as Asia Pacific, Latin America/Canada, the United States, and EMEA.

The acquisitions of Ivera Medical Corp. and Treo Solutions contributed to this growth by adding to the overall sales figures. Specifically, the acquisition of Ivera Medical Corp. in March 2015 and Treo Solutions LLC in April 2014 both provided sales growth related to their respective product offerings in the healthcare industry. Additionally, the acquisitions helped in expanding 3M's presence in the healthcare market by offering new products and services to healthcare payers and providers.


In [19]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "How did restructuring charges impact operating income margins in the Industrial segment, and what were the specific initiatives driving these charges?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
Restructuring charges in the Industrial segment reduced operating income margins by 1.0 percentage points. The specific initiatives driving these charges in the Industrial segment were primarily employee-related costs amounting to $30 million and asset-related costs of $12 million, totaling $42 million.


In [20]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "What percentage of 3M’s total debt was long-term in 2015, and how did the company plan to manage its debt-to-equity ratio moving forward?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
In 2015, 3M's total debt was $4.0 billion higher compared to 2014. The company's long-term debt as of December 31, 2015, was $1.8 billion. To calculate the percentage of long-term debt in 2015, you would divide the long-term debt by the total debt and multiply by 100:

Long-term debt in 2015 = $1.8 billion
Total debt in 2015 = $4.0 billion

Percentage of long-term debt in 2015 = ($1.8 billion / $4.0 billion) * 100 = 45%

As for managing its debt-to-equity ratio moving forward, 3M planned to continue its capital structure strategy, which involved purchasing its own stock and maintaining access to capital markets. This strategy was intended to help ensure that their debt levels were reasonable and in proportion to the total portfolio.


In [21]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "What were the key factors influencing 3M's pension obligations in 2015, and how did the discount rate change affect the projected benefit obligation?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
In 2015, key factors influencing 3M's pension obligations included the change in the method used to estimate the service and interest cost components of the net periodic pension and other postretirement benefit costs, the discontinuation of 3M Retiree Health Care Accounts for new hires as of January 1, 2016, and the re-measurement of the plan in the third quarter of 2015 resulting in a decrease to the projected benefit obligation liability of approximately $233 million. 

The discount rate change from 3 percent to 1.5 percent per year for eligible employees also impacted the projected benefit obligation by reducing it. This reduction in the discount rate led to a decrease in the present value of the future benefit payments, resulting in a lower projected benefit obligation for 3M's pension obligations.


In [22]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "How did 3M's research and development expenditure in 2015 compare to 2014, and which segments benefited most from these investments?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
3M's research and development expenditure decreased from $1.770 billion in 2014 to $1.763 billion in 2015. Despite the decrease in overall expenditure, 3M continued to support its key growth initiatives in research and development. It appears that all of 3M's five business segments benefited from these investments as they all posted operating income margins of approximately 20 percent or more in 2014.


In [23]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "How did acquisitions and divestitures influence 3M's Electronics and Energy segment in 2015, and what were the operational impacts on income margins?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
In 2015, acquisitions and divestitures influenced 3M's Electronics and Energy segment as follows:
- Acquisitions: 3M made acquisitions aligned with its strategic intent, which helped enhance its competitive advantage.
- Divestitures: 3M completed the sale of its static control business in January 2015. This divestiture could have impacted the segment's overall performance.

Operational impacts on income margins in the Electronics and Energy segment in 2015:
- Operating income in the segment decreased by 1.1 percent to $1.1 billion in 2015.
- Incremental investments related to business transformation and global ERP implementation impacted each of the five business segments' 2014 annual operating income margins by approximately 0.2 percentage points compared to 2013.

Please note that specific details or exact percentages related to the acquisitions and divestitures' influence on the segment's financial performance are not provided in the provided context.


In [24]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "What strategies did 3M employ to mitigate commodity price risks and manage raw material shortages in 2015?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")



Generated Answer:
In 2015, 3M employed several strategies to mitigate commodity price risks and manage raw material shortages. These strategies included:

1. Negotiated supply contracts: 3M negotiated contracts with suppliers to secure the necessary raw materials at stable prices, reducing the impact of fluctuating commodity prices.

2. Price protection agreements: 3M utilized price protection agreements to safeguard against sudden price increases in commodities, providing stability in its raw material costs.

3. Forward contracts: By entering into forward contracts, 3M locked in future prices for raw materials, helping to manage price risks and ensure a stable supply of materials for production.

4. Careful management of existing raw material inventories: 3M maintained efficient inventory levels of raw materials to buffer against potential shortages, ensuring uninterrupted manufacturing operations.

5. Development and qualification of additional supply sources: 3M worked towards dive

In [25]:
# Example query: fetch up to five chunks for the question, then print the
# model's answer under a "Generated Answer:" heading.
question = "How did 3M’s share repurchase program affect its financial condition in 2015, and what were the long-term implications for shareholder value?"
relevant_chunks = query_documents(question, n_results=5)
answer = generate_response(question, relevant_chunks)

print("\nGenerated Answer:", answer, sep="\n")


Generated Answer:
3M's share repurchase program in 2015 resulted in the authorization of up to $10 billion for the repurchase of its outstanding common stock. This would have decreased the number of outstanding shares, potentially leading to an increase in earnings per share. By reducing the number of shares available in the market, the repurchase could have a positive impact on the stock price and contribute to long-term shareholder value. Additionally, by utilizing excess cash to buy back shares, the company could signal confidence in its performance and generate returns for shareholders through increased stock value.
