In [26]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import fitz  # PyMuPDF
import pytesseract
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import lancedb
import pyarrow as pa
import numpy as np

# Load the sentence-embedding model shared by the indexing and query cells.
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
print("Embedding model loaded successfully!")

# Connect to the on-disk LanceDB store and describe one row per text chunk.
db = lancedb.connect("./lancedb_vectors")
schema = pa.schema(
    [
        pa.field("id", pa.string()),
        pa.field("text", pa.string()),
        # Fixed-size float32 vector column; 768 is expected to equal the
        # embedding model's output width -- TODO confirm if the model changes.
        pa.field("embedding", pa.list_(pa.float32(), list_size=768)),
    ]
)

# Create the table only when it is missing; otherwise reuse the existing one
# (rows added by earlier runs are kept).
if "document_chunks" not in db.table_names():
    table = db.create_table("document_chunks", schema=schema, mode="overwrite")
    print("Table 'document_chunks' created successfully!")
else:
    table = db.open_table("document_chunks")

# Text extraction function
def _ocr_pdf_pages(file_path, dpi):
    """Rasterise every page of a PDF with PyMuPDF and OCR each bitmap with Tesseract."""
    zoom = dpi / 72  # PDF user space is 72 units per inch
    ocr_texts = []
    with fitz.open(file_path) as pdf_doc:
        for page in pdf_doc:
            pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            # View the raw pixel buffer as a (height, width, channels) uint8 array,
            # which pytesseract accepts directly.
            image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                pixmap.height, pixmap.width, pixmap.n
            )
            ocr_texts.append(pytesseract.image_to_string(image))
    return "\n".join(ocr_texts)


def extract_text_from_pdf(file_path, ocr_dpi=300):
    """Extract the text of a PDF, falling back to OCR when no text is found.

    The embedded text layer is read with pdfplumber first. If that yields only
    whitespace (or fails), each page is rendered to an image and passed to
    Tesseract. The previous fallback handed the PDF path itself to
    ``pytesseract.image_to_string``, which cannot decode PDF files, so scanned
    documents always came back empty.

    Args:
        file_path: Path of the PDF file to read.
        ocr_dpi: Resolution used when rasterising pages for the OCR fallback.

    Returns:
        The extracted text with pages separated by newlines. The result is
        empty or whitespace-only when both extraction strategies fail; errors
        are printed rather than raised.
    """
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # extract_text() can return None, hence the "or".
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"[PDFPlumber Error] {file_path}: {e}")
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next.
    text = "\n".join(page_texts)

    if not text.strip():
        try:
            text = _ocr_pdf_pages(file_path, ocr_dpi)
        except Exception as e:
            print(f"[OCR Error] {file_path}: {e}")
    return text


Embedding model loaded successfully!


In [27]:
# Chunking function with configurable chunk size and overlap
def chunk_text_dynamic(text, chunk_size=800, chunk_overlap=200):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The full document text.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Process a single document
def process_document(file_path):
    """Extract and chunk one PDF.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        A dict with ``"id"`` (the file's base name) and ``"chunks"`` (the list
        of text chunks, empty when no non-whitespace text was extracted).
    """
    extracted = extract_text_from_pdf(file_path)
    document = {"id": os.path.basename(file_path), "chunks": []}
    # Only chunk when there is real content; blank documents keep the empty list.
    if extracted.strip():
        document["chunks"] = chunk_text_dynamic(extracted)
    return document

# Generate and store embeddings in LanceDB
def generate_embeddings_for_chunks(doc):
    """Embed every chunk of a document and append the rows to the LanceDB table.

    Args:
        doc: Dict with ``"id"`` (document identifier) and ``"chunks"`` (list of
            chunk strings), as produced by ``process_document``.

    Returns:
        None. One record per chunk is added to ``table`` with the keys ``id``
        (``"<doc id>_chunk<index>"``), ``text`` and ``embedding``.
    """
    chunk_texts = doc["chunks"]
    # Nothing to embed: skip the model call and avoid handing the table an empty batch.
    if not chunk_texts:
        return
    embeddings = embedding_model.encode(chunk_texts, batch_size=32)
    records = [
        {"id": f"{doc['id']}_chunk{index}", "text": chunk, "embedding": vector.tolist()}
        for index, (chunk, vector) in enumerate(zip(chunk_texts, embeddings))
    ]
    table.add(records)

# Process and embed all PDFs in a directory
def process_pdfs(directory):
    """Extract, chunk and embed every PDF found directly inside ``directory``.

    Extraction and chunking run in a thread pool; embeddings are generated and
    stored in the calling thread as each document finishes.

    Args:
        directory: Folder to scan (not recursive). Files are matched on a
            case-insensitive ``.pdf`` extension and submitted in sorted order.

    Returns:
        None. A document whose processing raises is reported and skipped so
        that one bad file does not abort the whole run.
    """
    pdf_files = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.lower().endswith(".pdf")
    )
    with ThreadPoolExecutor() as executor:
        # Remember which path each future belongs to so failures can be reported.
        future_to_path = {executor.submit(process_document, path): path for path in pdf_files}
        for future in tqdm(as_completed(future_to_path), total=len(future_to_path)):
            try:
                doc = future.result()
            except Exception as e:
                print(f"[Processing Error] {future_to_path[future]}: {e}")
                continue
            if doc["chunks"]:
                generate_embeddings_for_chunks(doc)

# Specify the directory containing PDF files (relative path, resolved against
# the kernel's current working directory).
directory_path = "../pdfs1"
# Extract, chunk and embed every PDF in that directory; the resulting rows are
# appended to `table`, so re-running this cell adds the same chunks again.
process_pdfs(directory_path)


100%|██████████| 9/9 [22:45<00:00, 151.68s/it]  


In [28]:
import openai
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
# Read the API key from the environment; os.getenv returns None when the
# variable is not set.
openai_api_key = os.getenv("OPENAI_API_KEY")
# Shared OpenAI client used by generate_response.
client = openai.OpenAI(api_key=openai_api_key)

# Query LanceDB for relevant chunks
def query_documents(question, top_k=10):
    """Return the text of the stored chunks nearest to ``question``.

    Args:
        question: Natural-language query; it is embedded with ``embedding_model``.
        top_k: Maximum number of rows requested from the vector search.

    Returns:
        A list with the ``text`` value of each row returned by the search.
    """
    query_vector = embedding_model.encode([question])[0].tolist()
    search = table.search(query_vector, vector_column_name="embedding")
    nearest_rows = search.limit(top_k).to_pandas()
    return nearest_rows["text"].tolist()

# Deduplicate and rank chunks
def deduplicate_and_rank_chunks(query, chunks):
    """Drop duplicate chunks and order the rest by similarity to ``query``.

    Args:
        query: The question text; embedded with ``embedding_model``.
        chunks: Candidate chunk strings, possibly containing duplicates.

    Returns:
        The unique chunks sorted by descending dot product between each chunk
        embedding and the query embedding. Chunks with equal scores keep their
        first-seen order. An empty input returns an empty list.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, unlike
    # set(), whose iteration order for strings can change between runs.
    unique_chunks = list(dict.fromkeys(chunks))
    if not unique_chunks:
        # Encoding an empty batch gives an array that cannot be dotted with the
        # query vector, so return early.
        return []
    query_embedding = embedding_model.encode([query])[0]
    chunk_embeddings = embedding_model.encode(unique_chunks)
    # NOTE(review): the dot product equals cosine similarity only if the model
    # returns unit-length vectors -- confirm for the configured model.
    scores = np.dot(chunk_embeddings, query_embedding)
    # Negate and use a stable sort: descending score, deterministic ties.
    ranked_indices = np.argsort(-scores, kind="stable")
    return [unique_chunks[i] for i in ranked_indices]

# Generate a response using OpenAI Chat API
def generate_response(question, chunks):
    """Answer ``question`` with the OpenAI chat API, using ``chunks`` as context.

    Args:
        question: The user's question.
        chunks: Context strings; they are joined with blank lines in the given order.

    Returns:
        The content of the first choice in the chat completion, the fixed string
        "No relevant context found." when the joined context is blank, or an
        "Error generating response: ..." string when the API call raises
        ``openai.APIError``.
    """
    context = "\n\n".join(chunks)
    # Guard clause: do not call the API when there is nothing to ground the answer on.
    if context.strip() == "":
        return "No relevant context found."

    system_prompt = (
        "You are a highly skilled financial analyst specializing in corporate financial reports, "
        "including 10-K filings. Your role is to provide precise, concise, and well-structured answers "
        "to questions based on the provided context. Only use the context provided to answer the questions. "
        "If the context does not contain enough information, state that explicitly. Do not guess or infer beyond "
        "the context. Present your answers in complete sentences and include key details where relevant."
    )
    user_prompt = f"Context: {context}\n\nQuestion: {question}"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    try:
        completion = client.chat.completions.create(model="gpt-4", messages=messages)
        return completion.choices[0].message.content
    except openai.APIError as e:
        # API failures are reported as text instead of being raised.
        return f"Error generating response: {e}"




In [29]:
# Example query: 2015 net sales and operating income compared with 2014.
question = "What were 3M's net sales and operating income for 2015, and how did they change compared to 2014?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

In 2015, 3M's net sales were $30.274 billion and the operating income was $6.946 billion. Compared to 2014, the net sales decreased from $31.821 billion and the operating income also decreased from $7.135 billion.


In [30]:
# Example query: Asia-Pacific vs. EMEA sales performance in 2020.
question = "How did 3M's geographic sales performance differ between Asia-Pacific and EMEA regions in 2020?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context does not provide specific information on 3M's geographic sales performance in the Asia-Pacific and EMEA regions for the year 2020.


In [31]:
# Example query: growth drivers of the healthcare segment in 2017 and 2021.
question = "What were the key drivers of growth in 3M’s healthcare segment in 2017 and 2021?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The key drivers of growth in 3M’s healthcare segment in 2017 were sales increases across all businesses, most notably in drug delivery systems, food safety, and medical consumables. 

In 2021, the key growth drivers included a higher year-on-year rate of dental procedures, continued high demand for biopharma filtration solutions for COVID-related vaccine and therapeutic development, an increase in elective procedure volumes in the first six months of the year, and an improvement in hospital information technology investments.


In [32]:
# Example query: 2016 restructuring actions and their effect on margins.
question = "What restructuring actions did 3M take in 2016, and how did these impact operational margins?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context provided does not contain information about any restructuring actions taken by 3M in 2016 or their impact on operational margins.


In [33]:
# Example query: raw material price inflation in 2021 and mitigation strategies.
question = "How did 3M's raw material price inflation impact profitability in 2021, and what strategies did the company use to mitigate this?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

Based on the context, in 2021, 3M experienced net raw material price inflation. Although the effects on profitability are not explicitly stated, this inflation was likely to increase costs, potentially impacting profitability.

To mitigate the impact of raw material inflation, 3M deployed several strategies. This included the execution of productivity projects aimed to minimize the impact of inflation and market supply challenges. These projects encompassed input management, reformulations, and multi-sourcing activities. The company also managed existing raw material inventories carefully, established strategic relationships with key suppliers, and qualified additional supply sources to handle potential disruptions in its manufacturing operations. However, the exact effect of these strategies on the company's profitability is not provided in the context.


In [34]:
# Example query: 2019-2020 acquisitions affecting the safety and industrial segment.
question = "What acquisitions in 2019 and 2020 strengthened 3M's safety and industrial segment?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context provided does not contain information about any acquisitions made by 3M in 2019 and 2020 that would have strengthened its safety and industrial segment.


In [35]:
# Example query: changes in debt structure between 2015 and 2022.
question = "What were the significant changes in 3M's debt structure between 2015 and 2022?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

3M's debt structure underwent several significant changes both in total debt and net debt from 2015 to 2022. 

1. Total Debt: In 2015, the total debt was $10.797 billion. This increased in 2016 to $11.650 billion. By 2019, total debt had increased significantly to approximately $5.7 billion higher than it was at the end of 2018. The total debt further increased in 2020 due to issuance of $1.75 billion in registered notes. By 2021, the total debt decreased due to the March 2021 early redemption of $450 million in debt and the November 2021 repayment of 600 million euros of Eurobonds. In 2022, the total debt stood at $15.939 billion, marking a reduction of $1.424 billion from 2021.

2. Net Debt: The net debt was valued at $8.872 billion in 2015. The net debt slightly rose in 2016 to $8.955 billion. There is no specific information provided for the net debt in 2019 and 2020. In 2021, the net debt stood at $12.571 billion and by 2022, had decreased to $12.023 billion which signifies a redu

In [36]:
# Example query: impact of the 2022 Health Care spin-off on financial planning.
question = "How did the spin-off of the Health Care segment in 2022 impact the company’s financial planning?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context does not provide specific information on how the spin-off of the Health Care segment in 2022 impacted the company’s financial planning.


In [37]:
# Example query: role of environmental investments in 2020 and 2021.
question = "What role did 3M's environmental investments play in 2020 and 2021?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

Based on the provided context, in 2020, 3M expended approximately $55 million on capital projects for environmental purposes. These projects include waste reduction and pollution control programs, water usage reduction and water quality improvement equipment, scrubbers, containment structures, solvent recovery units, and thermal oxidizers. For 2021, it is estimated that capital expenditures for similar environmental projects would be part of an aggregate expenditure of approximately $400 million planned for 2021 and 2022. However, the exact methodology or breakdown of 3M's environmental investments in 2021 is not provided in the context.


In [38]:
# Example query: how the Consumer business adapted during the COVID-19 pandemic.
question = "How did 3M’s Consumer Business adapt during the COVID-19 pandemic?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context provided does not contain specific information on how 3M’s Consumer Business adapted during the COVID-19 pandemic.


In [39]:
# Example query: R&D expenditure in 2018 and 2020 and its effect on innovation.
question = "What was 3M's R&D expenditure in 2018 and 2020, and how did it impact product innovation?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context provided mentions that 3M's Research and Development (R&D) expenditure was $1.821 billion in 2018. However, there is no information provided in the text regarding R&D expenditure for 2020. The R&D expenditure for 2018 contributed to 3M's continued investment in key initiatives, including R&D aimed at disruptive innovation programs with the potential to create new markets and disrupt existing ones. No specific impact on product innovation for the year 2018 was given in the context. The context also does not provide information pertaining to the impact of R&D expenditure on product innovation in 2020.


In [40]:
# Example query: strategic benefits of the 2022 Food Safety divestiture.
question = "What were the strategic benefits of 3M's divestiture of its Food Safety business in 2022?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context provided does not offer any specific details on the strategic benefits of 3M's divestiture of its Food Safety business in 2022.


In [41]:
# Example query: workforce challenges and diversity in 2020 and 2021.
question = "How did 3M address workforce challenges and diversity in 2020 and 2021?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context does not provide specific information on how 3M addressed workforce challenges and diversity in 2020 and 2021.


In [42]:
# Example query: Safety and Industrial operating income, 2018 vs. 2020.
question = "What were the key differences in operating income for the Safety and Industrial segment between 2018 and 2020?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The operating income for the Safety and Industrial segment showed differences between 2018 and 2020. In 2018, the operating income for the Safety and Industrial sector was $2,737 million. By contrast, in 2020 this had increased to $3,054 million. This represents a growth of approximately 11.6% over the two-year period.


In [43]:
# Example query: effect of the 2019 M*Modal acquisition on health information systems.
question = "How did 3M's acquisition of M*Modal in 2019 enhance its health information systems?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context does not provide specific information on how 3M's acquisition of M*Modal in 2019 enhanced its health information systems.


In [44]:
# Example query: measures taken against supply chain disruptions in 2022.
question = "What measures did 3M take to manage supply chain disruptions in 2022?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context provided does not contain information on the measures that 3M took to manage supply chain disruptions in 2022.


In [45]:
# Example query: currency effects on geographic performance in 2019 and 2021.
question = "How did currency fluctuations affect 3M’s geographic performance in 2019 and 2021?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

In 2019, a stronger U.S. dollar negatively impacted sales. Even after applying the company's hedging strategy, the foreign currency effect was neutral to earnings compared to the previous year. However, the foreign currency translation resulted in an increase in year-on-year sales by 0.2 percent. The specific geographic effect is mentioned for Latin America/Canada, EMEA, and Asia Pacific areas where the translation-related sales increase in EMEA and Asia Pacific was partially offset by decreases in Latin America/Canada.

In 2021, a weaker U.S. dollar had a positive impact on sales compared to the previous year. The foreign currency, net of the Company’s hedging strategy, positively impacted earnings compared to the same period last year. Specific geographic details related to currency fluctuation impacts in 2021 are not mentioned in the provided context.


In [46]:
# Example query: advances in renewable energy components in 2020.
question = "What advances in renewable energy components did 3M achieve in 2020?"
# Retrieve the 15 nearest stored chunks for the question.
chunks = query_documents(question, top_k=15)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context provided does not include any specific information about the advances in renewable energy components that 3M achieved in 2020.


In [47]:
# Example query: Consumer segment use of e-commerce channels in 2021.
question = "How did 3M's Consumer segment utilize e-commerce channels in 2021?"
# Retrieve the 23 nearest stored chunks (a wider retrieval than the top_k=15
# used by the earlier example cells).
chunks = query_documents(question, top_k=23)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context does not provide specific information on how 3M's Consumer segment utilized e-commerce channels in 2021.


In [48]:
# Example query: influence of the sustainability focus on 2022 operations.
question = "How did 3M’s strategic focus on sustainability influence its operations in 2022?"
# Retrieve the 23 nearest stored chunks (a wider retrieval than the top_k=15
# used by the earlier example cells).
chunks = query_documents(question, top_k=23)
# Drop duplicate chunks and order the rest by similarity to the question.
deduplicated_chunks = deduplicate_and_rank_chunks(question, chunks)
# Ask the chat model to answer using only the ranked chunks as context.
print(generate_response(question, deduplicated_chunks))

The context does not provide specific information on how 3M’s strategic focus on sustainability influenced its operations in 2022.
