In [3]:
import os
import pytesseract
import fitz  # PyMuPDF
import pdfplumber
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text as pdfminer_extract_text
from langchain.text_splitter import RecursiveCharacterTextSplitter
import lancedb
import pyarrow as pa
from tabula import read_pdf
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI

# Load OpenAI API key from environment variables.
# load_dotenv() runs first so that os.getenv can also see OPENAI_API_KEY when it
# is supplied through a .env file (python-dotenv); os.getenv returns None when
# the variable is not set at all.
# `client` is the module-level OpenAI client that generate_openai_embedding uses.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Function to generate embeddings using OpenAI
def generate_openai_embedding(text):
    """Return the embedding vector OpenAI produces for ``text``.

    Sends ``text`` to the ``text-embedding-3-small`` model through the
    module-level ``client`` and returns the ``embedding`` attribute of the
    first item in the response's ``data`` list.
    """
    request = {"model": "text-embedding-3-small", "input": text}
    # The response is an object, so its fields are read as attributes.
    first_item = client.embeddings.create(**request).data[0]
    return first_item.embedding

print("OpenAI embedding model initialized successfully!")

# Connect to the on-disk LanceDB store and describe the chunk table layout:
# a string id, the chunk text, and a fixed-size float32 vector of length 1536.
db = lancedb.connect("./lancedb_vectors")
schema = pa.schema([
    pa.field("id", pa.string()),
    pa.field("text", pa.string()),
    pa.field("embedding", pa.list_(pa.float32(), list_size=1536)),
])
# Create the table only when it is missing; otherwise reuse the existing one.
if "document_chunks" not in db.table_names():
    table = db.create_table("document_chunks", schema=schema, mode="overwrite")
    print("Table 'document_chunks' created successfully!")
else:
    table = db.open_table("document_chunks")

# Function to extract text from PDFs using multiple libraries
def _ocr_pdf_pages(file_path, zoom=300 / 72):
    """Rasterize every page of a PDF with PyMuPDF and OCR it with Tesseract.

    Tesseract reads image files, not PDFs, so each page is first rendered to a
    temporary PNG (``zoom`` scales the page; ``300 / 72`` asks for roughly
    300 DPI) and the PNG path is handed to ``pytesseract.image_to_string``.

    Returns the per-page OCR text joined with newlines. Exceptions are not
    handled here; the caller decides how to report them.
    """
    import tempfile  # local import: only the OCR fallback needs it

    page_texts = []
    with tempfile.TemporaryDirectory() as scratch_dir, fitz.open(file_path) as pdf:
        for page_number, page in enumerate(pdf):
            image_path = os.path.join(scratch_dir, f"page_{page_number}.png")
            page.get_pixmap(matrix=fitz.Matrix(zoom, zoom)).save(image_path)
            page_texts.append(pytesseract.image_to_string(image_path))
    return "\n".join(page_texts)


def extract_text_from_pdf(file_path):
    """Extract the text of a PDF, trying several libraries in turn.

    The extractors are tried in this order, and each later one runs only while
    the text gathered so far is empty or whitespace: PyMuPDF, pdfplumber,
    PyPDF2, pdfminer, tabula-py (tables rendered as text) and finally OCR of
    the rendered pages. An extractor that raises is reported with ``print``
    and the next one is tried.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text with surrounding whitespace stripped, or ``""``
        when every method failed or found nothing.
    """
    text = ""

    # Method 1: PyMuPDF
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                text += page.get_text("text")
    except Exception as e:
        print(f"[PyMuPDF Error] {file_path}: {e}")

    # Method 2: PDFPlumber
    if not text.strip():
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # extract_text() may return None for a page without text.
                    text += page.extract_text() or ""
        except Exception as e:
            print(f"[PDFPlumber Error] {file_path}: {e}")

    # Method 3: PyPDF2
    if not text.strip():
        try:
            reader = PdfReader(file_path)
            text = "".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            print(f"[PyPDF2 Error] {file_path}: {e}")

    # Method 4: PDFMiner
    if not text.strip():
        try:
            text = pdfminer_extract_text(file_path)
        except Exception as e:
            print(f"[PDFMiner Error] {file_path}: {e}")

    # Method 5: Tabula-py for extracting tables
    if not text.strip():
        try:
            tables = read_pdf(file_path, pages='all', multiple_tables=True, pandas_options={"header": None})
            text = "\n".join(df.to_string(index=False) for df in tables)
        except Exception as e:
            print(f"[Tabula-py Error] {file_path}: {e}")

    # Fallback: OCR using PyTesseract.
    # The PDF path itself cannot be given to Tesseract (it only reads images),
    # so the pages are rendered to PNG files first.
    if not text.strip():
        try:
            text = _ocr_pdf_pages(file_path)
        except Exception as e:
            print(f"[OCR Error] {file_path}: {e}")

    return text.strip()

# Function for chunking text dynamically
def chunk_text(text, chunk_size=800, chunk_overlap=200):
    """Split ``text`` into overlapping chunks.

    Builds a ``RecursiveCharacterTextSplitter`` with the given ``chunk_size``
    and ``chunk_overlap`` and returns the result of its ``split_text`` call
    on ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Function to filter non-informative chunks
def filter_chunks(chunks):
    """Drop chunks that are too short to be informative.

    A chunk is kept only when it has more than 30 characters once leading and
    trailing whitespace is stripped. Kept chunks are returned unmodified and
    in their original order.
    """
    kept = []
    for candidate in chunks:
        if len(candidate.strip()) <= 30:
            continue
        kept.append(candidate)
    return kept

# Status message for the notebook: reaching this line only means the
# definitions above ran without raising; no PDF has been read yet.
print("Text extraction utilities initialized successfully!")


OpenAI embedding model initialized successfully!
Table 'document_chunks' created successfully!
Text extraction utilities initialized successfully!


In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Process a single document and extract chunks
def process_document(file_path):
    """Extract, chunk and filter the text of one PDF.

    Returns a dict with ``"id"`` (the base name of ``file_path``) and
    ``"chunks"`` (the filtered chunk list, or an empty list when no text
    could be extracted).
    """
    extracted = extract_text_from_pdf(file_path)
    document = {"id": os.path.basename(file_path), "chunks": []}
    if extracted.strip():
        document["chunks"] = filter_chunks(chunk_text(extracted))
    return document

# Generate embeddings for document chunks and add to LanceDB
def generate_embeddings_for_chunks(doc):
    """Embed every chunk of ``doc`` and append the records to the LanceDB table.

    ``doc`` is a dict with ``"id"`` and ``"chunks"``. Each chunk is embedded
    with ``generate_openai_embedding`` (one call per chunk, shown on a tqdm
    progress bar), then all records are written with a single ``table.add``.
    A failure in ``table.add`` is reported with ``print`` rather than raised.
    """
    chunk_texts = doc["chunks"]
    doc_id = doc["id"]
    vectors = [
        generate_openai_embedding(piece)
        for piece in tqdm(chunk_texts, desc=f"Embedding chunks for {doc_id}")
    ]
    # Record ids are "<document id>_chunk<position>", positions starting at 0.
    records = [
        {"id": f"{doc_id}_chunk{position}", "text": piece, "embedding": vector}
        for position, (piece, vector) in enumerate(zip(chunk_texts, vectors))
    ]
    try:
        table.add(records)
        print(f"Added {len(records)} records for {doc_id} to LanceDB.")
    except Exception as e:
        print(f"Error adding records to LanceDB for {doc_id}: {e}")

# Process all PDFs in a directory and store embeddings
def process_pdfs(directory_path):
    """Extract, embed and store every PDF found directly in ``directory_path``.

    Files are selected by a case-insensitive ``.pdf`` extension (so ``.PDF``
    is included) and submitted in sorted order. Text extraction runs in a
    thread pool via ``process_document``; as each document finishes, its
    chunks are embedded and stored with ``generate_embeddings_for_chunks`` in
    the calling thread.

    A document whose extraction raises is reported with ``print`` and skipped,
    so one bad file does not abort the rest of the batch. Documents that
    yield no chunks are skipped silently.

    Args:
        directory_path: Directory to scan (not recursive).
    """
    pdf_files = sorted(
        os.path.join(directory_path, name)
        for name in os.listdir(directory_path)
        if name.lower().endswith(".pdf")
    )
    with ThreadPoolExecutor() as executor:
        # Remember which file each future belongs to so failures can name it.
        future_to_path = {
            executor.submit(process_document, file_path): file_path
            for file_path in pdf_files
        }
        for future in tqdm(as_completed(future_to_path), total=len(future_to_path), desc="Processing PDFs"):
            file_path = future_to_path[future]
            try:
                # result() re-raises whatever the worker raised.
                doc = future.result()
            except Exception as e:
                print(f"[Processing Error] {file_path}: {e}")
                continue
            if doc["chunks"]:
                generate_embeddings_for_chunks(doc)

# Specify the directory containing PDF files
# NOTE(review): the path is relative, so it presumably depends on the
# notebook's working directory — confirm before running.
directory_path = "../pdfs1"  # Replace with the correct directory path
# Runs the whole extract -> chunk -> embed -> store pipeline for the directory.
# Records are appended with table.add, so re-running this cell looks like it
# would store the same chunks a second time — TODO confirm and guard if so.
process_pdfs(directory_path)


Embedding chunks for 3M_2021_10K.pdf: 100%|██████████| 1035/1035 [08:32<00:00,  2.02it/s]
Processing PDFs:  11%|█         | 1/9 [08:33<1:08:31, 513.97s/it]

Added 1035 records for 3M_2021_10K.pdf to LanceDB.


Embedding chunks for 3M_2022_10K.pdf: 100%|██████████| 1607/1607 [13:24<00:00,  2.00it/s]
Processing PDFs:  22%|██▏       | 2/9 [21:58<1:19:53, 684.83s/it]

Added 1607 records for 3M_2022_10K.pdf to LanceDB.


Embedding chunks for 3M_2023Q2_10Q.pdf: 100%|██████████| 633/633 [04:56<00:00,  2.13it/s]
Processing PDFs:  33%|███▎      | 3/9 [26:55<50:45, 507.59s/it]  

Added 633 records for 3M_2023Q2_10Q.pdf to LanceDB.


Embedding chunks for 3M_2015_10K.pdf: 100%|██████████| 1014/1014 [08:03<00:00,  2.10it/s]
Processing PDFs:  44%|████▍     | 4/9 [34:58<41:30, 498.01s/it]

Added 1014 records for 3M_2015_10K.pdf to LanceDB.


Embedding chunks for 3M_2019_10K.pdf: 100%|██████████| 1162/1162 [09:34<00:00,  2.02it/s]
Processing PDFs:  56%|█████▌    | 5/9 [44:32<35:02, 525.57s/it]

Added 1162 records for 3M_2019_10K.pdf to LanceDB.


Embedding chunks for 3M_2017_10K.pdf: 100%|██████████| 1045/1045 [08:15<00:00,  2.11it/s]
Processing PDFs:  67%|██████▋   | 6/9 [52:48<25:46, 515.43s/it]

Added 1045 records for 3M_2017_10K.pdf to LanceDB.


Embedding chunks for 3M_2016_10K.pdf: 100%|██████████| 1212/1212 [09:48<00:00,  2.06it/s]
Processing PDFs:  78%|███████▊  | 7/9 [1:02:36<17:58, 539.23s/it]

Added 1212 records for 3M_2016_10K.pdf to LanceDB.


Embedding chunks for 3M_2018_10K.pdf: 100%|██████████| 1034/1034 [08:12<00:00,  2.10it/s]
Processing PDFs:  89%|████████▉ | 8/9 [1:10:49<08:44, 524.36s/it]

Added 1034 records for 3M_2018_10K.pdf to LanceDB.


Embedding chunks for 3M_2020_10K.pdf: 100%|██████████| 1022/1022 [08:03<00:00,  2.11it/s]
Processing PDFs: 100%|██████████| 9/9 [1:18:52<00:00, 525.87s/it]

Added 1022 records for 3M_2020_10K.pdf to LanceDB.





In [9]:
from openai import OpenAI
import pandas as pd
from dotenv import load_dotenv
from rapidfuzz import fuzz  # Using rapidfuzz for improved performance

# Load OpenAI API key
# NOTE(review): this repeats the load_dotenv()/OpenAI(...) setup from the first
# cell and rebinds the module-level `client`. The cell still relies on `os`,
# `table` and generate_openai_embedding from the earlier cells.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Query LanceDB for relevant chunks
def query_documents(question, top_k=20):
    """Return the text of the ``top_k`` stored chunks closest to ``question``.

    The question is embedded with ``generate_openai_embedding`` (OpenAI), the
    vector is searched against the ``embedding`` column of ``table``, and the
    ``text`` column of the limited result is returned as a list.
    """
    query_vector = generate_openai_embedding(question)
    search = table.search(query=query_vector, vector_column_name="embedding")
    matches = search.limit(top_k).to_pandas()
    return matches["text"].tolist()

# Generate a response using GPT
def generate_response(question, chunks):
    """Ask the chat model to answer ``question`` from the retrieved ``chunks``.

    The chunks are joined with blank lines into one context block and sent,
    together with the question, as the user message; a fixed system prompt
    sets the financial-analyst persona and answering rules. Returns the
    content of the first choice in the response.
    """
    persona = (
        "You are a highly skilled financial analyst specializing in corporate financial reports. "
        "Your goal is to provide precise and concise responses to multi-hop questions about 10-K "
        "filings. You have multiple pieces of context from which you can summarize and integrate facts. "
    )
    rules = (
        "1. Combine relevant data from all context blocks.",
        "2. Provide numeric results and references when needed.",
        "3. If the context is insufficient or contradictory, state so.",
        "4. Be concise and directly address the multi-part question.",
        "5. Do not guess beyond the provided context.",
    )
    system_prompt = persona + "Instructions:\n" + "\n".join(rules)

    context = "\n\n".join(chunks)
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"},
    ]
    completion = client.chat.completions.create(messages=conversation, model="gpt-4")
    first_choice = completion.choices[0]
    return first_choice.message.content

# Filter questions for 3M
def filter_3m_questions(json_file_path):
    """Load a JSON Lines question file and keep only the 3M rows.

    A row is kept when its ``doc_name`` contains ``"3M"``, ignoring case;
    rows with a missing ``doc_name`` are dropped. Returns the filtered
    DataFrame with the original index labels.
    """
    questions = pd.read_json(json_file_path, lines=True)
    mentions_3m = questions["doc_name"].str.contains("3M", case=False, na=False)
    return questions.loc[mentions_3m]

# Evaluate model against filtered 3M questions
def evaluate_model(json_file_path, output_file_path, top_k=20, threshold=85):
    """Answer every 3M question in a benchmark file and score the answers.

    For each row returned by ``filter_3m_questions`` the function retrieves
    ``top_k`` chunks, asks the chat model for an answer and compares it with
    the reference answer using ``fuzz.partial_ratio`` on lower-cased text.
    An answer is labelled ``"Correct"`` when the score is strictly greater
    than ``threshold`` and ``"Incorrect"`` otherwise.

    A missing reference or model answer (``None`` or NaN) is scored 0 instead
    of raising, and non-string values are converted with ``str`` before
    comparison, so one odd row cannot discard the results gathered so far.

    Args:
        json_file_path: JSON Lines file with ``doc_name``, ``question`` and
            ``answer`` fields.
        output_file_path: Destination JSON Lines file for the results.
        top_k: Number of chunks retrieved per question.
        threshold: Similarity score an answer must exceed to count as correct.

    Returns:
        None. Results are written to ``output_file_path``, one record per
        question, with the original (unconverted) answers.
    """
    def _is_missing(value):
        # None, or a float NaN (NaN is the only float unequal to itself).
        return value is None or (isinstance(value, float) and value != value)

    df = filter_3m_questions(json_file_path)
    results = []

    for _, row in df.iterrows():
        question = row["question"]
        correct_answer = row["answer"]
        chunks = query_documents(question, top_k=top_k)
        model_answer = generate_response(question, chunks)

        # Evaluate answer accuracy using fuzzy matching
        if _is_missing(correct_answer) or _is_missing(model_answer):
            similarity = 0
        else:
            similarity = fuzz.partial_ratio(str(correct_answer).lower(), str(model_answer).lower())
        evaluation = "Correct" if similarity > threshold else "Incorrect"

        results.append({
            "question": question,
            "correct_answer": correct_answer,
            "model_answer": model_answer,
            "evaluation": evaluation,
            "similarity_score": similarity
        })

    # Save results to JSONL file
    pd.DataFrame(results).to_json(output_file_path, orient="records", lines=True)
    print(f"Evaluation results saved to {output_file_path}")

# Evaluate against the JSON file
# Input: JSON Lines question file (read with lines=True in filter_3m_questions).
# Output: one JSON record per evaluated question, written by evaluate_model.
# NOTE(review): both paths are relative, so they presumably depend on the
# notebook's working directory — confirm before running.
json_file_path = "../data/financebench_open_source.jsonl"
output_file_path = "../data/evaluation_results.jsonl"
evaluate_model(json_file_path, output_file_path)


Evaluation results saved to ../data/evaluation_results.jsonl
