In [4]:
import os
import PyPDF2
import pdfplumber
from PIL import Image
import io

def extract_text_from_pdf(pdf_path):
    """Extract text content from a PDF."""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        return " ".join(page.extract_text() for page in reader.pages)

def extract_metadata_from_pdf(pdf_path):
    """Extract metadata from a PDF."""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        return reader.metadata

def extract_tables_from_pdf(pdf_path):
    """Extract tables from a PDF using pdfplumber."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables_on_page = page.extract_tables()
            for table in tables_on_page:
                tables.append(table)
    return tables


# Directory containing PDFs
input_dir = "/Users/yuhang/Downloads/OneDrive - The University of Chicago/MSFM /BofA Project Lab/sample_reports"
pdf_files = [f for f in os.listdir(input_dir) if f.endswith(".pdf")]

# Initialize a container for extracted data
extracted_data = {}

# Process each PDF file
for pdf_file in pdf_files:
    pdf_path = os.path.join(input_dir, pdf_file)
    print(f"Processing {pdf_file}...")
    
    text = extract_text_from_pdf(pdf_path)
    metadata = extract_metadata_from_pdf(pdf_path)
    tables = extract_tables_from_pdf(pdf_path)

    extracted_data[pdf_file] = {
        "text": text,
        "metadata": metadata,
        "tables": tables,
    }

# Save extracted data or process further
for pdf, data in extracted_data.items():
    print(f"--- Data for {pdf} ---")
    print(f"Metadata: {data['metadata']}")
    print(f"Text: {data['text'][:500]}...")  # Print first 500 characters of text
    print(f"Tables: {len(data['tables'])} table(s) found.")


Processing Nvidia.pdf...
Processing Bank of America.pdf...
Processing Meta.pdf...
Processing Tesla.pdf...
Processing AMC.pdf...
Processing Apple.pdf...
Processing Netflix.pdf...
Processing Google.pdf...
Processing Morgan Stanley.pdf...
Processing Amazon.pdf...
--- Data for Nvidia.pdf ---
Metadata: {'/Title': 'F2Q25 Preview — Increase FY26+ Estimates; Investor Sentiment Focused on Path to $5/sh.+ EPS. Reiterate OW; $155 PT', '/Author': 'Aaron Rakers', '/Subject': 'Earnings Revised', '/Creator': 'BlueMatrix', '/Producer': 'Apache FOP Version 2.8', '/CreationDate': "D:20240819050011-04'00'"}
Text:    Equity Research
Earnings Revised — August 19, 2024
 
Semiconductors
Nvidia Corporation (NVDA)
F2Q25 Preview — Increase FY26+ Estimates; Investor Sentiment Focused on Path to $5/sh.+ 
EPS. Reiterate OW; $155 PT
 
 
Our Call
Positive AI server demand data points + increasing investor optimism points to $26-$27B
+ DC upside bogey. Raise FY26+ ests on anticipated Blackwell  ramp; investor focus o

In [11]:
import json
import os

def prepare_finetune_json(extracted_data, output_file="finetune_data.json"):
    """
    Convert extracted data into JSON format for GPT fine-tuning.

    Args:
        extracted_data (dict): The extracted data with text, metadata, tables, and images.
        output_file (str): Path to the JSON file to save the output.
    """
    finetune_data = []

    for pdf, data in extracted_data.items():
        # Add metadata as a prompt-completion pair
        if data.get("metadata"):
            metadata_prompt = f"I have the following data table from a query on '{pdf}' financials and market data. Please analyze the data to output metadata analysis for this company in the style of an equity analyst, highlighting trends, comparing company to competitors, and noting any interesting insights regarding this data."
            metadata_completion = f"This is the metadata extracted for '{pdf}':\n{json.dumps(data['metadata'], indent=2)}"
            finetune_data.append({"prompt": metadata_prompt, "completion": metadata_completion})

        # Add extracted text as a prompt-completion pair
        if data.get("text"):
            text_prompt = f"I have the following data table from a query on '{pdf}' financials and market data. Please analyze the data to output text analysis for this company in the style of an equity analyst, highlighting trends, comparing company to competitors, and noting any interesting insights regarding this data." 
            text_completion = f"This is a detailed text analysis for '{pdf}':\n{json.dumps(data['text'], indent=2)}"
            finetune_data.append({"prompt": text_prompt, "completion": text_completion})

        # Add tables as prompt-completion pairs
        for i, table in enumerate(data.get("tables", [])):
            table_prompt = f"I have the following data table from a query on '{pdf}' financials and market data. Please analyze the data to outpt tables for this company in the style of an equity analyst, highlighting trends, comparing company to competitors, and noting any interesting insights regarding this data."
            table_completion = f"This is a detailed table {i+1}:\n{json.dumps(table, indent=2)} for '{pdf}'."
            finetune_data.append({"prompt": table_prompt, "completion": table_completion})


    # Save the structured data to a JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(finetune_data, f, indent=4, ensure_ascii=False)

    print(f"Fine-tuning data saved to {output_file}")


prepare_finetune_json(extracted_data, "finetune_data.json")


Fine-tuning data saved to finetune_data.json


In [12]:
import json

def convert_json_to_jsonl(json_file, jsonl_file):
    """
    Convert a JSON file into JSONL format.

    Args:
        json_file (str): Path to the input JSON file.
        jsonl_file (str): Path to the output JSONL file.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)  # Load the JSON data

    with open(jsonl_file, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f, ensure_ascii=False)  # Convert each JSON object to a string
            f.write("\n")  # Add a newline after each JSON object

    print(f"JSONL file saved to {jsonl_file}")

# Example usage
json_file = "finetune_data.json"
jsonl_file = "finetune_data.jsonl"
convert_json_to_jsonl(json_file, jsonl_file)


JSONL file saved to finetune_data.jsonl
