# In [None]:
import pdfplumber
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import uuid

# Simulated financial data for Apple Inc. (AAPL) for 2023 and 2024.
# Placeholder only: replace with figures actually parsed from the 10-K filings.
# Layout: year (str) -> statement name -> line item -> amount in whole US
# dollars (outflows are negative).
# NOTE(review): for 2024, Gross Margin - R&D - SG&A comes to 116.521B while
# Operating Income is listed as 119.558B (2023 reconciles exactly) -- confirm
# once real data replaces these numbers.
SIMULATED_DATA = {
    "2023": {
        "income_statement": {
            "Total Revenue": 383_285_000_000,
            "Cost of Sales": 214_137_000_000,
            "Gross Margin": 169_148_000_000,
            "Operating Income": 114_301_000_000,
            "Net Income": 96_995_000_000,
            "Research and Development": 29_915_000_000,
            "Selling, General and Administrative": 24_932_000_000,
        },
        "balance_sheet": {
            "Total Assets": 352_583_000_000,
            "Total Liabilities": 290_437_000_000,
            "Total Shareholders Equity": 62_146_000_000,
            "Cash and Cash Equivalents": 29_965_000_000,
            "Marketable Securities": 31_590_000_000,
            "Long-term Debt": 95_281_000_000,
            "Accounts Receivable": 29_508_000_000,
        },
        "cash_flow": {
            "Net Cash from Operating Activities": 110_543_000_000,
            "Net Cash from Investing Activities": -3_705_000_000,
            "Net Cash from Financing Activities": -108_488_000_000,
            "Capital Expenditures": -10_959_000_000,
        },
    },
    "2024": {
        "income_statement": {
            "Total Revenue": 391_678_000_000,
            "Cost of Sales": 218_207_000_000,
            "Gross Margin": 173_471_000_000,
            "Operating Income": 119_558_000_000,
            "Net Income": 100_389_000_000,
            "Research and Development": 31_740_000_000,
            "Selling, General and Administrative": 25_210_000_000,
        },
        "balance_sheet": {
            "Total Assets": 367_555_000_000,
            "Total Liabilities": 298_437_000_000,
            "Total Shareholders Equity": 69_118_000_000,
            "Cash and Cash Equivalents": 30_736_000_000,
            "Marketable Securities": 32_810_000_000,
            "Long-term Debt": 91_207_000_000,
            "Accounts Receivable": 31_235_000_000,
        },
        "cash_flow": {
            "Net Cash from Operating Activities": 115_647_000_000,
            "Net Cash from Investing Activities": -4_567_000_000,
            "Net Cash from Financing Activities": -112_345_000_000,
            "Capital Expenditures": -11_234_000_000,
        },
    },
}

def _excel_as_text(path):
    """Read the workbook at *path*; return (DataFrame, index-free text dump)."""
    frame = pd.read_excel(path)
    return frame, frame.to_string(index=False)


# Load each year's workbook at import time and echo it to stdout as plain
# text. The 2023 file is read and printed before the 2024 file is touched.
df_2023, text_2023 = _excel_as_text("2023.xlsx")
print(text_2023)

df_2024, text_2024 = _excel_as_text("2024.xlsx")
print(text_2024)

# Function to parse PDF filing
def parse_pdf_filing(file_path):
    """Extract the text of every page of a PDF filing.

    Page texts are joined with a newline. Concatenating them with no
    separator let the last line of one page run into the first line of the
    next whenever a page's text did not end in a newline, which corrupts the
    line-based processing done later.

    Args:
        file_path: Path passed to ``pdfplumber.open``.

    Returns:
        The text of all pages read, separated by newlines. This is a
        best-effort reader: if opening or reading the PDF fails, the error is
        printed and whatever was extracted before the failure (possibly an
        empty string) is returned instead of raising.
    """
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # Fall back to "" when a page yields no text (falsy result).
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        # Deliberately broad: any failure degrades to partial/empty output.
        print(f"Error parsing PDF: {e}")
    return "\n".join(page_texts)

# Function to clean extracted text
def clean_text(text):
    """Strip boilerplate from extracted filing text while keeping line structure.

    Steps, in order:
      1. Remove "Page N of M" markers.
      2. Remove lines consisting only of "Table of Contents" or "Form 10-K".
      3. Collapse runs of two or more spaces/tabs into one space.
      4. Collapse every line break, together with blank lines and whitespace
         around it, into a single newline.

    Whitespace collapsing never crosses a newline. Previously ``\\s{2,}`` also
    matched newlines, so a blank line left behind by step 2 (or a space next
    to a line break) merged two adjacent lines into one, which defeats
    line-by-line segmentation of the result.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    text = re.sub(r"Page \d+ of \d+", "", text)  # Remove page numbers
    text = re.sub(r"^\s*Table of Contents\s*$", "", text, flags=re.MULTILINE)  # Remove table of contents
    text = re.sub(r"^\s*Form 10-K\s*$", "", text, flags=re.MULTILINE)  # Remove form headers
    # [^\S\n] is "whitespace except newline": collapse within a line only.
    text = re.sub(r"[^\S\n]{2,}", " ", text)
    # Runs after the removals so the blank lines they leave are collapsed too;
    # also trims whitespace directly before/after each line break.
    text = re.sub(r"[^\S\n]*\n\s*", "\n", text)
    return text.strip()

# Function to segment financial statements
def segment_financials(text):
    """Split filing text into per-statement lists of lines.

    The text is scanned line by line. A line that contains one of the
    statement headings (case-insensitive) switches the current section; the
    heading line itself is not stored. Every other non-empty line is stripped
    and appended to the current section. Lines seen before the first heading
    are dropped.

    The shareholders' equity heading is matched with either an ASCII
    apostrophe (') or a typographic one (U+2019); previously only the
    typographic form matched, so equity lines under an ASCII heading ended up
    in the preceding section.

    Args:
        text: Filing text with lines separated by "\\n".

    Returns:
        A dict with keys "income_statement", "balance_sheet", "cash_flow" and
        "equity", each mapping to a list of stripped lines.
    """
    # (heading in upper case with ASCII apostrophe, section key), checked in order.
    headings = (
        ("CONSOLIDATED STATEMENTS OF OPERATIONS", "income_statement"),
        ("CONSOLIDATED BALANCE SHEETS", "balance_sheet"),
        ("CONSOLIDATED STATEMENTS OF CASH FLOWS", "cash_flow"),
        ("CONSOLIDATED STATEMENTS OF SHAREHOLDERS' EQUITY", "equity"),
    )
    sections = {key: [] for _, key in headings}
    current_section = None

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        # Normalise once per line: upper-case and unify the apostrophe form.
        normalized = line.upper().replace("\u2019", "'")
        matched = next(
            (key for heading, key in headings if heading in normalized), None
        )
        if matched is not None:
            current_section = matched
        elif current_section and line:
            sections[current_section].append(line)

    return sections

# Function to extract financial data (simplified; assumes structured data extraction)
def extract_financial_data(sections, year):
    """Return the financial figures for *year*.

    Currently a stub: ``sections`` is accepted but not read, and the figures
    come from ``SIMULATED_DATA``. A real implementation would pull the numbers
    out of the segmented text (regex or table parsing, e.g.
    pdfplumber.extract_table).

    Returns:
        The ``SIMULATED_DATA`` entry for *year*, or an empty dict when the
        year is not present.
    """
    if year in SIMULATED_DATA:
        return SIMULATED_DATA[year]
    return {}

def _format_billions(amount):
    """Render a dollar amount in billions with two decimals.

    Negative amounts carry the sign in front of the dollar sign
    ("-$3.71 billion") rather than after it ("$-3.71 billion").
    """
    sign = "-" if amount < 0 else ""
    return f"{sign}${abs(amount) / 1e9:.2f} billion"


def _format_percent(part, whole):
    """Render part/whole as a percentage with two decimals, e.g. "40.00%"."""
    return f"{part / whole * 100:.2f}%"


# Function to generate Q/A pairs
def generate_qa_pairs(data):
    """Build question/answer pairs from per-year financial figures.

    For every year in *data* (in iteration order) 25 pairs are produced:
    10 from the income statement, 10 from the balance sheet and 5 from the
    cash-flow statement, in that order.

    Args:
        data: Mapping of year -> {"income_statement": {...},
            "balance_sheet": {...}, "cash_flow": {...}} using the line-item
            names looked up below. Amounts are in dollars.

    Returns:
        A list of {"question": str, "answer": str} dicts.

    Raises:
        KeyError: if a statement or line item is missing for a year.
        ZeroDivisionError: if Total Revenue, Total Shareholders Equity or
            Total Liabilities is zero (they are used as divisors).
    """
    qa_pairs = []
    for year in data:
        income = data[year]["income_statement"]
        balance = data[year]["balance_sheet"]
        cash_flow = data[year]["cash_flow"]

        revenue = income["Total Revenue"]
        cash = balance["Cash and Cash Equivalents"]
        securities = balance["Marketable Securities"]
        # Liquid assets used as the numerator of the simplified current ratio.
        liquid_assets = cash + securities + balance["Accounts Receivable"]
        operating_cash = cash_flow["Net Cash from Operating Activities"]
        capex = cash_flow["Capital Expenditures"]

        entries = [
            # Income Statement Q/A
            (
                f"What was the company’s total revenue in {year}?",
                f"The company’s total revenue in {year} was {_format_billions(revenue)}.",
            ),
            (
                f"What was the company’s cost of sales in {year}?",
                f"The company’s cost of sales in {year} was {_format_billions(income['Cost of Sales'])}.",
            ),
            (
                f"What was the company’s gross margin in {year}?",
                f"The company’s gross margin in {year} was {_format_billions(income['Gross Margin'])}.",
            ),
            (
                f"What was the company’s operating income in {year}?",
                f"The company’s operating income in {year} was {_format_billions(income['Operating Income'])}.",
            ),
            (
                f"What was the company’s net income in {year}?",
                f"The company’s net income in {year} was {_format_billions(income['Net Income'])}.",
            ),
            (
                f"What were the company’s R&D expenses in {year}?",
                f"The company’s R&D expenses in {year} were {_format_billions(income['Research and Development'])}.",
            ),
            (
                f"What were the company’s SG&A expenses in {year}?",
                f"The company’s SG&A expenses in {year} were {_format_billions(income['Selling, General and Administrative'])}.",
            ),
            (
                f"What was the gross margin percentage in {year}?",
                f"The gross margin percentage in {year} was {_format_percent(income['Gross Margin'], revenue)}.",
            ),
            (
                f"What was the operating margin in {year}?",
                f"The operating margin in {year} was {_format_percent(income['Operating Income'], revenue)}.",
            ),
            (
                f"What was the net profit margin in {year}?",
                f"The net profit margin in {year} was {_format_percent(income['Net Income'], revenue)}.",
            ),
            # Balance Sheet Q/A
            (
                f"What were the company’s total assets in {year}?",
                f"The company’s total assets in {year} were {_format_billions(balance['Total Assets'])}.",
            ),
            (
                f"What were the company’s total liabilities in {year}?",
                f"The company’s total liabilities in {year} were {_format_billions(balance['Total Liabilities'])}.",
            ),
            (
                f"What was the company’s shareholders’ equity in {year}?",
                f"The company’s shareholders’ equity in {year} was {_format_billions(balance['Total Shareholders Equity'])}.",
            ),
            (
                f"What was the company’s cash and cash equivalents in {year}?",
                f"The company’s cash and cash equivalents in {year} were {_format_billions(cash)}.",
            ),
            (
                f"What were the company’s marketable securities in {year}?",
                f"The company’s marketable securities in {year} were {_format_billions(securities)}.",
            ),
            (
                f"What was the company’s long-term debt in {year}?",
                f"The company’s long-term debt in {year} was {_format_billions(balance['Long-term Debt'])}.",
            ),
            (
                f"What were the company’s accounts receivable in {year}?",
                f"The company’s accounts receivable in {year} were {_format_billions(balance['Accounts Receivable'])}.",
            ),
            (
                # Computed from long-term debt only, not total debt/liabilities.
                f"What was the debt-to-equity ratio in {year}?",
                f"The debt-to-equity ratio in {year} was {(balance['Long-term Debt'] / balance['Total Shareholders Equity']):.2f}.",
            ),
            (
                # Approximation: cash + securities + receivables over half of
                # total liabilities (the assumption is stated in the question).
                f"What was the current ratio in {year} (assuming current liabilities are half of total liabilities)?",
                f"The current ratio in {year} was {(liquid_assets / (balance['Total Liabilities'] / 2)):.2f}.",
            ),
            (
                f"What was the company’s total cash and investments in {year}?",
                f"The company’s total cash and investments in {year} were {_format_billions(cash + securities)}.",
            ),
            # Cash Flow Q/A
            (
                f"What was the net cash from operating activities in {year}?",
                f"The net cash from operating activities in {year} was {_format_billions(operating_cash)}.",
            ),
            (
                f"What was the net cash from investing activities in {year}?",
                f"The net cash from investing activities in {year} was {_format_billions(cash_flow['Net Cash from Investing Activities'])}.",
            ),
            (
                f"What was the net cash from financing activities in {year}?",
                f"The net cash from financing activities in {year} was {_format_billions(cash_flow['Net Cash from Financing Activities'])}.",
            ),
            (
                f"What were the company’s capital expenditures in {year}?",
                f"The company’s capital expenditures in {year} were {_format_billions(capex)}.",
            ),
            (
                # Capital expenditures are stored as a negative outflow, so
                # adding them subtracts the spend from operating cash.
                f"What was the free cash flow in {year}?",
                f"The free cash flow in {year} was {_format_billions(operating_cash + capex)}.",
            ),
        ]
        qa_pairs.extend(
            {"question": question, "answer": answer} for question, answer in entries
        )

    return qa_pairs

# Main function
def main():
    """Generate Q/A pairs from the simulated data, save them, and preview five.

    The download/parse/segment/extract steps are not wired in yet; the
    figures come straight from ``SIMULATED_DATA``. All generated pairs are
    written to ``financial_qa_pairs.txt`` (relative to the current working
    directory) as numbered "Q<i>:"/"A<i>:" lines, and the first five pairs are
    printed.
    """
    # Step 1: Download filings (not implemented; simulated data is used).
    ticker = "AAPL"  # not used yet; intended for the download step sketched below
    years = ["2023", "2024"]

    # Replace with actual file path or URL for real filings
    # Example: html_content = download_filing(ticker, year)
    # For demonstration, use simulated data
    financial_data = {year: SIMULATED_DATA[year] for year in years}

    # Step 2: Parse and clean (simulated; replace with actual parsing)
    # Example for PDF: text = parse_pdf_filing("path/to/10k.pdf")
    # Example for HTML: text = parse_html_filing(html_content)
    # cleaned_text = clean_text(text)

    # Step 3: Segment (simulated; replace with actual segmentation)
    # sections = segment_financials(cleaned_text)

    # Step 4: Extract data (using simulated data)
    # financial_data = {year: extract_financial_data(sections, year) for year in years}

    # Step 5: Generate Q/A pairs
    qa_pairs = generate_qa_pairs(financial_data)

    # Save Q/A pairs to a file. The encoding is explicit because the text
    # contains typographic apostrophes (U+2019); relying on the platform
    # default can fail or produce locale-dependent bytes.
    with open("financial_qa_pairs.txt", "w", encoding="utf-8") as f:
        for i, pair in enumerate(qa_pairs, 1):
            f.write(f"Q{i}: {pair['question']}\n")
            f.write(f"A{i}: {pair['answer']}\n\n")

    # Print first few Q/A pairs
    for i, pair in enumerate(qa_pairs[:5], 1):
        print(f"Q{i}: {pair['question']}")
        print(f"A{i}: {pair['answer']}\n")

if __name__ == "__main__":
    main()