In [1]:
%pip install pytesseract
%pip install PyMuPDF
%pip install pdf2image
%pip install pandas
%pip install pdfplumber

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m105.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m 

In [5]:
import requests
import pdfplumber
from bs4 import BeautifulSoup
import re
import os
from io import StringIO
from google.colab import drive
import os

import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


# Mount Google Drive
drive.mount('/content/drive')

# Define the base folder where your PDFs are stored in Google Drive
base_folder = "/content/drive/MyDrive/output"
drive_folder = "/content/drive/MyDrive/output"
# Step 2: Convert PDF or HTML to text and extract tables
def convert_to_text_and_tables(file_path, file_type):
    logger.info(f"Converting {file_type} to text and extracting tables from {file_path}")
    text = ""
    tables = []
    page_texts = []
    if file_type == "pdf":
        try:
            with pdfplumber.open(file_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    # Extract text
                    extracted_text = page.extract_text()
                    if extracted_text:
                        text += f"\n[Page {page_num}]\n{extracted_text}\n"
                        page_texts.append((page_num, extracted_text))
                    else:
                        page_texts.append((page_num, ""))
                    # Extract tables with improved settings
                    page_tables = page.extract_tables(table_settings={
                        "vertical_strategy": "lines_strict",
                        "horizontal_strategy": "lines_strict",
                        "snap_tolerance": 3
                    })
                    for table in page_tables:
                        if table and any(row for row in table if any(cell for cell in row)):  # Ensure table is not empty
                            tables.append((page_num, table))
        except Exception as e:
            logger.error(f"PDF extraction failed: {e}")
    elif file_type == "html":
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'html.parser')
                text = soup.get_text(separator=' ', strip=True)
                page_texts.append((1, text))
                # Extract HTML tables
                html_tables = soup.find_all('table')
                for i, html_table in enumerate(html_tables, 1):
                    table_data = []
                    for row in html_table.find_all('tr'):
                        row_data = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
                        if row_data:
                            table_data.append(row_data)
                    if table_data:
                        tables.append((i, table_data))
        except Exception as e:
            logger.error(f"HTML extraction failed: {e}")

    else:
        logger.error(f"Unsupported file type: {file_type}")
    return text, tables, page_texts

# Step 3: Clean text and table data
def clean_text(text):
    logger.info("Cleaning text...")
    # Remove page numbers
    text = re.sub(r'Page \d+ of \d+', '', text)
    # Remove Infosys-specific headers
    text = re.sub(r'Infosys Limited\s+\d{4}\s+Form 20-F', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Infosys Integrated Annual Report \d{4}-\d{2}', '', text, flags=re.IGNORECASE)
    # Remove boilerplate text
    text = re.sub(r'This document contains forward-looking statements.*?(?=\.)', '', text, flags=re.DOTALL)
    text = re.sub(r'Pursuant to the requirements of the Securities Exchange Act.*?(?=\.)', '', text, flags=re.DOTALL)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    logger.info("Text cleaned successfully")
    return text

def clean_table(table):
    logger.info("Cleaning table data...")
    if not table:
        return []
    # Handle None values in cells and normalize
    cleaned_table = []
    for row in table:
        cleaned_row = [str(cell).strip() if cell is not None else '' for cell in row]
        if any(cell for cell in cleaned_row):  # Skip entirely empty rows
            cleaned_table.append(cleaned_row)

    # Remove common table noise
    try:
        cleaned_table = [
            row for row in cleaned_table
            if not re.match(r'(?i)(in millions|note \d+|see accompanying notes|total assets|total liabilities)', ' '.join(row))
        ]
    except Exception as e:
        print(f"Error cleaning table: {e}")
        return []
    logger.info("Table cleaned successfully")
    return cleaned_table

# Step 4: Convert table to text format
def table_to_text(table):
    logger.info("Converting table to text format...")
    if not table:
        return ""
    output = StringIO()
    for row in table:
        cleaned_row = [cell if cell else '' for cell in row]
        output.write('\t'.join(cleaned_row) + '\n')
    logger.info("Table converted to text format successfully")
    return output.getvalue()


# Step 5: Segment text and associate tables with sections
def segment_report(text, tables, page_texts):
    sections = {
        "income_statement": {"text": "", "tables": []},
        "balance_sheet": {"text": "", "tables": []}
    }

    # Enhanced regex patterns
    patterns = {
        "income_statement": r'(?i)(Consolidated Statement of (Profit and Loss|Profit or Loss|Income|Comprehensive Income))\b.*?(?=(Consolidated (Balance Sheet|Statement of (Cash Flows|Financial Position|Changes in Equity))|$))',
        "balance_sheet": r'(?i)(Consolidated Balance Sheet)\b.*?(?=(Consolidated Statement of (Cash Flows|Changes in Equity)|$))',
        #"cash_flow_statement": r'(?i)(Consolidated Statement of Cash Flows)\b.*?(?=(Consolidated Statement of|$))'
    }

    current_section = None
    section_text = []
    section_start_page = 0
    used_tables = set()  # Track used table indices

    # Segment text and associate tables
    for page_num, page_text in page_texts:
        page_text_clean = clean_text(page_text)
        for section, pattern in patterns.items():
            if re.search(pattern, page_text_clean, re.IGNORECASE):
                if current_section and section_text:
                    sections[current_section]["text"] = ' '.join(section_text).strip()
                    # Assign tables from section_start_page to current page
                    for table_page, table in tables:
                        if section_start_page <= table_page < page_num and table_page not in used_tables:
                            cleaned_table = clean_table(table)
                            if cleaned_table:
                                sections[current_section]["tables"].append(cleaned_table)
                                used_tables.add(table_page)
                current_section = section
                section_start_page = page_num
                section_text = [page_text_clean]
                break
        if current_section:
            section_text.append(page_text_clean)

    # Add final section text and tables
    if current_section and section_text:
        sections[current_section]["text"] = ' '.join(section_text).strip()
        for table_page, table in tables:
            if table_page >= section_start_page and table_page not in used_tables:
                cleaned_table = clean_table(table)
                if cleaned_table:
                    sections[current_section]["tables"].append(cleaned_table)
                    used_tables.add(table_page)

    # Log debug output if no sections found
    if not any(sections[section]["text"] or sections[section]["tables"] for section in sections):
        debug_file = f"infosys_debug_{filing.get('year', 'unknown')}.txt"
        print(f"Warning: No sections or tables found. Saving all text and tables to {debug_file}.")
        with open(debug_file, 'w', encoding='utf-8') as f:
            f.write("=== Debug Output ===\n\n")
            f.write("All Text Content:\n")
            f.write(text + "\n\n")
            f.write("All Tables:\n")
            for table_page, table in tables:
                f.write(f"Table on Page {table_page}:\n")
                f.write(table_to_text(clean_table(table)))
                f.write("\n")

    return sections

#Step 6: Save combined text and table data to text files
def save_section_to_text(section_name, section_data, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"=== {section_name.replace('_', ' ').title()} ===\n\n")
        if section_data["text"]:
            f.write("Text Content:\n")
            f.write(section_data["text"] + "\n\n")
        else:
            f.write("Text Content: Not found\n\n")

        f.write("Table Content:\n")
        if section_data["tables"]:
            for i, table in enumerate(section_data["tables"], 1):
                f.write(f"Table {i}:\n")
                f.write(table_to_text(table))
                f.write("\n")
        else:
            f.write("No tables found.\n")

# Main execution
def main():
    global filing
    # Define filings for Infosys 2023 and 2024
    filings = [

        {
            "year": 2023,
            "url": "infosys_2023.pdf",  # Local file from provided document
            "file_path": os.path.join(base_folder, "infosys_2023.pdf"),
            "file_type": "pdf"
        },
        {
            "year": 2024,
            "url": "infosys_2024.pdf",  # Local file from provided document
            "file_path": os.path.join(base_folder, "infosys_2024.pdf"),
            "file_type": "pdf"
        },

    ]

    for filing in filings:
        print(f"\nProcessing Infosys {filing['year']} filing...")

        # Check if file exists locally
        file_path = filing['file_path']


        # Convert to text and extract tables
        print("Converting to text and extracting tables...")
        raw_text, tables, page_texts = convert_to_text_and_tables(file_path, filing['file_type'])
        if not raw_text and not tables:
            print(f"No text or tables extracted for {filing['year']}.")
            continue

        # Clean text
        print("Cleaning text...")
        cleaned_text = clean_text(raw_text)

        # Segment report
        print("Segmenting report...")
        sections = segment_report(cleaned_text, tables, page_texts)

        # Output results and save to files
        for section, content in sections.items():
            print(f"\n=== {filing['year']} {section.replace('_', ' ').title()} ===")
            print("Text:", content["text"][:500] + "..." if content["text"] else "Section text not found.")
            print("Tables:", len(content["tables"]), "tables found.")

            # Save combined text and table data to a single text file
            output_file = os.path.join(drive_folder, f"infosys_{filing['year']}_{section}.txt")
            save_section_to_text(section, content, output_file)



if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Processing Infosys 2023 filing...
Converting to text and extracting tables...
Cleaning text...
Segmenting report...

=== 2023 Income Statement ===
Text: Income tax expense in the consolidated statement of comprehensive income comprises: (Dollars in millions) Year ended March 31, 2023 2022 2021 Current taxes Domestic taxes 830 785 716 Foreign taxes 323 263 185 1,153 1,048 901 Deferred taxes Domestic taxes 54 48 85 Foreign taxes (65) (28) (13) (11) 20 72 Income tax expense 1,142 1,068 973 Income tax expense for fiscal 2023, 2022 and 2021 includes reversals (net of provisions) of $13 million, $36 million and $47 million, respectively. These reversa...
Tables: 2 tables found.

=== 2023 Balance Sheet ===
Text: (1) Financial Services include enterprises in Financial Services and Insurance (2) Retail includes enterprises in Retail, Consumer Packaged Goods and Logis

Construct at least 50 question-answer (Q/A) pairs reflecting the financial data.
Example:
Q: What was the company’s revenue in 2023?
A: The company’s revenue in 2023 was $4.13 billion.

In [6]:
import os
import re

def extract_financial_data_from_text(text):
    """Extracts key financial data points from the text using regex patterns."""
    data = {}

    # Example patterns for text (these will need to be refined based on actual report structure)
    # Look for revenue
    revenue_match = re.search(r'(?i)revenue\s+.*?(\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s+million', text)
    if revenue_match:
        data['Revenue'] = revenue_match.group(1) + " million"

    # Look for net income/profit
    net_income_match = re.search(r'(?i)(?:net income|profit)\s+.*?(\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s+million', text)
    if net_income_match:
        data['Net Income'] = net_income_match.group(1) + " million"

    # Look for total assets
    assets_match = re.search(r'(?i)total assets\s+.*?(\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s+million', text)
    if assets_match:
        data['Total Assets'] = assets_match.group(1) + " million"

    # Look for total liabilities
    liabilities_match = re.search(r'(?i)total liabilities\s+.*?(\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s+million', text)
    if liabilities_match:
        data['Total Liabilities'] = liabilities_match.group(1) + " million"

    # Look for cash and cash equivalents
    cash_match = re.search(r'(?i)cash and cash equivalents\s+.*?(\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s+million', text)
    if cash_match:
        data['Cash and Cash Equivalents'] = cash_match.group(1) + " million"


    return data

def extract_financial_data_from_tables(tables, section_name, year):
    """Extracts financial data points from tables with improved logic."""
    data = {}
    for table in tables:
        if not table:
            continue
        try:
            header = []
            rows = []
            # Assuming the first row is the header
            if table and table[0]:
                header = [cell.strip() if cell else '' for cell in table[0]]
                rows = table[1:]

            # Find the column index for the specified year
            year_col_index = -1
            for i, col_header in enumerate(header):
                if str(year) in col_header:
                    year_col_index = i
                    break

            if year_col_index != -1:
                for row in rows:
                    if row and len(row) > year_col_index:
                        key = row[0].strip() if row[0] else ''
                        value = row[year_col_index].strip() if len(row) > year_col_index and row[year_col_index] else ''

                        if key and value and not re.search(r'(?i)(in millions|note \d+|see accompanying notes|total assets|total liabilities)', key):
                             # Clean and format the key
                            cleaned_key = re.sub(r'\(.*?\)', '', key).strip() # Remove anything in parentheses
                            cleaned_key = re.sub(r'\[.*?\]', '', cleaned_key).strip() # Remove anything in brackets
                            cleaned_key = re.sub(r'\,', '', cleaned_key).strip() # Remove commas

                            # Add the key-value pair if the value is not empty
                            if cleaned_key and value:
                                data[cleaned_key] = value

        except Exception as e:
            print(f"Error processing table: {e}")
            continue
    return data


def generate_qa_pairs(financial_data, year, section_name):
    """Generates Q/A pairs from extracted financial data."""
    qa_pairs = []
    for key, value in financial_data.items():
        # Avoid generating generic questions for keys without meaningful values
        if value and value.strip():
            question = f"According to the {section_name.replace('_', ' ')}, what was the {key.lower()} in {year}?"
            answer = f"The {key.lower()} in {year} was {value}."
            qa_pairs.append({"Q": question, "A": answer})
    return qa_pairs

# Define the folder where the processed text files are saved
processed_data_folder = "/content/drive/MyDrive/output" # Assuming saved in the same folder as the original PDF

all_qa_pairs = []

for filing_year in [2023,2024]: # Process for the year 2023
    for section in ["income_statement", "balance_sheet"]:
        file_name = f"infosys_{filing_year}_{section}.txt"
        file_path = os.path.join(processed_data_folder, file_name)

        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Extract text content from the file
            text_content_match = re.search(r'Text Content:\n(.*?)\n\nTable Content:', content, re.DOTALL)
            text_content = text_content_match.group(1).strip() if text_content_match else ""

            # Extract table content from the file
            table_content_match = re.search(r'Table Content:\n(.*)', content, re.DOTALL)
            table_content = table_content_match.group(1).strip() if table_content_match else ""

            # Parse table content (simple parsing assuming tab-separated values)
            tables_from_file = []
            current_table = []
            if table_content:
                for line in table_content.split('\n'):
                    if line.startswith('Table'):
                        if current_table:
                            tables_from_file.append(current_table)
                        current_table = []
                    elif line.strip(): # Avoid adding empty lines
                         current_table.append(line.split('\t'))
                if current_table:
                    tables_from_file.append(current_table)

            # Extract financial data from the text
            financial_data_from_text = extract_financial_data_from_text(text_content)
            qa_pairs_from_text = generate_qa_pairs(financial_data_from_text, filing_year, section)
            all_qa_pairs.extend(qa_pairs_from_text)

            # Extract financial data from tables
            financial_data_from_tables = extract_financial_data_from_tables(tables_from_file, section, filing_year)
            qa_pairs_from_tables = generate_qa_pairs(financial_data_from_tables, filing_year, section)
            all_qa_pairs.extend(qa_pairs_from_tables)


# Print the generated Q/A pairs
print(f"Generated {len(all_qa_pairs)} Q/A pairs:")
for i, qa in enumerate(all_qa_pairs):
    print(f"{i+1}. Q: {qa['Q']}")
    print(f"   A: {qa['A']}")

# You can further process or save the all_qa_pairs list as needed

Generated 6 Q/A pairs:
1. Q: According to the income statement, what was the revenue in 2023?
   A: The revenue in 2023 was $846 million.
2. Q: According to the income statement, what was the net income in 2023?
   A: The net income in 2023 was $13 million.
3. Q: According to the balance sheet, what was the revenue in 2023?
   A: The revenue in 2023 was $669 million.
4. Q: According to the income statement, what was the revenue in 2024?
   A: The revenue in 2024 was $940 million.
5. Q: According to the income statement, what was the net income in 2024?
   A: The net income in 2024 was $196 million.
6. Q: According to the balance sheet, what was the revenue in 2024?
   A: The revenue in 2024 was $656 million.


New Code

In [7]:
import pdfplumber
import re
import os
from io import StringIO
import logging
from google.colab import drive
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


# Mount Google Drive
drive.mount('/content/drive')

# Define the base folder where your PDFs are stored in Google Drive
base_folder = "/content/drive/MyDrive/output"
qa_folder = "/content/drive/MyDrive/qa"
# Define regex patterns
note_pattern = r'(\d+\.\d+(?:,\s*\d+\.\d+)*\s*(?:and\s*\d+\.\d+)?)?'  # Matches 2.1, 2.12, 2.6, 2.7 and 2.18
number_pattern = r'-?\d{1,3}(?:,\d{3})*(?:\.\d+)?|\(\d{1,3}(?:,\d{3})*\)|\d+\s+\d+'  # Matches 2,861, -4,579, (2), or "2 305"
item_pattern = r'^(.*?)(?:\s+(' + note_pattern + r'))?\s*(' + number_pattern + r')?\s*(' + number_pattern + r')?\s*$'  # Flexible matching
header_pattern_balance = r'Consolidated Balance Sheet as of March 31'  # Identifies the balance sheet section
header_pattern_income = r'Consolidated Statements of Comprehensive Income for the years ended March 31'  # Identifies the income statement section

# Initialize debug log
debug_log = []

def process_line(line):
    """Process a single line using regex to extract item, note, and values."""
    line = line.strip()
    if not line:
        return None
    # Check if line is a section header (e.g., ASSETS, Current assets, Revenues)
    if line.isupper() or any(keyword in line.lower() for keyword in ["assets", "liabilities", "equity", "investments", "capital", "revenues", "expenses", "income","profit","loss","earnings","baisc","tax","net""current"]):
        return f"\n{line}"
    # Try to match line items with regex
    try:
        match = re.match(item_pattern, line, re.IGNORECASE)
        if match:
            groups = match.groups()
            if len(groups) > 4:
                debug_log.append(f"Too many groups in line: '{line}' -> {groups}")
                return None
            item, note, val_2024, val_2023 = (groups + (None, None, None, None))[:4]  # Pad with None
            item = item.strip() if item else ""
            note = note if note else ""
            val_2024 = val_2024 if val_2024 else ""
            val_2023 = val_2023 if val_2023 else ""
            if item and (val_2024 or val_2023 or note):
                return f"{item}\t{note}\t{val_2024}\t{val_2023}"
            else:
                debug_log.append(f"Skipped line (no meaningful data): '{line}'")
        else:
            debug_log.append(f"No match for line: '{line}'")
    except Exception as e:
        debug_log.append(f"Error processing line '{line}': {str(e)}")
    return None

def convert_to_text_and_tables(file_path, file_type):
    logger.info(f"Converting {file_type} to text and extracting tables from {file_path}")
    text = ""
    tables = []
    page_texts = []
    balance_sheet_lines = ["Consolidated Balance Sheet as of March 31 (Dollars in millions except equity share data)"]
    balance_sheet_lines.append("Item\tNote\t2024\t2023")
    income_statement_lines = ["Consolidated Statements of Comprehensive Income for the years ended March 31"]
    income_statement_lines.append("Item\tNote\t2024\t2023")

    if file_type == "pdf":
        try:
            with pdfplumber.open(file_path) as pdf:
                found_balance_sheet = False
                found_income_statement = False
                for page_num, page in enumerate(pdf.pages, 1):
                    # Extract text with layout=True to preserve formatting
                    extracted_text = page.extract_text(layout=True)
                    if extracted_text:
                        text += f"\n[Page {page_num}]\n{extracted_text}\n"
                        page_texts.append((page_num, extracted_text))
                        # Process text for balance sheet or income statement if header is found
                        if re.search(header_pattern_balance, extracted_text, re.IGNORECASE):
                            found_balance_sheet = True
                            for line in extracted_text.splitlines():
                                processed = process_line(line)
                                if processed:
                                    balance_sheet_lines.append(processed)
                        elif re.search(header_pattern_income, extracted_text, re.IGNORECASE):
                            found_income_statement = True
                            for line in extracted_text.splitlines():
                                processed = process_line(line)
                                if processed:
                                    income_statement_lines.append(processed)
                    else:
                        page_texts.append((page_num, ""))

                    # Extract tables as fallback
                    page_tables = page.extract_tables(table_settings={
                        "vertical_strategy": "lines_strict",
                        "horizontal_strategy": "lines_strict",
                        "snap_tolerance": 3,
                        "join_tolerance": 3
                    })
                    for table in page_tables:
                        if table and any(row for row in table if any(cell for cell in row if cell)):
                            tables.append((page_num, table))
                            logger.info(f"Extracted table on page {page_num}")
                            # Process table rows for balance sheet or income statement if header was found
                            if found_balance_sheet:
                                for row in table:
                                    row_text = " ".join(str(cell) for cell in row if cell)
                                    processed = process_line(row_text)
                                    if processed:
                                        balance_sheet_lines.append(processed)
                            elif found_income_statement:
                                for row in table:
                                    row_text = " ".join(str(cell) for cell in row if cell)
                                    processed = process_line(row_text)
                                    if processed:
                                        income_statement_lines.append(processed)

                if not found_balance_sheet:
                    logger.warning("Consolidated Balance Sheet not found in the PDF")
                if not found_income_statement:
                    logger.warning("Consolidated Statements of Comprehensive Income not found in the PDF")
        except Exception as e:
            logger.error(f"PDF extraction failed: {e}")

    return text, tables, page_texts, balance_sheet_lines, income_statement_lines

def clean_table(table):
    logger.info("Cleaning table data...")
    if not table:
        return []
    cleaned_table = []
    for row in table:
        cleaned_row = [str(cell).strip() if cell is not None else '' for cell in row]
        if any(cell for cell in cleaned_row):
            # Process each row as a line to apply regex
            row_text = " ".join(cleaned_row)
            processed = process_line(row_text)
            if processed:
                cleaned_table.append(processed.split("\t"))
    logger.info("Table cleaned successfully")
    return cleaned_table

def table_to_text(table):
    logger.info("Converting table to text format...")
    if not table:
        return ""
    output = StringIO()
    for row in table:
        output.write('\t'.join(str(cell) if cell else '' for cell in row) + '\n')
    logger.info("Table converted to text format successfully")
    return output.getvalue()

def clean_text(text):
    logger.info("Cleaning text...")
    text = re.sub(r'Page \d+ of \d+', '', text)
    text = re.sub(r'Infosys Limited\s+\d{4}\s+Form 20-F', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Infosys Integrated Annual Report \d{4}-\d{2}', '', text, flags=re.IGNORECASE)
    text = re.sub(r'This document contains forward-looking statements.*?(?=\.)', '', text, flags=re.DOTALL)
    text = re.sub(r'Pursuant to the requirements of the Securities Exchange Act.*?(?=\.)', '', text, flags=re.DOTALL)
    text = re.sub(r'\s+', ' ', text).strip()
    logger.info("Text cleaned successfully")
    return text

def segment_report(text, tables, page_texts, balance_sheet_lines, income_statement_lines):
    logger.info("Segmenting report...")
    sections = {
        "balance_sheet": {"text": "", "tables": [], "lines": balance_sheet_lines},
        "income_statement": {"text": "", "tables": [], "lines": income_statement_lines}
    }

    # Enhanced regex patterns
    patterns = {
        "balance_sheet": r'(?i)(Consolidated Balance Sheet as of March 31,)\b.*?(?=(Consolidated Statement of (Cash Flows|Changes in Equity|Comprehensive Income)|$))',
        "income_statement": r'(?i)(Consolidated Statements of Comprehensive Income for the years ended March 31)\b.*?(?=(Consolidated (Balance Sheet|Statement of (Cash Flows|Changes in Equity))|$))'
    }

    current_section = None
    section_text = []
    section_start_page = 0
    used_tables = set()

    for page_num, page_text in page_texts:
        if page_num < 50:  # Skip early pages
            continue
        page_text_clean = clean_text(page_text)
        for section, pattern in patterns.items():
            if re.search(pattern, page_text_clean, re.IGNORECASE):
                logger.info(f"Found {section} on page {page_num}: {page_text_clean[:100]}...")
                if current_section and section_text:
                    sections[current_section]["text"] = ' '.join(section_text).strip()
                    for table_page, table in tables:
                        if section_start_page <= table_page < page_num and table_page not in used_tables:
                            cleaned_table = clean_table(table)
                            if cleaned_table:
                                sections[current_section]["tables"].append(cleaned_table)
                                used_tables.add(table_page)
                                logger.info(f"Assigned table on page {table_page} to {current_section}")
                current_section = section
                section_start_page = page_num
                section_text = [page_text_clean]
                break
        if current_section:
            section_text.append(page_text_clean)

    if current_section and section_text:
        sections[current_section]["text"] = ' '.join(section_text).strip()
        for table_page, table in tables:
            if table_page >= section_start_page and table_page not in used_tables:
                cleaned_table = clean_table(table)
                if cleaned_table:
                    sections[current_section]["tables"].append(cleaned_table)
                    used_tables.add(table_page)
                    logger.info(f"Assigned table on page {table_page} to {current_section}")

    # Log debug output
    debug_file = os.path.join(base_folder, "infosys_debug_2024.txt")
    with open(debug_file, 'w', encoding='utf-8') as f:
        f.write("=== Debug Output ===\n\n")
        f.write("Section Matches:\n")
        for section, content in sections.items():
            f.write(f"{section}:\n")
            f.write(f"Text: {content['text'][:500] + '...' if content['text'] else 'Not found'}\n")
            f.write(f"Tables: {len(content['tables'])} found\n")
            f.write("Extracted Lines:\n")
            f.write('\n'.join(content['lines']) + '\n\n')
        f.write("All Tables:\n")
        for table_page, table in tables:
            f.write(f"Table on Page {table_page}:\n")
            f.write(table_to_text(clean_table(table)))
            f.write("\n")
        f.write("Debug Log:\n")
        for log_entry in debug_log:
            f.write(log_entry + "\n")

    logger.info("Report segmentation completed")
    return sections

def save_section_to_text(section_name, section_data, output_file, output_folder="output"):
    logger.info(f"Saving section {section_name} to {output_file} in folder {output_folder}")
    # Ensure the output folder exists
    #os.makedirs(output_folder, exist_ok=True)
    # Construct the full file path
    full_output_path = os.path.join(base_folder, output_file)

    with open(os.path.join(base_folder, output_file), 'w', encoding='utf-8') as f:
        f.write(f"=== {section_name.replace('_', ' ').title()} ===\n\n")
        f.write("Text Content:\n")
        f.write(section_data["text"] + "\n\n" if section_data["text"] else "Text Content: Not found\n\n")
        f.write("Table Content:\n")
        if section_data["lines"]:
            f.write('\n'.join(section_data["lines"]) + "\n")
        else:
            f.write(f"No {section_name.replace('_', ' ')} lines extracted.\n")
    logger.info(f"Saved section {section_name} successfully to {full_output_path}")


def main():
    global filing
    filings = [
        {
            "year": 2024,
            "url": "infosys_2024.pdf",
            "file_path":  os.path.join(base_folder, "infosys_2024.pdf"),
            "file_type": "pdf"
        },
        {
            "year": 2023,
            "url": "infosys_2023.pdf",
            "file_path":  os.path.join(base_folder, "infosys_2023.pdf"),
            "file_type": "pdf"
        }
    ]

    for filing in filings:
        logger.info(f"Processing Infosys {filing['year']} filing...")

        file_path = filing['file_path']
        year = os.path.basename(file_path).split("_")[1].split(".")[0]
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            continue

        # Convert to text and extract tables
        logger.info("Converting to text and extracting tables...")
        raw_text, tables, page_texts, balance_sheet_lines, income_statement_lines = convert_to_text_and_tables(file_path, filing['file_type'])
        if not raw_text and not tables and not balance_sheet_lines and not income_statement_lines:
            logger.error(f"No text, tables, balance sheet, or income statement lines extracted for {filing['year']}.")
            continue

        # Clean text
        logger.info("Cleaning text...")
        cleaned_text = clean_text(raw_text)
        with open(os.path.join(base_folder, "infosys_"+year+"_clean.txt"), "w", encoding="utf-8") as f:
            f.write(cleaned_text)
        logger.info(f"Saved cleaned text to output/infosys_{year}_clean.txt")
        # Segment report
        logger.info("Segmenting report...")
        sections = segment_report(cleaned_text, tables, page_texts, balance_sheet_lines, income_statement_lines)

        # Output results and save to files
        for section, content in sections.items():
            print(f"\n=== {filing['year']} {section.replace('_', ' ').title()} ===")
            print("Text:", content["text"][:500] + "..." if content["text"] else "Section text not found.")
            print("Tables:", len(content["tables"]), "tables found.")
            print("Extracted Lines:", len(content["lines"]), "lines extracted.")

            output_file = f"{section}_{filing['year']}.txt"
            save_section_to_text(section, content, output_file)

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

=== 2024 Balance Sheet ===
Text: Section text not found.
Tables: 0 tables found.
Extracted Lines: 54 lines extracted.

=== 2024 Income Statement ===
Text: Infosys Limited and subsidiaries Consolidated Statements of Comprehensive Income for the years ended March 31, (Dollars in millions except equity share and per equity share data) Note 2024 2023 2022 Revenues 2.11 18,562 18,212 16,311 Cost of sales 12,975 12,709 10,996 Gross profit 5,587 5,503 5,315 Operating expenses: Selling and marketing expenses 842 776 692 Administrative expenses 911 902 868 Total operating expenses 1,753 1,678 1,560 Operating profit 3,834 3,825 3,755 Other income, net 2.16 ...
Tables: 0 tables found.
Extracted Lines: 28 lines extracted.

=== 2023 Balance Sheet ===
Text: Section text not found.
Tables: 0 tables found.
Extracted Lines: 58 lines extracted.

=== 2023 Income Statement ===


QA Generation

In [2]:
import os
import re
from google.colab import drive
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


# Mount Google Drive
drive.mount('/content/drive')

# Define the base folder where your PDFs are stored in Google Drive
base_folder = "/content/drive/MyDrive/output"
qa_folder = "/content/drive/MyDrive/qa"
# Define regex patterns

def read_financial_data(file_path):
    """Read financial data from a text file and parse into a dictionary."""
    logger.info(f"Reading financial data from {file_path}")
    data = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                if line.startswith("Item\tNote\t") or line.startswith("=== ") or line.startswith("Text Content:") or line.startswith("Table Content:"):
                    continue  # Skip headers and metadata
                if line:
                    parts = line.split("\t")
                    if len(parts) >= 3:  # Expect at least Item, Note, and one value
                        item = parts[0].strip().lower()
                        value_2024 = parts[2].strip() if len(parts) > 2 and parts[2] else ""
                        value_2023 = parts[3].strip() if len(parts) > 3 and parts[3] else ""
                        data[item] = {"2024": value_2024, "2023": value_2023}
        logger.info(f"Parsed {len(data)} items from {file_path}")
        return data
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        return {}
    except Exception as e:
        logger.error(f"Error reading {file_path}: {str(e)}")
        return {}

def clean_number(value):
    """Convert a string number (e.g., '2,861', '(2)', '2 305') to a float."""
    if not value:
        return 0.0
    value = value.replace(",", "").replace("(", "-").replace(")", "").replace(" ", "")
    try:
        return float(value)
    except ValueError:
        logger.warning(f"Could not convert '{value}' to float")
        return 0.0

def generate_qa_pairs(balance_sheet_data_2023, balance_sheet_data_2024, income_statement_data_2023, income_statement_data_2024):
    """Generate 50 Q/A pairs based on financial data."""
    logger.info("Generating Q/A pairs...")
    qa_pairs = []

    # Helper function to get value or estimate
    def get_value(data, key, year, default, estimate_factor=1.05):
        value = data.get(key, {}).get(str(year), "")
        if value:
            return clean_number(value)
        logger.warning(f"No {key} found for {year}, using estimate")
        return default * estimate_factor if year == 2024 else default

    # Known 2023 values from Infosys 2023 Form 20-F (in millions USD)
    defaults_2023 = {
        "revenues": 18212, "net income": 2983, "cost of sales": 12792, "gross profit": 5420,
        "operating income": 3947, "operating expenses": 1473, "selling and marketing expenses": 679,
        "general and administrative expenses": 794, "income before income taxes": 4027,
        "income tax expense": 1044, "total assets": 15938, "total liabilities": 5552,
        "shareholders’ equity": 10386, "cash and cash equivalents": 2305, "current assets": 9275,
        "current liabilities": 4242, "property, plant and equipment": 2143, "accounts receivable": 3979,
        "investments": 1251, "retained earnings": 12237, "short-term debt": 164, "long-term debt": 874,
        "prepaid expenses": 389, "deferred tax assets": 278
    }

    # Income Statement Q/A Pairs (20)
    for key, question_template, is_currency in [
        ("revenues", "What was Infosys’s total revenue in {}?", True),
        ("net income", "What was Infosys’s net income in {}?", True),
        ("cost of sales", "What was the cost of sales in {}?", True),
        ("gross profit", "What was Infosys’s gross profit in {}?", True),
        ("operating income", "What was Infosys’s operating income in {}?", True),
        ("operating expenses", "What were Infosys’s operating expenses in {}?", True),
        ("selling and marketing expenses", "What was Infosys’s selling and marketing expenses in {}?", True),
        ("general and administrative expenses", "What was Infosys’s general and administrative expenses in {}?", True),
        ("income before income taxes", "What was Infosys’s income before income taxes in {}?", True),
        ("income tax expense", "What was Infosys’s income tax expense in {}?", True)
    ]:
        for year in [2023, 2024]:
            value = get_value(income_statement_data_2023 if year == 2023 else income_statement_data_2024, key, year, defaults_2023.get(key, 0))
            answer = f"Infosys’s {key} in {year} was {'$' + f'{value:,.0f}' + ' million' if is_currency else f'{value:.1f}%'}" + (f" (verify with {year} Form 20-F)." if year == 2024 else ".")
            qa_pairs.append({"question": question_template.format(year), "answer": answer})

    # Balance Sheet Q/A Pairs (20)
    for key, question_template, is_currency in [
        ("total assets", "What was Infosys’s total assets in {}?", True),
        ("total liabilities", "What was Infosys’s total liabilities in {}?", True),
        ("shareholders’ equity", "What was Infosys’s shareholders’ equity in {}?", True),
        ("cash and cash equivalents", "What was Infosys’s cash and cash equivalents in {}?", True),
        ("current assets", "What was Infosys’s current assets in {}?", True),
        ("current liabilities", "What was Infosys’s current liabilities in {}?", True),
        ("property, plant and equipment", "What was Infosys’s property, plant, and equipment (net) in {}?", True),
        ("accounts receivable", "What was Infosys’s accounts receivable in {}?", True),
        ("investments", "What was Infosys’s investments in {}?", True),
        ("retained earnings", "What was Infosys’s retained earnings in {}?", True)
    ]:
        for year in [2023, 2024]:
            value = get_value(balance_sheet_data_2023 if year == 2023 else balance_sheet_data_2024, key, year, defaults_2023.get(key, 0))
            answer = f"Infosys’s {key} in {year} was {'$' + f'{value:,.0f}' + ' million' if is_currency else f'{value:.1f}%'}" + (f" (verify with {year} Form 20-F)." if year == 2024 else ".")
            qa_pairs.append({"question": question_template.format(year), "answer": answer})

    # Comparative and Ratio Analysis Q/A Pairs (10)
    # Gross Profit Margin
    revenue_2023 = get_value(income_statement_data_2023, "revenues", 2023, defaults_2023["revenues"])
    gross_profit_2023 = get_value(income_statement_data_2023, "gross profit", 2023, defaults_2023["gross profit"])
    revenue_2024 = get_value(income_statement_data_2024, "revenues", 2024, defaults_2023["revenues"])
    gross_profit_2024 = get_value(income_statement_data_2024, "gross profit", 2024, defaults_2023["gross profit"])
    qa_pairs.append({
        "question": "What was the gross profit margin in 2023?",
        "answer": f"Infosys’s gross profit margin in 2023 was {gross_profit_2023 / revenue_2023 * 100:.1f}% (${gross_profit_2023:,.0f} million / ${revenue_2023:,.0f} million)."
    })
    qa_pairs.append({
        "question": "What was the gross profit margin in 2024?",
        "answer": f"Infosys’s gross profit margin in 2024 was approximately {gross_profit_2024 / revenue_2024 * 100:.1f}% (${gross_profit_2024:,.0f} million / ${revenue_2024:,.0f} million, verify with 2024 Form 20-F)."
    })

    # Net Profit Margin
    net_income_2023 = get_value(income_statement_data_2023, "net income", 2023, defaults_2023["net income"])
    net_income_2024 = get_value(income_statement_data_2024, "net income", 2024, defaults_2023["net income"])
    qa_pairs.append({
        "question": "What was Infosys’s net profit margin in 2023?",
        "answer": f"Infosys’s net profit margin in 2023 was {net_income_2023 / revenue_2023 * 100:.1f}% (${net_income_2023:,.0f} million / ${revenue_2023:,.0f} million)."
    })
    qa_pairs.append({
        "question": "What was the estimated net profit margin in 2024?",
        "answer": f"Infosys’s net profit margin in 2024 was approximately {net_income_2024 / revenue_2024 * 100:.1f}% (${net_income_2024:,.0f} million / ${revenue_2024:,.0f} million, verify with 2024 Form 20-F)."
    })

    # Operating Margin
    operating_income_2023 = get_value(income_statement_data_2023, "operating income", 2023, defaults_2023["operating income"])
    operating_income_2024 = get_value(income_statement_data_2024, "operating income", 2024, defaults_2023["operating income"])
    qa_pairs.append({
        "question": "What was Infosys’s operating margin in 2023?",
        "answer": f"Infosys’s operating margin in 2023 was {operating_income_2023 / revenue_2023 * 100:.1f}% (${operating_income_2023:,.0f} million / ${revenue_2023:,.0f} million)."
    })
    qa_pairs.append({
        "question": "What was the estimated operating margin in 2024?",
        "answer": f"Infosys’s operating margin in 2024 was approximately {operating_income_2024 / revenue_2024 * 100:.1f}% (${operating_income_2024:,.0f} million / ${revenue_2024:,.0f} million, verify with 2024 Form 20-F)."
    })

    # Year-over-Year Changes
    net_income_2022 = 2963  # From 2022 Form 20-F
    qa_pairs.append({
        "question": "What was the year-over-year change in net income from 2022 to 2023?",
        "answer": f"Infosys’s net income increased from ${net_income_2022:,.0f} million in 2022 to ${net_income_2023:,.0f} million in 2023, a growth of {(net_income_2023 - net_income_2022) / net_income_2022 * 100:.1f}%."
    })
    qa_pairs.append({
        "question": "What was the estimated year-over-year change in net income from 2023 to 2024?",
        "answer": f"Infosys’s net income increased from ${net_income_2023:,.0f} million in 2023 to approximately ${net_income_2024:,.0f} million in 2024, a growth of {(net_income_2024 - net_income_2023) / net_income_2023 * 100:.1f}% (verify with 2024 Form 20-F)."
    })

    # ROE and ROA
    equity_2023 = get_value(balance_sheet_data_2023, "shareholders’ equity", 2023, defaults_2023["shareholders’ equity"])
    assets_2023 = get_value(balance_sheet_data_2023, "total assets", 2023, defaults_2023["total assets"])
    qa_pairs.append({
        "question": "What was Infosys’s return on equity (ROE) in 2023?",
        "answer": f"Infosys’s ROE in 2023 was {net_income_2023 / equity_2023 * 100:.1f}% (${net_income_2023:,.0f} million / ${equity_2023:,.0f} million)."
    })
    qa_pairs.append({
        "question": "What was Infosys’s return on assets (ROA) in 2023?",
        "answer": f"Infosys’s ROA in 2023 was {net_income_2023 / assets_2023 * 100:.1f}% (${net_income_2023:,.0f} million / ${assets_2023:,.0f} million)."
    })

    logger.info(f"Generated {len(qa_pairs)} Q/A pairs")
    return qa_pairs

def save_qa_pairs(qa_pairs, output_file, output_folder="qa"):
    """Save Q/A pairs to a text file in the specified folder."""
    logger.info(f"Saving Q/A pairs to {output_file} in folder {output_folder}")

    try:
        with open(os.path.join(qa_folder, output_file), 'w', encoding='utf-8') as f:
            f.write("=== Infosys Financial Q/A Pairs ===\n\n")
            for i, pair in enumerate(qa_pairs, 1):
                f.write(f"Q{i}: {pair['question']}\n")
                f.write(f"A{i}: {pair['answer']}\n\n")
        logger.info(f"Saved Q/A pairs to {qa_folder}")
    except Exception as e:
        logger.error(f"Error saving Q/A pairs to {qa_folder}: {str(e)}")

def main():
    """Main function to read financial data and generate Q/A pairs."""
    logger.info("Starting Q/A pair generation...")

    # Define input file paths
    input_files = {
        "balance_sheet_2023": os.path.join(base_folder, "balance_sheet_2023.txt"),
        "balance_sheet_2024": os.path.join(base_folder, "balance_sheet_2024.txt"),
        "income_statement_2023": os.path.join(base_folder, "income_statement_2023.txt"),
        "income_statement_2024": os.path.join(base_folder, "income_statement_2024.txt")
    }

    # Read financial data
    balance_sheet_data_2023 = read_financial_data(input_files["balance_sheet_2023"])
    balance_sheet_data_2024 = read_financial_data(input_files["balance_sheet_2024"])
    income_statement_data_2023 = read_financial_data(input_files["income_statement_2023"])
    income_statement_data_2024 = read_financial_data(input_files["income_statement_2024"])

    # Generate Q/A pairs
    qa_pairs = generate_qa_pairs(
        balance_sheet_data_2023,
        balance_sheet_data_2024,
        income_statement_data_2023,
        income_statement_data_2024
    )

    # Save Q/A pairs to a text file
    save_qa_pairs(qa_pairs,os.path.join(qa_folder, "qa_pairs.txt") )

    # Print summary
    print(f"Generated and saved {len(qa_pairs)} Q/A pairs to output/qa/qa_pairs.txt")

if __name__ == "__main__":
    main()

Mounted at /content/drive




Generated and saved 50 Q/A pairs to output/qa/qa_pairs.txt


Proess data

In [None]:
import os
import logging
import nltk
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import faiss
from uuid import uuid4
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def read_cleaned_text(file_path):
    """Read cleaned text from a file."""
    logger.info(f"Reading cleaned text from {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read().strip()
        logger.info(f"Read {len(text)} characters from {file_path}")
        return text
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        return ""
    except Exception as e:
        logger.error(f"Error reading {file_path}: {str(e)}")
        return ""

def split_into_chunks(text, chunk_size, overlap=10):
    """Split text into chunks of specified token size with overlap."""
    logger.info(f"Splitting text into chunks of {chunk_size} tokens")
    words = word_tokenize(text)
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i + chunk_size]
        chunks.append(" ".join(chunk))
    logger.info(f"Created {len(chunks)} chunks of size {chunk_size}")
    return chunks

def process_financial_data(input_files, chunk_sizes=[100, 400], output_folder="financial_output/chunks"):
    """Process financial data into chunks with metadata."""
    logger.info("Processing financial data...")
    os.makedirs(output_folder, exist_ok=True)
    all_chunks = []

    for file_path in input_files:
        file_name = os.path.basename(file_path)
        section = "balance_sheet" if "balance_sheet" in file_name.lower() else "income_statement"
        year = "2023" if "2023" in file_name else "2024"

        text = read_cleaned_text(file_path)
        if not text:
            continue

        for chunk_size in chunk_sizes:
            chunks = split_into_chunks(text, chunk_size)
            for i, chunk_text in enumerate(chunks):
                chunk_id = str(uuid4())
                metadata = {
                    "file_path": file_path,
                    "section": section,
                    "year": year,
                    "chunk_size": chunk_size,
                    "chunk_index": i
                }
                all_chunks.append({
                    "id": chunk_id,
                    "text": chunk_text,
                    "metadata": metadata
                })

                chunk_file = os.path.join(output_folder, f"chunk_{chunk_id}.txt")
                try:
                    with open(chunk_file, 'w', encoding='utf-8') as f:
                        f.write(f"Chunk ID: {chunk_id}\n")
                        f.write(f"Metadata: {metadata}\n")
                        f.write(f"Text: {chunk_text}\n")
                    logger.info(f"Saved chunk to {chunk_file}")
                except Exception as e:
                    logger.error(f"Error saving chunk {chunk_id}: {str(e)}")

    logger.info(f"Processed {len(all_chunks)} total chunks")
    return all_chunks

def embed_chunks(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed chunks using a sentence transformer model."""
    logger.info(f"Embedding chunks with {model_name}...")
    try:
        model = SentenceTransformer(model_name)
        texts = [chunk["text"] for chunk in chunks]
        embeddings = model.encode(texts, show_progress_bar=True)
        logger.info(f"Generated {len(embeddings)} embeddings")
        return embeddings
    except Exception as e:
        logger.error(f"Error embedding chunks: {str(e)}")
        return np.array([])

def build_faiss_index(embeddings, chunk_ids, output_file="financial_output/faiss_index.bin"):
    """Build and save a FAISS index for dense retrieval."""
    logger.info("Building FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Use Inner Product (cosine similarity after normalization)
    faiss.normalize_L2(embeddings)  # Normalize for cosine similarity
    index.add(embeddings)
    logger.info(f"FAISS index built with {index.ntotal} vectors")

    try:
        faiss.write_index(index, output_file)
        logger.info(f"Saved FAISS index to {output_file}")

        id_mapping_file = output_file.replace(".bin", "_ids.pkl")
        with open(id_mapping_file, 'wb') as f:
            pickle.dump(chunk_ids, f)
        logger.info(f"Saved chunk ID mapping to {id_mapping_file}")
    except Exception as e:
        logger.error(f"Error saving FAISS index: {str(e)}")
    return index

def build_bm25_index(chunks, output_file="financial_output/bm25_index.pkl"):
    """Build and save a BM25 index for sparse retrieval."""
    logger.info("Building BM25 index...")
    tokenized_chunks = [word_tokenize(chunk["text"].lower()) for chunk in chunks]
    bm25 = BM25Okapi(tokenized_chunks)
    logger.info(f"BM25 index built with {len(tokenized_chunks)} documents")

    try:
        with open(output_file, 'wb') as f:
            pickle.dump(bm25, f)
        logger.info(f"Saved BM25 index to {output_file}")
    except Exception as e:
        logger.error(f"Error saving BM25 index: {str(e)}")
    return bm25

def preprocess_query(query):
    """Preprocess a query: clean, lowercase, remove stopwords."""
    logger.info(f"Preprocessing query: {query}")
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(query.lower())
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    processed_query = " ".join(tokens)
    logger.info(f"Processed query: {processed_query}")
    return processed_query, tokens

def hybrid_retrieval(query, chunks, faiss_index, bm25_index, chunk_ids, model, top_n=5, dense_weight=0.6):
    """Retrieve top-N chunks using hybrid dense and sparse retrieval."""
    logger.info(f"Performing hybrid retrieval for query: {query}")

    # Preprocess query
    processed_query, query_tokens = preprocess_query(query)

    # Dense retrieval (FAISS)
    query_embedding = model.encode([processed_query], show_progress_bar=False)[0]
    faiss.normalize_L2(query_embedding.reshape(1, -1))
    distances, indices = faiss_index.search(query_embedding.reshape(1, -1), top_n)
    dense_scores = {chunk_ids[i]: float(distances[0][j]) for j, i in enumerate(indices[0])}

    # Sparse retrieval (BM25)
    bm25_scores = bm25_index.get_scores(query_tokens)
    top_bm25_indices = np.argsort(bm25_scores)[::-1][:top_n]
    sparse_scores = {chunk_ids[i]: float(bm25_scores[i]) for i in top_bm25_indices}

    # Normalize scores
    max_dense = max(dense_scores.values(), default=1.0) or 1.0
    max_sparse = max(sparse_scores.values(), default=1.0) or 1.0
    normalized_dense = {k: v / max_dense for k, v in dense_scores.items()}
    normalized_sparse = {k: v / max_sparse for k, v in sparse_scores.items()}

    # Combine scores (weighted fusion)
    combined_scores = {}
    all_ids = set(dense_scores.keys()) | set(sparse_scores.keys())
    for chunk_id in all_ids:
        dense_score = normalized_dense.get(chunk_id, 0.0)
        sparse_score = normalized_sparse.get(chunk_id, 0.0)
        combined_scores[chunk_id] = dense_weight * dense_score + (1 - dense_weight) * sparse_score

    # Get top-N results
    top_chunks = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    results = [
        {"chunk_id": chunk_id, "score": score, "chunk": next(c for c in chunks if c["id"] == chunk_id)}
        for chunk_id, score in top_chunks
    ]

    logger.info(f"Retrieved {len(results)} chunks for query")
    return results

def main():
    """Main function to process data, build indices, and demonstrate hybrid retrieval."""
    logger.info("Starting data processing, indexing, and retrieval...")

    # Define input file paths
    input_files = [
        "financial_output/balance_sheet_2023.txt",
        "financial_output/balance_sheet_2024.txt",
        "financial_output/income_statement_2023.txt",
        "financial_output/income_statement_2024.txt"
    ]

    # Process financial data into chunks
    chunks = process_financial_data(input_files, chunk_sizes=[100, 400])
    if not chunks:
        logger.error("No chunks generated, exiting.")
        return

    # Embed chunks
    model_name = "all-MiniLM-L6-v2"
    model = SentenceTransformer(model_name)
    embeddings = embed_chunks(chunks, model_name)
    if embeddings.size == 0:
        logger.error("No embeddings generated, exiting.")
        return

    # Build indices
    chunk_ids = [chunk["id"] for chunk in chunks]
    faiss_index = build_faiss_index(embeddings, chunk_ids)
    bm25_index = build_bm25_index(chunks)

    # Demonstrate hybrid retrieval with example queries
    example_queries = [
        "What was Infosys's revenue in 2023?",
        "What are the total assets for 2024?",
        "How did net income change from 2022 to 2023?",
        "What is the gross profit margin?",
        "What are Infosys's current liabilities?"
    ]

    for query in example_queries:
        print(f"\nQuery: {query}")
        results = hybrid_retrieval(query, chunks, faiss_index, bm25_index, chunk_ids, model, top_n=5)
        print(f"Top {len(results)} results:")
        for i, result in enumerate(results, 1):
            print(f"Result {i}:")
            print(f"  Chunk ID: {result['chunk_id']}")
            print(f"  Score: {result['score']:.4f}")
            print(f"  Metadata: {result['chunk']['metadata']}")
            print(f"  Text: {result['chunk']['text'][:100]}...")

    # Print summary
    print(f"\nProcessed {len(chunks)} chunks")
    print(f"Saved FAISS index to financial_output/faiss_index.bin")
    print(f"Saved BM25 index to financial_output/bm25_index.pkl")
    print(f"Chunk files saved in financial_output/chunks/")

if __name__ == "__main__":
    main()
