## Imports and Utility Functions

In [137]:
import re
import json
from pathlib import Path
import pandas as pd # For potential table handling later
from markdown_it import MarkdownIt # For Markdown parsing

# --- Configuration ---
BASE_OUTPUT_DIR = Path("/Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs")
JSON_OUTPUT_SUBDIR = "hybrid_json_outputs"
JSON_OUTPUT_DIR = BASE_OUTPUT_DIR / JSON_OUTPUT_SUBDIR
JSON_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Utility Functions ---

def find_contract_files(base_dir):
    """
    Finds all .md contract files within '...(Contract)' subdirectories.
    Yields Path objects to the .md files.
    """
    print(f"Searching for contract folders in: {base_dir.resolve()}")
    found_any = False
    for item in base_dir.iterdir():
        if item.is_dir() and '(Contract)' in item.name:
            contract_folder = item
            md_files = list(contract_folder.glob('*.md'))
            if md_files:
                if len(md_files) > 1:
                    print(f"  Warning: Multiple .md files in {contract_folder.name}. Using first: {md_files[0].name}")
                yield md_files[0] # Yield the first .md file found
                found_any = True
            else:
                print(f"  Warning: No .md file found in {contract_folder.name}")
    if not found_any:
        print("  No contract folders with .md files found.")

def read_markdown_file(md_filepath):
    """Reads content from a markdown file."""
    try:
        with open(md_filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        print(f"  Successfully read {len(content)} characters from {md_filepath.name}.")
        return content
    except Exception as e:
        print(f"  Error reading file {md_filepath.name}: {e}")
        return None

In [139]:
import re
import calendar 
from collections import defaultdict

def extract_rates_and_unit(sections, tables, contract_data_dict):
    notes = contract_data_dict.get("_notes", [])
    rates = []
    primary_unit = None 

    # --- Consolidate Search Logic ---
    search_sources_for_rates = [] # This will be a list of (name, content) tuples
    
    # Use a dictionary to add sections by name to avoid duplicates and maintain some order
    sections_to_search_map = {}

    # 1. Prioritize specifically named sections
    ordered_section_keywords = ["contract price", "purchase price", "pricing", "rate"] 
    for keyword in ordered_section_keywords:
        for section_name, section_content in sections.items():
            if keyword in section_name.lower():
                if section_name not in sections_to_search_map: 
                    sections_to_search_map[section_name] = section_content
                    print(f"  Rates: Added priority section '{section_name}' for rates.")
    
    # 2. If no highly specific sections found, or to broaden, check content of other sections
    if not sections_to_search_map or not any(kw in name.lower() for name in sections_to_search_map for kw in ["contract price", "purchase price"]):
        print("  Rates: Broadening search or no primary rate sections found by name. Checking section content.")
        for section_name, section_content in sections.items():
            if section_name not in sections_to_search_map: # Only check if not already prioritized
                if any(key_term in section_content.lower() for key_term in ["price is", "price:", " rate:", "fixed at", "$/", "¢/", " per "]):
                    sections_to_search_map[section_name] = section_content
                    print(f"  Rates: Added section '{section_name}' based on content clues.")

    # Convert map to the list of tuples for iteration
    if sections_to_search_map:
        for name, content in sections_to_search_map.items():
            search_sources_for_rates.append((name, content))
    else: # search all sections if no candidates found yet
        print("  Rates: No specific or content-cued sections for rates. Searching all available section content.")
        for name, content in sections.items():
            search_sources_for_rates.append((name, content))
    
    if not search_sources_for_rates and markdown_content: # Should not happen if sections exist
         print("  Rates: No section content to search. This is unexpected.")
         search_sources_for_rates.append(("markdown_content_fallback", markdown_content))


    # --- Regex Patterns ---
    rate_pattern_general = re.compile(
        r"^(?:[\s\-*]*)"
        r"(?P<tier_prefix>(?:\*{0,2}Tier\s+\w+\*{0,2}\s*:\s*)?)" 
        r"\*{0,2}(Fixed Price|Contract price|Index Price)\*{0,2}" 
        r"\s*(?:\((?P<currency_in_label>[\$¢])?\s*/\s*(?P<unit_in_label>\w+)\))?" 
        r"\*{0,2}\s*[:\-]\s*" 
        r"(?P<description_before_math>.*?)" 
        r"(?P<math_operator>[+\-])?\s*" 
        r"(?P<currency_before_value>[\$¢])?" 
        r"\s*(?P<value>[\d.,]+)" 
        r"(?:\s*(?:per|/)\s*(?P<unit_after_value>KWh|MMBTU|Dth|Therm)\b)?"
        r"(?:\s*\((?P<unit_in_paren_after_value>\w+)\))?",
        re.IGNORECASE | re.MULTILINE
    )

    cents_rate_pattern = re.compile(
        r"(?P<value>[\d.,]+)\s+(?:¢|cents)\s*(?:per|/)\s*(?P<unit>KWh|MMBTU|Dth|Therm)\b",
        re.IGNORECASE | re.MULTILINE
    )
    
    unit_context_pattern = re.compile(
        r"(?:Volume|Quantity)\s+(?:per\s+Month\s+)?in\s+(?P<unit>Dths|MMBTUs|kWh|Therms)\b",
        re.IGNORECASE
    )

    processed_rate_matches = set()

    # --- Main Regex Loop ---
    for section_name_for_debug, text_block in search_sources_for_rates:
        for match in rate_pattern_general.finditer(text_block):
            match_text = match.group(0).strip()
            if match_text in processed_rate_matches: continue
            processed_rate_matches.add(match_text)   
            print(f"\nDEBUG MATCH (In Section: '{section_name_for_debug}'):")
            print(f"  Line Matched by Regex: '{match_text}'")
            print(f"  Iterating over groups for this match:")
            for group_name_val in rate_pattern_general.groupindex.keys():
                try:
                    group_content = match.group(group_name_val)
                    print(f"    Group ({group_name_val}): '{group_content}'")
                except IndexError: 
                    print(f"    Group ({group_name_val}): DID NOT PARTICIPATE (IndexError)") 
            print("--- End of Groups for this Match ---")

            rate_value_str = match.group("value").replace(',', '').strip()
            if not rate_value_str: 
                continue

            tier_name_full = match.group("tier_prefix")
            tier_name = None
            if tier_name_full:
                tier_match_extract = re.search(r"Tier\s+\w+", tier_name_full, re.IGNORECASE)
                if tier_match_extract:
                    tier_name = tier_match_extract.group(0).strip()

            current_rate_unit = None
            current_currency = None

            # Determine Unit and Currency from different parts of the match
            if match.group("unit_in_label"):
                current_rate_unit = match.group("unit_in_label").upper()
                if match.group("currency_in_label"): # Currency found within the ($/kwh) part
                    current_currency = match.group("currency_in_label")
            elif match.group("unit_after_value"):
                current_rate_unit = match.group("unit_after_value").upper()
            elif match.group("unit_in_paren_after_value"):
                current_rate_unit = match.group("unit_in_paren_after_value").upper()

            # If currency not found in label, check before value
            if not current_currency and match.group("currency_before_value"):
                current_currency = match.group("currency_before_value")
            
            if not current_currency: current_currency = '$' # Default if none found

            if not current_rate_unit:
                print(f"    Rate value '{rate_value_str}' found, but unit not directly parsed with it. Line: '{match_text}'. Description: '{match.group('description_before_math')}' Skipping.")
                notes.append(f"Rate value '{rate_value_str}' (in section '{section_name_for_debug}') found but no unit directly parsed; skipped.")
                continue

            # Standardize unit
            std_unit = current_rate_unit
            if current_rate_unit == "DTHS": std_unit = "DTH"
            elif current_rate_unit == "MMBTUS": std_unit = "MMBTU"
            elif current_rate_unit == "KWH": std_unit = "KWH" 
            elif current_rate_unit == "THERMS": std_unit = "THERM"
            current_rate_unit = std_unit


            try:
                rate_value_float = float(rate_value_str)
                rate_display_unit = f"{current_currency}/{current_rate_unit}"

                if current_currency == '¢':
                    rate_value_float /= 100.0
                    rate_display_unit = f"$/{current_rate_unit}"
                
                rate_entry = {
                    "rate": f"{rate_value_float:.5f}" if current_currency == '¢' else str(rate_value_float),
                    "unit_full_string": rate_display_unit,
                    "base_unit": current_rate_unit
                }
                if tier_name: rate_entry["tier"] = tier_name
                
                if rate_entry not in rates:
                    rates.append(rate_entry)
                    print(f"    Found Rate: {(tier_name + ': ') if tier_name else ''}{rate_entry['rate']} {rate_entry['unit_full_string']}")
                    if not primary_unit: primary_unit = current_rate_unit
            except ValueError:
                notes.append(f"Could not parse rate value '{rate_value_str}' for unit '{current_rate_unit}'.")
                print(f"    Warning: Could not parse rate value '{rate_value_str}' from: {match_text}")
            except Exception as e:
                print(f"    Error processing rate: {e} from: {match_text}")

        # Try Cents Pattern
        for match in cents_rate_pattern.finditer(text_block):
            match_text = match.group(0).strip()
            is_duplicate_from_general = False
            rate_value_str_cents = match.group("value").replace(',', '').strip()
            unit_cents = match.group("unit").upper() 
            try:
                val_cents_float = float(rate_value_str_cents) / 100.0
                for r_existing in rates: 
                    if r_existing["base_unit"] == unit_cents and \
                       abs(float(r_existing["rate"]) - val_cents_float) < 0.00001:
                        is_duplicate_from_general = True; break
            except: pass 

            if is_duplicate_from_general : continue

            current_rate_unit_cents = unit_cents
            if current_rate_unit_cents == "DTHS": current_rate_unit_cents = "DTH" # etc.

            try:
                rate_value_cents_float = float(rate_value_str_cents)
                rate_value_dollars = rate_value_cents_float / 100.0
                rate_entry = {
                    "rate": f"{rate_value_dollars:.5f}",
                    "unit_full_string": f"$/{current_rate_unit_cents}",
                    "base_unit": current_rate_unit_cents
                }
                if rate_entry not in rates: 
                    rates.append(rate_entry)
                    print(f"    Found Cents Rate: {rate_value_cents_float} cents/{current_rate_unit_cents} -> {rate_entry['rate']} {rate_entry['unit_full_string']}")
                    if not primary_unit: primary_unit = current_rate_unit_cents
            except ValueError:
                notes.append(f"Could not parse cents rate '{rate_value_str_cents}'.")
                print(f"    Warning: Could not parse cents rate '{rate_value_str_cents}' from: {match_text}")

    # --- Infer Primary Unit of Measurement if not set by rates ---
    if not primary_unit:
        all_text_for_unit_context = " ".join(sections.values()) 
        context_match = unit_context_pattern.search(all_text_for_unit_context)
        if context_match:
            unit_from_context = context_match.group("unit").upper()
            if unit_from_context in ["DTHS", "DTH"]: primary_unit = "DTH"
            elif unit_from_context == "MMBTUS": primary_unit = "MMBTU"
            elif unit_from_context == "KWH": primary_unit = "KWH"
            elif unit_from_context == "THERMS": primary_unit = "THERM"
            if primary_unit:
                notes.append(f"Primary UoM '{primary_unit}' inferred from document context.")
                print(f"    Inferred Primary UoM from context: {primary_unit}")

    if not primary_unit:
        all_text_content = " ".join(sections.values()).upper()
        if "ELECTRIC" in all_text_content: primary_unit = "KWH"; notes.append("UoM 'KWH' from 'ELECTRIC' keyword.")
        elif "GAS" in all_text_content: primary_unit = "MMBTU"; notes.append("UoM 'MMBTU' from 'GAS' keyword.")
        if primary_unit: print(f"    Inferred Primary UoM by keyword: {primary_unit}")
    
    if not rates:
        notes.append("No contracted rate information found.")
        print("    Warning: No contracted rates found.")
    
    contract_data_dict['contracted_rates'] = rates
    contract_data_dict['unit_of_measurement'] = primary_unit
    contract_data_dict['_notes'] = notes

In [141]:
from dateutil.parser import parse as parse_date_flexible
from dateutil.relativedelta import relativedelta

def normalize_date_robust(date_str):
    """Attempts to parse a date string into YYYY-MM-DD, handles more cases."""
    if not date_str or not isinstance(date_str, str):
        return None
    try:
        date_str_cleaned = re.sub(r"(\d+)(?:st|nd|rd|th)\s+day\s+of\s+", "", date_str, flags=re.IGNORECASE)
        dt = parse_date_flexible(date_str_cleaned)
        return dt.strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError) as e:
        print(f"    Debug: normalize_date_robust failed for '{date_str}': {e}")
        return None

def calculate_end_date_from_term(start_date_str, months_str):
    """Calculates end date from start date and term in months."""
    if not start_date_str or not months_str:
        return None
    try:
        start_date = parse_date_flexible(start_date_str)
        months = int(months_str)
        end_date = start_date + relativedelta(months=months) - relativedelta(days=1) 
        return end_date.strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        return None

# --- New Extraction Function ---
def extract_contract_duration(sections, tables, contract_data_dict):
    """
    Extracts contract start and end dates.
    Populates contract_data_dict['contract_duration'] and contract_data_dict['_notes'].
    """
    notes = contract_data_dict.get("_notes", [])
    start_date = None
    end_date = None
    term_months_val = None

    # --- Define Search Space ---
    # Prioritize specific sections if they exist, otherwise search all section content
    search_texts = []    
    term_section_keys = [k for k in sections if "term" in k.lower() and "termination" not in k.lower()] # Avoid "Early Termination"
    delivery_period_keys = [k for k in sections if "delivery period" in k.lower()]
    contract_info_keys = [k for k in sections if "contract information" in k.lower()]
    
    if delivery_period_keys:
        search_texts.append(sections[delivery_period_keys[0]])
        print(f"  Duration: Searching in '{delivery_period_keys[0]}' section.")
    if term_section_keys:
        search_texts.append(sections[term_section_keys[0]])
        print(f"  Duration: Searching in '{term_section_keys[0]}' section.")
    if contract_info_keys:
        search_texts.append(sections[contract_info_keys[0]])
        print(f"  Duration: Searching in '{contract_info_keys[0]}' section.")
    
    # Fallback to searching all section content if specific sections aren't found or don't yield results
    if not search_texts:
        print("  Duration: No specific duration sections found, searching all section content.")
        search_texts.extend(sections.values()) # Search text of all sections

    # Regex for dates
    date_val_pattern = r"([\w\s,]+\d{1,2}[,\s]+(?:19|20)\d{2}|\d{1,2}[-/]\d{1,2}[-/](?:19|20)?\d{2})"
    date_val_pattern matches: "Month Day, Year", "Day Month, Year", "MM/DD/YYYY", "MM-DD-YY", etc.

    start_patterns = [
        re.compile(rf"Begin\s*[:*]*\s*{date_val_pattern}", re.IGNORECASE),
        re.compile(rf"Start Date\s*[:*]*\s*{date_val_pattern}", re.IGNORECASE),
        re.compile(rf"Effective\s*(?:the date of this contract or date of first gas deliveries available thereafter)?\s*(?:for|on)?\s*{date_val_pattern}", re.IGNORECASE), # Example 1 "effective the date..."
    ]
    end_patterns = [
        re.compile(rf"End\s*[:*]*\s*{date_val_pattern}", re.IGNORECASE),
        re.compile(rf"End Date\s*[:*]*\s*{date_val_pattern}", re.IGNORECASE),
    ]
    term_patterns = [
        re.compile(r"for\s+(\d+)\s+months", re.IGNORECASE), # Example 1 "for 36 months"
        re.compile(r"Term.*?for\s+(\d+)\s+months", re.IGNORECASE | re.DOTALL),
    ]
    executed_date_pattern = re.compile(rf"Executed this\s+{date_val_pattern}", re.IGNORECASE) # Example 1

    # --- Extraction Logic ---
    for text_block in search_texts:
        if not start_date:
            for pattern in start_patterns:
                match = pattern.search(text_block)
                if match:
                    start_date = normalize_date_robust(match.group(1)) # Group 1 is date_val_pattern
                    if start_date: break 
            if start_date: print(f"    Start Date found: {start_date}")
        
        if not end_date:
            for pattern in end_patterns:
                match = pattern.search(text_block)
                if match:
                    end_date = normalize_date_robust(match.group(1))
                    if end_date: break
            if end_date: print(f"    End Date found: {end_date}")

        if not term_months_val: # Only look for term if end_date isn't directly found
            for pattern in term_patterns:
                match = pattern.search(text_block)
                if match:
                    term_months_val = match.group(1)
                    if term_months_val: break
            if term_months_val: print(f"    Term (months) found: {term_months_val}")
        
        if start_date and end_date: # Stop if both found
            break
    
    if not start_date:
        for text_block in search_texts: # Re-search if not found in primary search
            match = executed_date_pattern.search(text_block)
            if match:
                start_date = normalize_date_robust(match.group(1))
                if start_date:
                    print(f"    Start Date (from Executed this): {start_date}")
                    break
    
    # Calculate end_date if start_date and term_months are found but no end_date
    if start_date and term_months_val and not end_date:
        end_date = calculate_end_date_from_term(start_date, term_months_val)
        if end_date:
            print(f"    End Date (calculated from term): {end_date}")
            notes.append(f"End date calculated from {term_months_val}-month term.")
        else:
            notes.append(f"Found start date and term ({term_months_val} months), but failed to calculate end date.")

    # Update contract_data_dict
    contract_data_dict['contract_duration']['start_date'] = start_date
    contract_data_dict['contract_duration']['end_date'] = end_date

    if not start_date and not end_date and not term_months_val:
        notes.append("Contract duration (start, end, or term) not found.")
        print("    Contract duration details not found.")
    elif not start_date:
        notes.append("Contract start date not found.")
    elif not end_date:
        notes.append("Contract end date (or term to calculate it) not found.")

    contract_data_dict['_notes'] = notes

SyntaxError: invalid syntax (2577684604.py, line 63)

In [None]:
import re
import calendar 
from collections import defaultdict

# Helper function to process a regex match and add to rates list
def _process_rate_match(match, match_type_debug, contract_data_dict, rates_list_ref, primary_unit_ref_list):
    notes = contract_data_dict.get("_notes", [])
    section_name_for_debug = contract_data_dict.get("_current_section_name_debug", "Unknown Section")

    is_ex1_contract_price_section = ("Example 1 (Contract)" in contract_data_dict.get("contract_id", "") and "Contract Price" in section_name_for_debug)
    if is_ex1_contract_price_section: 
        print(f"\nDEBUG MATCH (Ex1 - In Section: '{section_name_for_debug}', Pattern: {match_type_debug}):")
        print(f"  Line Matched by Regex: '{match.group(0).strip()}'")
        print(f"  Iterating over groups for this match:")
        for group_name_val_key in match.re.groupindex.keys():
            try:
                group_content = match.group(group_name_val_key)
                print(f"    Group ({group_name_val_key}): '{group_content}'")
            except IndexError: 
                print(f"    Group ({group_name_val_key}): DID NOT PARTICIPATE") 
        print("--- End of Groups for this Match ---")

    rate_value_str = match.group("value").replace(',', '').strip()
    if not rate_value_str: 
        return False 

    tier_name = None
    if "tier_prefix" in match.re.groupindex and match.group("tier_prefix"):
        tier_match_extract = re.search(r"Tier\s+\w+", match.group("tier_prefix"), re.IGNORECASE)
        if tier_match_extract:
            tier_name = tier_match_extract.group(0).strip()

    current_rate_unit = None
    current_currency = None

    if "unit_in_label" in match.re.groupindex and match.group("unit_in_label"):
        current_rate_unit = match.group("unit_in_label").upper()
        if "currency_in_label" in match.re.groupindex and match.group("currency_in_label"):
            current_currency = match.group("currency_in_label")
    elif "unit_after_value" in match.re.groupindex and match.group("unit_after_value"):
        current_rate_unit = match.group("unit_after_value").upper()
    elif "unit_val_in_paren" in match.re.groupindex and match.group("unit_val_in_paren"): # For (Unit)
        current_rate_unit = match.group("unit_val_in_paren").upper()

    if not current_currency and "currency_before_value" in match.re.groupindex and match.group("currency_before_value"):
        current_currency = match.group("currency_before_value")
    
    if not current_currency: current_currency = '$' 

    if not current_rate_unit:
        print(f"    Rate value '{rate_value_str}' found by {match_type_debug}, but unit not parsed. Line: '{match.group(0).strip()}'. Skipping.")
        notes.append(f"Rate value '{rate_value_str}' (Sec: '{section_name_for_debug}') no unit; skipped by {match_type_debug}.")
        return False

    std_unit = current_rate_unit
    if current_rate_unit == "DTHS": std_unit = "DTH"
    elif current_rate_unit == "MMBTUS": std_unit = "MMBTU"
    elif current_rate_unit == "KWH": std_unit = "KWH" 
    elif current_rate_unit == "THERMS": std_unit = "THERM"
    current_rate_unit = std_unit

    try:
        rate_value_float = float(rate_value_str)
        rate_display_unit = f"{current_currency}/{current_rate_unit}"

        if current_currency == '¢':
            rate_value_float /= 100.0
            rate_display_unit = f"$/{current_rate_unit}"
        
        rate_entry = {
            "rate": f"{rate_value_float:.5f}" if current_currency == '¢' else str(rate_value_float),
            "unit_full_string": rate_display_unit,
            "base_unit": current_rate_unit,
            "label_matched": match.group("label") # Assumes "label" group exists in both direct and index patterns
        }
        if "description_text_for_index" in match.re.groupindex and match.group("description_text_for_index"):
            rate_entry["index_description"] = match.group("description_text_for_index").strip()
        if "math_operator" in match.re.groupindex and match.group("math_operator"):
             rate_entry["index_operator"] = match.group("math_operator")
        
        if tier_name: rate_entry["tier"] = tier_name
        
        # Check for duplicates before adding
        is_duplicate_entry = False
        for existing_entry in rates_list_ref:
            if existing_entry.get("tier") == rate_entry.get("tier") and \
               existing_entry.get("base_unit") == rate_entry.get("base_unit") and \
               abs(float(existing_entry.get("rate", 0)) - float(rate_entry.get("rate", -1))) < 0.00001:
                is_duplicate_entry = True
                break
        
        if not is_duplicate_entry:
            rates_list_ref.append(rate_entry)
            print(f"    Found Rate ({match_type_debug}): {(tier_name + ': ') if tier_name else ''}{rate_entry['rate']} {rate_entry['unit_full_string']}")
            if not primary_unit_ref_list[0]: 
                primary_unit_ref_list[0] = current_rate_unit
            return True 
    except ValueError:
        notes.append(f"Could not parse rate value '{rate_value_str}' for unit '{current_rate_unit}'.")
        print(f"    Warning: Could not parse rate value '{rate_value_str}' from: {match.group(0).strip()}")
    except Exception as e:
        print(f"    Error processing rate ({match_type_debug}): {e} from: {match.group(0).strip()}")
    return False

In [149]:
def extract_rates_and_unit(sections, tables, contract_data_dict, markdown_content_for_fallback=""):
    notes = contract_data_dict.get("_notes", [])
    rates = []
    primary_unit_mutable = [None] 

    search_sources_for_rates = [] 
    sections_to_search_map = {}
    ordered_section_keywords = ["contract price", "purchase price", "pricing", "rate"] 

    for keyword in ordered_section_keywords:
        for section_name, section_content in sections.items():
            if keyword in section_name.lower():
                if section_name not in sections_to_search_map:
                    sections_to_search_map[section_name] = section_content
                    print(f"  Rates: Added priority section '{section_name}' for rates.")
    
    if not sections_to_search_map or not any(kw in name.lower() for name in sections_to_search_map for kw in ["contract price", "purchase price"]):
        print("  Rates: Broadening search or no primary rate sections found by name. Checking section content.")
        for section_name, section_content in sections.items():
            if section_name not in sections_to_search_map: 
                if any(key_term in section_content.lower() for key_term in ["price is", "price:", " rate:", "fixed at", "$/", "¢/", " per "]):
                    if section_name not in sections_to_search_map:
                        sections_to_search_map[section_name] = section_content
                        print(f"  Rates: Adding section '{section_name}' based on content clues.")
    
    if sections_to_search_map:
        for name, content in sections_to_search_map.items():
            search_sources_for_rates.append((name, content))
    elif sections: 
        for name, content in sections.items():
            search_sources_for_rates.append((name, content))
    elif markdown_content_for_fallback:
         search_sources_for_rates.append(("full_document_fallback", markdown_content_for_fallback))
    else: 
         print("  Rates: No section content to search.")


    rate_pattern_direct = re.compile(
        r"^(?:[\s\-*]*)"
        r"(?P<tier_prefix>(?:\*{0,2}Tier\s+\w+\*{0,2}\s*:\s*)?)" 
        r"\*{0,2}(?P<label>Fixed Price|Contract price)\*{0,2}" 
        r"\s*(?:\((?P<currency_in_label>[\$¢])?\s*/\s*(?P<unit_in_label>\w+)\))?" 
        r"\*{0,2}\s*[:\-]\s*" 
        r"(?P<currency_before_value>[\$¢])?" 
        r"\s*(?P<value>[\d.,]+)" 
        r"(?:\s*(?:per|/)\s*(?P<unit_after_value>KWh|MMBTU|Dth|Therm)\b)?"
        r"(?:\s*\((?P<unit_val_in_paren>\w+)\)\s*)?", # For (Dth)
        re.IGNORECASE | re.MULTILINE
    )

    rate_pattern_index = re.compile(
        r"^(?:[\s\-*]*)"
        r"(?P<tier_prefix>(?:\*{0,2}Tier\s+\w+\*{0,2}\s*:\s*)?)" 
        r"\*{0,2}(?P<label>Index Price)\*{0,2}" 
        r"\s*(?:\((?P<currency_in_label>[\$¢])?\s*/\s*(?P<unit_in_label>\w+)\))?" 
        r"\*{0,2}\s*[:\-]\s*" 
        r"(?P<description_text_for_index>.+?)" 
        r"(?P<math_operator>[+\-])\s*"
        r"(?P<currency_before_value>[\$¢])?" 
        r"\s*(?P<value>[\d.,]+)" 
        r"(?:\s*(?:per|/)\s*(?P<unit_after_value>KWh|MMBTU|Dth|Therm)\b)?"
        r"(?:\s*\((?P<unit_val_in_paren>\w+)\)\s*)?",
        re.IGNORECASE | re.MULTILINE
    )
    
    cents_rate_pattern = re.compile(
        r"(?P<value>[\d.,]+)\s+(?:¢|cents)\s*(?:per|/)\s*(?P<unit>KWh|MMBTU|Dth|Therm)\b",
        re.IGNORECASE | re.MULTILINE
    )
    
    unit_context_pattern = re.compile(
        r"(?:Volume|Quantity)\s+(?:per\s+Month\s+)?in\s+(?P<unit>Dths|MMBTUs|kWh|Therms)\b",
        re.IGNORECASE
    )

    # Use a set to track fully processed lines to avoid re-matching by different patterns if one succeeds
    globally_processed_rate_lines = set() 

    for section_name_for_debug, text_block_original in search_sources_for_rates:
        text_block = text_block_original.strip()
        contract_data_dict['_current_section_name_debug'] = section_name_for_debug

        if "Example 1 (Contract)" in contract_data_dict.get("contract_id", "") and \
           "Contract Price" in section_name_for_debug: # Specific section for Ex1
            print(f"\n[EX1 DEBUG] Iterating section '{section_name_for_debug}' for rates.")
            print(f"  Content (first 300 chars): '{text_block[:300]}...'")
            print(f"  REPR: {repr(text_block[:300])}")


        # Try Direct Pattern
        for match in rate_pattern_direct.finditer(text_block):
            line_matched_text = match.group(0).strip()
            if line_matched_text in globally_processed_rate_lines: continue
            if _process_rate_match(match, "direct", contract_data_dict, rates, primary_unit_mutable):
                globally_processed_rate_lines.add(line_matched_text)
        
        # Try Index Pattern
        for match in rate_pattern_index.finditer(text_block):
            line_matched_text = match.group(0).strip()
            if line_matched_text in globally_processed_rate_lines: continue
            if _process_rate_match(match, "index", contract_data_dict, rates, primary_unit_mutable):
                globally_processed_rate_lines.add(line_matched_text)

        # Try Cents Pattern
        for match in cents_rate_pattern.finditer(text_block):
            line_matched_text = match.group(0).strip()
            if line_matched_text in globally_processed_rate_lines: continue
            
            rate_value_str_cents = match.group("value").replace(',', '').strip()
            unit_cents = match.group("unit").upper()
            std_unit_cents = unit_cents
            if unit_cents == "DTHS": std_unit_cents = "DTH" # etc.

            try:
                rate_value_cents_float = float(rate_value_str_cents)
                rate_value_dollars = rate_value_cents_float / 100.0
                rate_entry = {
                    "rate": f"{rate_value_dollars:.5f}",
                    "unit_full_string": f"$/{std_unit_cents}",
                    "base_unit": std_unit_cents
                }
                # More robust duplicate check before adding
                is_duplicate_entry = False
                for existing_entry in rates:
                    if existing_entry.get("base_unit") == rate_entry.get("base_unit") and \
                       abs(float(existing_entry.get("rate", 0)) - float(rate_entry.get("rate", -1))) < 0.00001:
                        is_duplicate_entry = True; break
                
                if not is_duplicate_entry:
                    rates.append(rate_entry)
                    print(f"    Found Cents Rate: {rate_value_cents_float} cents/{std_unit_cents} -> {rate_entry['rate']} {rate_entry['unit_full_string']}")
                    if not primary_unit_mutable[0]: primary_unit_mutable[0] = std_unit_cents
                    globally_processed_rate_lines.add(line_matched_text) 
            except ValueError:
                notes.append(f"Could not parse cents rate '{rate_value_str_cents}'.")
                print(f"    Warning: Could not parse cents rate '{rate_value_str_cents}' from: {match_text}")


    primary_unit = primary_unit_mutable[0] # Get value from list
    # --- Infer Primary Unit of Measurement if not set by rates ---
    if not primary_unit:
        all_text_for_unit_context = " ".join(sections.values()) if sections else markdown_content_for_fallback
        context_match = unit_context_pattern.search(all_text_for_unit_context)
        if context_match:
            unit_from_context = context_match.group("unit").upper()
            if unit_from_context in ["DTHS", "DTH"]: primary_unit = "DTH"
            elif unit_from_context == "MMBTUS": primary_unit = "MMBTU"
            elif unit_from_context == "KWH": primary_unit = "KWH"
            elif unit_from_context == "THERMS": primary_unit = "THERM"
            if primary_unit:
                notes.append(f"Primary UoM '{primary_unit}' inferred from document context.")
                print(f"    Inferred Primary UoM from context: {primary_unit}")

    if not primary_unit:
        all_text_content = (" ".join(sections.values()) if sections else markdown_content_for_fallback).upper()
        if "ELECTRIC" in all_text_content: primary_unit = "KWH"; notes.append("UoM 'KWH' from 'ELECTRIC' keyword.")
        elif "GAS" in all_text_content: primary_unit = "MMBTU"; notes.append("UoM 'MMBTU' from 'GAS' keyword.")
        if primary_unit: print(f"    Inferred Primary UoM by keyword: {primary_unit}")
    
    if not rates:
        notes.append("No contracted rate information found.")
        print("    Warning: No contracted rates found.")
    
    contract_data_dict['contracted_rates'] = rates
    contract_data_dict['unit_of_measurement'] = primary_unit
    contract_data_dict['_notes'] = notes

In [151]:
if __name__ == "__main__":
    all_extracted_data = []
    file_count = 0

    for md_file_path in find_contract_files(BASE_OUTPUT_DIR):
        print(f"\nProcessing: {md_file_path.parent.name}/{md_file_path.name}")
        file_count += 1
        
        markdown_content = read_markdown_file(md_file_path)
        if not markdown_content:
            continue

        contract_id = md_file_path.parent.name
        current_contract_data = {
            "contract_id": contract_id,
            "_source_file": str(md_file_path.relative_to(BASE_OUTPUT_DIR)),
            "_notes": [],
            "contract_duration": {"start_date": None, "end_date": None},
            "monthly_forecasted_usage": {}, # Will be dict of YYYY-MM: value
            "contracted_rates": [],
            "unit_of_measurement": None,
            "rate_class": None, 
            "service_addresses": [],
            "account_numbers": [],
            "meter_numbers": []
        }

        parsed_structure = parse_markdown_to_structured_data(markdown_content)
        sections = parsed_structure.get("sections", {})
        tables = parsed_structure.get("tables", [])
        
        print(f"  Found {len(sections)} refined sections: {list(sections.keys())}")
        if tables:
            print(f"  Found {len(tables)} tables (via HTML parsing).")
            for idx, table_df in enumerate(tables):
                print(f"    Table {idx} columns: {table_df.columns.tolist()}")
        else:
            print("  No tables found by HTML parsing.")

        # --- Call Extraction Functions ---
        extract_contract_duration(sections, tables, current_contract_data)
        extract_rates_and_unit(sections, tables, current_contract_data)
        
        print(f"  Duration for {contract_id}: {current_contract_data['contract_duration']}")
        print(f"  Rates for {contract_id}: {current_contract_data['contracted_rates']}")
        print(f"  Unit for {contract_id}: {current_contract_data['unit_of_measurement']}")

        all_extracted_data.append(current_contract_data)

    print(f"\n--- Processed {file_count} contract files. ---")

    if all_extracted_data:
        output_summary_file = JSON_OUTPUT_DIR / "hybrid_summary.json"
        try:
            with open(output_summary_file, 'w', encoding='utf-8') as f:
                json.dump(all_extracted_data, f, indent=2, ensure_ascii=False)
            print(f"Successfully saved summary to {output_summary_file}")
        except Exception as e:
            print(f"Error saving summary file: {e}")
    else:
        print("No data extracted to save.")

Searching for contract folders in: /Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs

Processing: Example 2 (Contract)/Example 2 (Contract).md
  Successfully read 30822 characters from Example 2 (Contract).md.
  Found 13 refined sections: ['CUSTOMER INFORMATION', 'CONTRACT INFORMATION', 'ACKNOWLEDGMENT AND CONSENT', 'FACILITIES/ACCOUNTS', 'SECTION 1. TERMS OF SERVICE', 'SECTION 2. GENERAL TERMS AND CONDITIONS', 'SECTION 3. DEFINITIONS', 'Service Options', 'Customer Information and Authorization', 'Requestor & Billing Information', 'Additional Locations', 'Supplier/Consultant Information (please print):', 'Operating Company Information (please print):']
  Found 8 tables (via HTML parsing).
    Table 0 columns: ['Field', 'Details']
    Table 1 columns: ['Signature', 'Customer', 'ENGIE']
    Table 2 columns: ['NO.', 'FACILITY NAME/ SERVICE ADDRESS', 'CITY, STATE, ZIP', 'UTILITY', 'DELIVERY POINT', 'ACCOUNT NUMBER']
    Table 3 columns: ['1.BA 51823187091', 'SA 211953002'

In [125]:
# debugging for ex1

import re
line = "- **Tier One:** Fixed Price: $4.40 per Dth"
pattern_segment = re.compile(
    r"(?P<description_before_math>.*?)"              # F
    r"(?P<math_operator>[+\-])?\s*"                  # G
    r"(?P<currency_before_value>[\$¢])?"             # H
    r"\s*(?P<value>[\d.,]+)"                         # I
    r"(?:\s*(?:per|/)\s*(?P<unit_after_value>KWh|MMBTU|Dth|Therm)\b)?", # J
    re.IGNORECASE
)
remaining_text = " $4.40 per Dth" 
m = pattern_segment.match(remaining_text) 
if m:
    print(f"Desc: '{m.group('description_before_math')}'")
    print(f"Operator: '{m.group('math_operator')}'")
    print(f"Currency: '{m.group('currency_before_value')}'")
    print(f"Value: '{m.group('value')}'")
    print(f"Unit: '{m.group('unit_after_value')}'")
else:
    print("Segment did not match")

# For the Index Price line:
remaining_text_index = " Gas Market Report - Colorado Interstate Gas + $.99 per Dth"
m_index = pattern_segment.match(remaining_text_index)
if m_index:
    print("\nIndex Line:")
    print(f"Desc: '{m_index.group('description_before_math')}'")
    print(f"Operator: '{m_index.group('math_operator')}'")
    print(f"Currency: '{m_index.group('currency_before_value')}'")
    print(f"Value: '{m_index.group('value')}'")
    print(f"Unit: '{m_index.group('unit_after_value')}'")
else:
    print("Index segment did not match")

Desc: ''
Operator: 'None'
Currency: '$'
Value: '4.40'
Unit: 'Dth'

Index Line:
Desc: ' Gas Market Report - Colorado Interstate Gas '
Operator: '+'
Currency: '$'
Value: '.99'
Unit: 'Dth'
