## Imports and Utility Functions

In [260]:
import json
import os
import re
from pathlib import Path

import pandas as pd # For potential table handling later
from markdown_it import MarkdownIt # For Markdown parsing

# --- Configuration ---
# Root folder holding the per-contract "... (Contract)" subfolders. It can be
# overridden with the CONTRACT_OUTPUT_DIR environment variable so the notebook
# also runs on machines other than the original author's; when the variable is
# unset the original location is used unchanged.
BASE_OUTPUT_DIR = Path(
    os.environ.get(
        "CONTRACT_OUTPUT_DIR",
        "/Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs",
    )
)
JSON_OUTPUT_SUBDIR = "hybrid_json_outputs"
JSON_OUTPUT_DIR = BASE_OUTPUT_DIR / JSON_OUTPUT_SUBDIR
# Create the JSON output folder up front (no error if it already exists).
JSON_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Utility Functions ---

def find_contract_files(base_dir):
    """
    Yield one Markdown contract file per '...(Contract)' subdirectory of base_dir.

    Args:
        base_dir: Directory to scan (str or Path). Only its direct children
            whose name contains '(Contract)' are considered.

    Yields:
        Path to the chosen .md file of each contract folder. Folders are
        visited in sorted order, and when a folder holds several .md files
        the alphabetically first one is used, so the selection is the same
        on every platform and run.

    Raises:
        OSError: if base_dir cannot be listed (e.g. it does not exist).
    """
    base_dir = Path(base_dir)
    print(f"Searching for contract folders in: {base_dir.resolve()}")
    found_any = False
    # iterdir()/glob() return entries in arbitrary order, so sort both levels.
    for contract_folder in sorted(base_dir.iterdir()):
        if not (contract_folder.is_dir() and '(Contract)' in contract_folder.name):
            continue
        md_files = sorted(contract_folder.glob('*.md'))
        if not md_files:
            print(f"  Warning: No .md file found in {contract_folder.name}")
            continue
        if len(md_files) > 1:
            print(f"  Warning: Multiple .md files in {contract_folder.name}. Using first: {md_files[0].name}")
        yield md_files[0]
        found_any = True
    if not found_any:
        print("  No contract folders with .md files found.")

def read_markdown_file(md_filepath):
    """
    Read a Markdown file as UTF-8 text.

    Args:
        md_filepath: Location of the file, as a str or Path.

    Returns:
        The file content as a string, or None when the file cannot be read
        or decoded (the error is printed rather than raised).
    """
    # Coerce once so that `.name` below also works for plain string paths.
    md_path = Path(md_filepath)
    try:
        content = md_path.read_text(encoding='utf-8')
    except (OSError, UnicodeError, ValueError) as e:
        print(f"  Error reading file {md_path.name}: {e}")
        return None
    print(f"  Successfully read {len(content)} characters from {md_path.name}.")
    return content

In [262]:
from dateutil.parser import parse as parse_date_flexible
from dateutil.relativedelta import relativedelta

def normalize_date_robust(date_str):
    """
    Parse a free-form date string and return it as 'YYYY-MM-DD'.

    Legal-style phrasing such as "15th day of March, 2024" is first rewritten
    to "15 March, 2024". The day number is kept: dropping it would make the
    parser silently substitute a default day for the missing field.

    Args:
        date_str: Text expected to contain a single date.

    Returns:
        The normalized date string, or None when the input is empty, not a
        string, or cannot be parsed.
    """
    if not date_str or not isinstance(date_str, str):
        return None
    try:
        # Strip only the "<ordinal suffix> day of" filler, preserving the day digits.
        date_str_cleaned = re.sub(
            r"(\d+)(?:st|nd|rd|th)\s+day\s+of\s+", r"\1 ", date_str, flags=re.IGNORECASE
        )
        dt = parse_date_flexible(date_str_cleaned)
        return dt.strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError) as e:
        print(f"    Debug: normalize_date_robust failed for '{date_str}': {e}")
        return None

def calculate_end_date_from_term(start_date_str, months_str):
    """
    Derive the last day of a term that begins on start_date_str and runs
    for months_str months.

    Returns the end date as 'YYYY-MM-DD' (start + term - 1 day), or None when
    either argument is empty or cannot be interpreted.
    """
    if not (start_date_str and months_str):
        return None
    try:
        term_start = parse_date_flexible(start_date_str)
        term_length = relativedelta(months=int(months_str))
        # A term starting on the 1st ends on the last day of its final month.
        term_end = term_start + term_length - relativedelta(days=1)
        return term_end.strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        return None

# --- New Extraction Function ---
def _first_normalized_date(patterns, text_block):
    """Return the first date captured by any pattern in text_block that normalizes to YYYY-MM-DD, else None."""
    for pattern in patterns:
        match = pattern.search(text_block)
        if match:
            normalized = normalize_date_robust(match.group(1))  # group 1 is the date capture
            if normalized:
                return normalized
    return None


def extract_contract_duration(sections, tables, contract_data_dict):
    """
    Extract the contract start and end dates from section text.

    Args:
        sections: Mapping of section title -> section text.
        tables: Unused here; kept for a uniform extractor signature.
        contract_data_dict: Result dict, updated in place.

    Side effects:
        Sets contract_data_dict['contract_duration']['start_date'] and
        ['end_date'] (each 'YYYY-MM-DD' or None), creating the
        'contract_duration' dict when it is missing or not a dict, and stores
        the (possibly extended) notes list under contract_data_dict['_notes'].
        Progress is printed.
    """
    notes = contract_data_dict.get("_notes", [])
    start_date = None
    end_date = None
    term_months_val = None

    # --- Define Search Space ---
    # One section per category, in priority order: delivery period, term
    # (excluding "termination" headings), contract information.
    prioritized_key_groups = [
        [k for k in sections if "delivery period" in k.lower()],
        [k for k in sections if "term" in k.lower() and "termination" not in k.lower()],
        [k for k in sections if "contract information" in k.lower()],
    ]
    search_texts = []
    for matching_keys in prioritized_key_groups:
        if matching_keys:
            search_texts.append(sections[matching_keys[0]])
            print(f"  Duration: Searching in '{matching_keys[0]}' section.")

    # Fallback: only when none of the prioritized sections exist is every
    # section searched.
    if not search_texts:
        print("  Duration: No specific duration sections found, searching all section content.")
        search_texts.extend(sections.values())

    # Matches "Month Day, Year"-style text as well as numeric MM/DD/YYYY or MM-DD-YY.
    date_val_pattern = r"([\w\s,]+\d{1,2}[,\s]+(?:19|20)\d{2}|\d{1,2}[-/]\d{1,2}[-/](?:19|20)?\d{2})"

    start_patterns = [
        re.compile(rf"Begin\s*[:*]*\s*{date_val_pattern}", re.IGNORECASE),
        re.compile(rf"Start Date\s*[:*]*\s*{date_val_pattern}", re.IGNORECASE),
        re.compile(rf"Effective\s*(?:the date of this contract or date of first gas deliveries available thereafter)?\s*(?:for|on)?\s*{date_val_pattern}", re.IGNORECASE), # Example 1 "effective the date..."
    ]
    end_patterns = [
        re.compile(rf"End\s*[:*]*\s*{date_val_pattern}", re.IGNORECASE),
        re.compile(rf"End Date\s*[:*]*\s*{date_val_pattern}", re.IGNORECASE),
    ]
    term_patterns = [
        re.compile(r"for\s+(\d+)\s+months", re.IGNORECASE), # Example 1 "for 36 months"
        re.compile(r"Term.*?for\s+(\d+)\s+months", re.IGNORECASE | re.DOTALL),
    ]
    executed_date_pattern = re.compile(rf"Executed this\s+{date_val_pattern}", re.IGNORECASE) # Example 1

    # --- Extraction Logic ---
    for text_block in search_texts:
        if not start_date:
            start_date = _first_normalized_date(start_patterns, text_block)
            if start_date: print(f"    Start Date found: {start_date}")

        if not end_date:
            end_date = _first_normalized_date(end_patterns, text_block)
            if end_date: print(f"    End Date found: {end_date}")

        # The term is collected even when an explicit end date exists; it is
        # only used later if no end date was found.
        if not term_months_val:
            for pattern in term_patterns:
                match = pattern.search(text_block)
                if match:
                    term_months_val = match.group(1)
                    break
            if term_months_val: print(f"    Term (months) found: {term_months_val}")

        if start_date and end_date: # Stop if both found
            break

    # Last resort for the start date: the "Executed this <date>" signature line.
    if not start_date:
        for text_block in search_texts:
            start_date = _first_normalized_date([executed_date_pattern], text_block)
            if start_date:
                print(f"    Start Date (from Executed this): {start_date}")
                break

    # Derive the end date from start + term when it was not stated explicitly.
    if start_date and term_months_val and not end_date:
        end_date = calculate_end_date_from_term(start_date, term_months_val)
        if end_date:
            print(f"    End Date (calculated from term): {end_date}")
            notes.append(f"End date calculated from {term_months_val}-month term.")
        else:
            notes.append(f"Found start date and term ({term_months_val} months), but failed to calculate end date.")

    # Update contract_data_dict; tolerate a missing or non-dict
    # 'contract_duration' entry instead of raising.
    duration = contract_data_dict.get('contract_duration')
    if not isinstance(duration, dict):
        duration = {}
        contract_data_dict['contract_duration'] = duration
    duration['start_date'] = start_date
    duration['end_date'] = end_date

    if not start_date and not end_date and not term_months_val:
        notes.append("Contract duration (start, end, or term) not found.")
        print("    Contract duration details not found.")
    elif not start_date:
        notes.append("Contract start date not found.")
    elif not end_date:
        notes.append("Contract end date (or term to calculate it) not found.")

    contract_data_dict['_notes'] = notes

In [284]:
import re
import calendar 
from collections import defaultdict

# This is the _process_rate_match that corresponds to the single rate_pattern_general
# which previously worked for Ex2 and Ex4.
def _process_single_general_match(match, contract_data_dict, rates_list_ref, primary_unit_ref_list):
    notes = contract_data_dict.get("_notes", [])
    section_name_for_debug = contract_data_dict.get("_current_section_name_debug", "Unknown Section")

    # --- ENABLE THIS DEBUG BLOCK ---
    is_target_debug_line = False
    contract_id = contract_data_dict.get("contract_id", "")
    line_text_for_debug = match.group(0).strip()

    if "Example 1 (Contract)" in contract_id and ("Tier One" in line_text_for_debug or "Tier Two" in line_text_for_debug):
        is_target_debug_line = True
    elif "Example 2 (Contract)" in contract_id and "Contract price ($/kwh):** 0.09623" in line_text_for_debug :
         is_target_debug_line = True


    if is_target_debug_line: 
        print(f"\nDEBUG HELPER MATCH (Target Contract/Section: '{contract_id}'/'{section_name_for_debug}'):")
        print(f"  Line Matched by Regex: '{line_text_for_debug}'")
        print(f"  Iterating over groups for this match:")
        for group_name_val_key in match.re.groupindex.keys():
            try:
                group_content = match.group(group_name_val_key)
                print(f"    Group ({group_name_val_key}): '{group_content}'")
            except IndexError: 
                print(f"    Group ({group_name_val_key}): DID NOT PARTICIPATE") 
        print("--- End of Groups for this Match ---")
    # --- END DEBUG ---

    rate_value_str = match.group("value").replace(',', '').strip()
    if not rate_value_str: 
        return False 

    tier_name = None
    if match.group("tier_prefix"): # No need for "in match.re.groupindex" if group is always defined
        tier_match_extract = re.search(r"Tier\s+\w+", match.group("tier_prefix"), re.IGNORECASE)
        if tier_match_extract:
            tier_name = tier_match_extract.group(0).strip()

    actual_label = None
    if match.group("label"): # "label" is the named group for (Fixed Price|Contract price|Index Price)
        label_from_group = match.group("label") # This group itself doesn't have the stars
        actual_label = label_from_group.strip() # The surrounding \*{0,2} handle stars

    current_rate_unit = None
    current_currency = None

    if match.group("unit_in_label"):
        current_rate_unit = match.group("unit_in_label").upper()
        if match.group("currency_in_label"):
            current_currency = match.group("currency_in_label")
    elif match.group("unit_after_value"):
        current_rate_unit = match.group("unit_after_value").upper()
    elif match.group("unit_val_in_paren"): 
        current_rate_unit = match.group("unit_val_in_paren").upper()

    if not current_currency and match.group("currency_before_value"):
        current_currency = match.group("currency_before_value")
    
    if not current_currency: current_currency = '$' 

    if not current_rate_unit:
        # This should only print if a value was found but absolutely no unit could be derived from any group
        print(f"    Rate value '{rate_value_str}' found, but unit not parsed. Line: '{match.group(0).strip()}'. Desc: '{match.group('description_before_math')}' Skipping.")
        notes.append(f"Rate value '{rate_value_str}' (Sec: '{section_name_for_debug}') no unit; skipped.")
        return False

    std_unit = current_rate_unit
    if current_rate_unit == "DTHS": std_unit = "DTH"
    elif current_rate_unit == "MMBTUS": std_unit = "MMBTU"
    elif current_rate_unit == "KWH": std_unit = "KWH" 
    elif current_rate_unit == "THERMS": std_unit = "THERM"
    current_rate_unit = std_unit

    try:
        rate_value_float = float(rate_value_str)
        rate_display_unit = f"{current_currency}/{current_rate_unit}"

        if current_currency == '¢':
            rate_value_float /= 100.0
            rate_display_unit = f"$/{current_rate_unit}"
        
        rate_entry = {
            "rate": f"{rate_value_float:.5f}" if current_currency == '¢' else str(rate_value_float),
            "unit_full_string": rate_display_unit,
            "base_unit": current_rate_unit,
            "label_matched": actual_label 
        }
        desc_text = match.group("description_before_math")
        math_op = match.group("math_operator")
        if desc_text and desc_text.strip() and desc_text.strip() != "**": # Avoid storing just "**"
            rate_entry["index_description"] = desc_text.strip()
        if math_op: 
            rate_entry["index_operator"] = math_op
        
        if tier_name: rate_entry["tier"] = tier_name
        
        is_duplicate_entry = any(
            ex_rate.get("tier") == rate_entry.get("tier") and \
            ex_rate.get("base_unit") == rate_entry.get("base_unit") and \
            abs(float(ex_rate.get("rate", 0)) - float(rate_entry.get("rate", -1))) < 0.00001
            for ex_rate in rates_list_ref
        )
        
        if not is_duplicate_entry:
            rates_list_ref.append(rate_entry)
            print(f"    Found Rate (single_general_pattern): {(tier_name + ': ') if tier_name else ''}{rate_entry['rate']} {rate_entry['unit_full_string']}")
            if not primary_unit_ref_list[0]: 
                primary_unit_ref_list[0] = current_rate_unit
            return True 
    except ValueError:
        notes.append(f"Could not parse rate value '{rate_value_str}' for unit '{current_rate_unit}'.")
        print(f"    Warning: Could not parse rate value '{rate_value_str}' from: {match.group(0).strip()}")
    except Exception as e:
        print(f"    Error processing rate (single_general_pattern): {e} from: {match.group(0).strip()}")
    return False


def extract_rates_and_unit(sections, tables, contract_data_dict, markdown_content_for_fallback=""):
    """
    Extract contracted rates and the primary unit of measurement.

    Args:
        sections: Mapping of section title -> section text.
        tables: Unused here; kept for a uniform extractor signature.
        contract_data_dict: Result dict, updated in place.
        markdown_content_for_fallback: Full document text, searched only when
            there are no sections at all.

    Side effects:
        Sets contract_data_dict['contracted_rates'] (list of rate dicts),
        ['unit_of_measurement'] (str or None) and ['_notes']. The notes list
        is anchored in the dict before any line is processed, so notes added
        by the per-match helper are kept. The transient
        '_current_section_name_debug' key used while scanning is removed
        before returning. Progress is printed.
    """
    notes = contract_data_dict.setdefault("_notes", [])
    rates = []
    primary_unit_mutable = [None]  # one-element list so the match helper can set it

    # --- Search space: sections chosen by title keyword, in priority order ---
    sections_to_search_map = {}
    for keyword in ("contract price", "purchase price", "pricing", "rate"):
        for section_name, section_content in sections.items():
            if keyword in section_name.lower() and section_name not in sections_to_search_map:
                sections_to_search_map[section_name] = section_content

    # Without an explicit price section, also include any section whose text
    # looks like it states a price.
    has_price_section = any(
        kw in name.lower()
        for name in sections_to_search_map
        for kw in ("contract price", "purchase price")
    )
    if not has_price_section:
        content_hints = ["price is", "price:", " rate:", "fixed at", "$/", "¢/", " per "]
        for section_name, section_content in sections.items():
            if section_name in sections_to_search_map:
                continue
            lowered_content = section_content.lower()
            if any(hint in lowered_content for hint in content_hints):
                sections_to_search_map[section_name] = section_content

    search_sources_for_rates = []
    if sections_to_search_map:
        for name, content in sections_to_search_map.items():
            search_sources_for_rates.append((name, content))
            print(f"  Rates: Using section '{name}' for rates search.")
    elif sections:
        print("  Rates: No specific sections by name/content, using all sections.")
        for name, content in sections.items():
            search_sources_for_rates.append((name, content))
    elif markdown_content_for_fallback:
        search_sources_for_rates.append(("full_document_fallback", markdown_content_for_fallback))
    else:
        print("  Rates: No section content to search.")

    # --- Regex patterns ---
    # The tier prefix accepts the colon on either side of the closing bold
    # markers, i.e. both "**Tier One**: ..." and "**Tier One:** ...".
    rate_pattern_general = re.compile(
        r"^(?:[\s\-*]*)"
        r"(?P<tier_prefix>(?:\*{0,2}Tier\s+\w+\s*(?:\*{0,2}\s*:|:\s*\*{0,2})\s*)?)"
        r"\*{0,2}(?P<label>Fixed Price|Contract price|Index Price)\*{0,2}"
        r"\s*(?:\((?P<currency_in_label>[\$¢])?\s*/\s*(?P<unit_in_label>\w+)\))?"
        r"\*{0,2}\s*[:\-]\s*"
        r"(?P<description_before_math>.*?)"
        r"(?P<math_operator>[+\-])?\s*"
        r"(?P<currency_before_value>[\$¢])?"
        r"\s*(?P<value>[\d.,]+)"
        r"(?:\s*(?:per|/)\s*(?P<unit_after_value>KWh|MMBTU|Dth|Therm)\b)?"
        r"(?:\s*\((?P<unit_val_in_paren>\w+)\)\s*)?",
        re.IGNORECASE
    )

    cents_rate_pattern = re.compile(
        r"^(?:[\s\-*]*)"
        r"(?P<value>[\d.,]+)\s+(?:¢|cents)\s*(?:per|/)\s*(?P<unit>KWh|MMBTU|Dth|Therm)\b",
        re.IGNORECASE
    )

    unit_context_pattern = re.compile(
        r"(?:Volume|Quantity)\s+(?:per\s+Month\s+)?in\s+(?P<unit>Dths|MMBTUs|kWh|Therms)\b",
        re.IGNORECASE
    )

    # Identical lines are only processed once across all sections.
    globally_processed_rate_lines = set()

    for section_name_for_debug, text_block_original in search_sources_for_rates:
        text_block_stripped = text_block_original.strip()
        contract_data_dict['_current_section_name_debug'] = section_name_for_debug

        # Line-by-line trace for the Example 1 "Contract Price" section only.
        trace_ex1 = (
            "Example 1 (Contract)" in contract_data_dict.get("contract_id", "")
            and bool(section_name_for_debug)
            and "Contract Price" in section_name_for_debug
        )
        if trace_ex1:
            print(f"\n[EX1 DEBUG] Processing Section '{section_name_for_debug}' line by line for rates.")

        for line_count, line in enumerate(text_block_stripped.splitlines()):
            line = line.strip()
            if not line or line in globally_processed_rate_lines:
                continue

            if trace_ex1:
                print(f"  [EX1 L{line_count+1}] Attempting match on: '{line}'")

            match_general = rate_pattern_general.search(line)
            if match_general:
                if _process_single_general_match(match_general, contract_data_dict, rates, primary_unit_mutable):
                    globally_processed_rate_lines.add(line)
                    continue

            # Fallback: bare "<value> cents per <unit>" lines, stored in dollars.
            match_cents = cents_rate_pattern.search(line)
            if not match_cents:
                continue
            rate_value_str_cents = match_cents.group("value").replace(',', '').strip()
            unit_cents = match_cents.group("unit").upper()
            std_unit_cents = "DTH" if unit_cents == "DTHS" else unit_cents

            try:
                rate_value_cents_float = float(rate_value_str_cents)
            except ValueError:
                notes.append(f"Could not parse cents rate '{rate_value_str_cents}'.")
                continue

            rate_value_dollars = rate_value_cents_float / 100.0
            rate_entry = {
                "rate": f"{rate_value_dollars:.5f}",
                "unit_full_string": f"$/{std_unit_cents}",
                "base_unit": std_unit_cents
            }
            is_duplicate = any(
                r_existing["base_unit"] == std_unit_cents
                and abs(float(r_existing["rate"]) - rate_value_dollars) < 0.00001
                for r_existing in rates
            )
            if not is_duplicate:
                rates.append(rate_entry)
                print(f"    Found Cents Rate: {rate_value_cents_float} cents/{std_unit_cents} -> {rate_entry['rate']} {rate_entry['unit_full_string']}")
                if not primary_unit_mutable[0]: primary_unit_mutable[0] = std_unit_cents
                globally_processed_rate_lines.add(line)

    primary_unit = primary_unit_mutable[0]

    # --- Infer the primary unit when no rate supplied one ---
    if search_sources_for_rates:
        all_text_for_unit_context = " ".join(source[1] for source in search_sources_for_rates)
    else:
        all_text_for_unit_context = markdown_content_for_fallback

    if not primary_unit:
        # First choice: an explicit "Volume/Quantity ... in <unit>" phrase.
        context_match = unit_context_pattern.search(all_text_for_unit_context)
        if context_match:
            unit_from_context = context_match.group("unit").upper()
            context_unit_map = {"DTHS": "DTH", "DTH": "DTH", "MMBTUS": "MMBTU", "KWH": "KWH", "THERMS": "THERM"}
            primary_unit = context_unit_map.get(unit_from_context)
            if primary_unit:
                notes.append(f"Primary UoM '{primary_unit}' inferred from document context.")

    if not primary_unit:
        # Last resort: commodity keyword anywhere in the searched text.
        all_text_content = all_text_for_unit_context.upper()
        if "ELECTRIC" in all_text_content:
            primary_unit = "KWH"
            notes.append("UoM 'KWH' from 'ELECTRIC' keyword.")
        elif "GAS" in all_text_content:
            primary_unit = "MMBTU"
            notes.append("UoM 'MMBTU' from 'GAS' keyword.")

    if not rates:
        notes.append("No contracted rate information found.")
        print(f"  Rates: Warning: No contracted rates found for {contract_data_dict.get('contract_id')}.")

    # The section-name marker is only meaningful during the scan above; do not
    # leave it in the extracted data.
    contract_data_dict.pop('_current_section_name_debug', None)

    contract_data_dict['contracted_rates'] = rates
    contract_data_dict['unit_of_measurement'] = primary_unit
    contract_data_dict['_notes'] = notes

In [286]:
# debugging for ex1
# Scratch check of the tail of the general rate regex (description -> operator ->
# currency -> value -> unit) in isolation. It is applied with .match() to the text
# that follows the "<label>:" part of two Example 1 lines, to see how the named
# groups split that remainder.

import re
line = "- **Tier One:** Fixed Price: $4.40 per Dth"  # full source line, for reference only; not used below
pattern_segment = re.compile(
    r"(?P<description_before_math>.*?)"              # F: lazy free text before the operator/value
    r"(?P<math_operator>[+\-])?\s*"                  # G: optional '+' or '-' adder sign
    r"(?P<currency_before_value>[\$¢])?"             # H: optional currency symbol
    r"\s*(?P<value>[\d.,]+)"                         # I: numeric value (digits, '.' and ',')
    r"(?:\s*(?:per|/)\s*(?P<unit_after_value>KWh|MMBTU|Dth|Therm)\b)?", # J: optional "per <unit>" / "/<unit>"
    re.IGNORECASE
)
# Fixed-price line: only the part after "Fixed Price:" is fed to the segment.
remaining_text = " $4.40 per Dth" 
m = pattern_segment.match(remaining_text) 
if m:
    print(f"Desc: '{m.group('description_before_math')}'")
    print(f"Operator: '{m.group('math_operator')}'")
    print(f"Currency: '{m.group('currency_before_value')}'")
    print(f"Value: '{m.group('value')}'")
    print(f"Unit: '{m.group('unit_after_value')}'")
else:
    print("Segment did not match")

# For the Index Price line:
# here the description is non-empty and is followed by a '+' adder.
remaining_text_index = " Gas Market Report - Colorado Interstate Gas + $.99 per Dth"
m_index = pattern_segment.match(remaining_text_index)
if m_index:
    print("\nIndex Line:")
    print(f"Desc: '{m_index.group('description_before_math')}'")
    print(f"Operator: '{m_index.group('math_operator')}'")
    print(f"Currency: '{m_index.group('currency_before_value')}'")
    print(f"Value: '{m_index.group('value')}'")
    print(f"Unit: '{m_index.group('unit_after_value')}'")
else:
    print("Index segment did not match")

Desc: ''
Operator: 'None'
Currency: '$'
Value: '4.40'
Unit: 'Dth'

Index Line:
Desc: ' Gas Market Report - Colorado Interstate Gas '
Operator: '+'
Currency: '$'
Value: '.99'
Unit: 'Dth'


In [288]:
import calendar # Ensure this is at the top
from dateutil.parser import parse as parse_date_flexible # Already should be there
from dateutil.relativedelta import relativedelta # Already should be there

def generate_full_yyyymm_sequence(start_date_str, end_date_str):
    """
    List every calendar month touched by the contract as 'YYYY-MM' strings.

    Months are counted by (year, month) only, so the month containing the end
    date is always included regardless of the day-of-month of either date
    (e.g. 2024-01-15 .. 2024-03-10 yields Jan, Feb and Mar).

    Args:
        start_date_str: Contract start date, any format the date parser accepts.
        end_date_str: Contract end date, same.

    Returns:
        Chronologically ordered list of 'YYYY-MM' strings; an empty list when
        either date is missing, unparseable, or the end precedes the start month.
    """
    if not start_date_str or not end_date_str:
        return []

    try:
        start_dt = parse_date_flexible(start_date_str)
        end_dt = parse_date_flexible(end_date_str)
    except (ValueError, TypeError, OverflowError) as e:
        print(f"    Warning: Could not generate YYYY-MM sequence from {start_date_str} to {end_date_str}: {e}")
        return []

    yyyymm_list = []
    year, month = start_dt.year, start_dt.month
    last_year_month = (end_dt.year, end_dt.month)
    while (year, month) <= last_year_month:
        yyyymm_list.append(f"{year:04d}-{month:02d}")
        month += 1
        if month > 12:
            year, month = year + 1, 1
    return yyyymm_list

def map_month_name_to_int(month_name_str):
    """
    Map a month label to its month number (1-12).

    Resolution order:
      1. Month names and abbreviations, matched on the first three letters
         ("Jan", "January", "Sept 2024").
      2. Bare integers: 1-12 are returned as-is; any other bare number is
         rejected rather than being read as a day or year.
      3. Anything else is handed to the flexible date parser and the month of
         the parsed date is used.

    Args:
        month_name_str: Text from a month column.

    Returns:
        The month number, or None when the input is empty, not a string, or
        no month can be determined.
    """
    if not month_name_str or not isinstance(month_name_str, str):
        return None
    text = month_name_str.strip()
    if not text:
        return None

    abbr_to_number = {name.lower(): num for num, name in enumerate(calendar.month_abbr) if num > 0}
    number_from_name = abbr_to_number.get(text.lower()[:3])
    if number_from_name:
        return number_from_name

    if text.isdecimal():
        month_number = int(text)
        return month_number if 1 <= month_number <= 12 else None

    try:
        return parse_date_flexible(text).month
    except (ValueError, TypeError, OverflowError):
        return None


def _first_matching_column(normalized_to_original, candidates):
    """Return the original column label for the first candidate key present, else None."""
    for candidate in candidates:
        if candidate in normalized_to_original:
            return normalized_to_original[candidate]
    return None


def _parse_usage_quantity(usage_text):
    """Convert a cleaned quantity string to a non-negative number (int when integral), else None."""
    if usage_text.isdigit():
        return int(usage_text)  # exact path for plain digit strings
    try:
        number = float(usage_text)
    except ValueError:
        return None
    # Reject NaN, infinities and negative quantities.
    if number != number or number in (float("inf"), float("-inf")) or number < 0:
        return None
    return int(number) if number.is_integer() else number


def extract_monthly_usage(sections, tables, contract_data_dict):
    """
    Build the monthly forecasted usage for the contract duration.

    A table with a month column and a usage/volume column supplies a monthly
    pattern, which is applied to every 'YYYY-MM' of the contract by calendar
    month. If no table yields data, an "Annual Usage" figure found in the
    section text is spread evenly over the contract months.

    Args:
        sections: Mapping of section title -> section text (annual fallback).
        tables: Iterable of DataFrames parsed from the document.
        contract_data_dict: Result dict; reads
            ['contract_duration']['start_date'/'end_date'] and is updated in place.

    Side effects:
        Sets contract_data_dict['monthly_forecasted_usage'] (dict keyed by
        'YYYY-MM', sorted; empty when nothing was found) and ['_notes'].
        Progress is printed.
    """
    notes = contract_data_dict.get("_notes", [])
    forecast_usage = {} # YYYY-MM: value

    duration = contract_data_dict.get('contract_duration') or {}
    contract_start_date = duration.get('start_date')
    contract_end_date = duration.get('end_date')

    if not contract_start_date or not contract_end_date:
        notes.append("Cannot extract monthly usage without contract start/end dates for year assignment.")
        print("  Usage: Skipping monthly usage extraction - missing contract duration.")
        contract_data_dict['monthly_forecasted_usage'] = {} # Ensure it's an empty dict
        contract_data_dict['_notes'] = notes
        return

    found_usage_table = False
    raw_monthly_values = [] # List of (month_int, usage_value) tuples

    month_col_candidates = ['month', 'mon']
    usage_col_candidates = ['volume', 'quantity', 'monthlyquantity', 'usage', 'baseloadvolume']

    for table_idx, df in enumerate(tables):
        print(f"  Usage: Analyzing Table {table_idx} with columns: {df.columns.tolist()}") # Debug table columns

        # Normalized label (lowercase, no spaces/underscores/dots) -> original label.
        df_columns_normalized = {
            str(col).lower().replace(' ', '').replace('_', '').replace('.', ''): col
            for col in df.columns
        }
        month_col_orig = _first_matching_column(df_columns_normalized, month_col_candidates)
        usage_col_orig = _first_matching_column(df_columns_normalized, usage_col_candidates)
        if not (month_col_orig and usage_col_orig):
            continue

        print(f"    Usage: Found potential usage table {table_idx} (Cols: '{month_col_orig}', '{usage_col_orig}')")
        found_usage_table = True
        for _, row in df.iterrows():
            try:
                month_name = str(row[month_col_orig])
                usage_val_str = str(row[usage_col_orig]).replace(',', '').strip()

                month_int = map_month_name_to_int(month_name)
                usage_val = _parse_usage_quantity(usage_val_str)

                if month_int and usage_val is not None:
                    raw_monthly_values.append((month_int, usage_val))
                    print(f"      Raw usage added: Month {month_int}, Value {usage_val}") # Debug
                else:
                    print(f"      Skipping row: Month='{month_name}' (Int: {month_int}), Usage='{usage_val_str}'") # Debug
            except KeyError:
                print(f"      KeyError accessing row in table {table_idx}. Columns might be inconsistent.")
                continue # Skip row if keys don't exist (e.g. malformed table)
            except Exception as e:
                print(f"      Error processing row in table {table_idx}: {e}")
                continue

        if len(raw_monthly_values) >= 12: break # Optimization: if we have 12 months, assume it's the primary table

    if not found_usage_table:
        notes.append("No table found with recognizable month and usage columns.")
        print("    Usage: No monthly usage table identified.")

    if raw_monthly_values:
        # Sum values when the same month appears several times (e.g. several tables/locations).
        aggregated_raw_values = defaultdict(int)
        for m_int, val in raw_monthly_values:
            aggregated_raw_values[m_int] += val

        sorted_monthly_pattern = sorted(aggregated_raw_values.items())
        pattern_by_month = dict(sorted_monthly_pattern)
        num_pattern_months = len(sorted_monthly_pattern)
        print(f"    Usage: Using a {num_pattern_months}-month usage pattern from table(s).")

        all_contract_yyyymm = generate_full_yyyymm_sequence(contract_start_date, contract_end_date)
        if not all_contract_yyyymm:
            notes.append("Could not generate YYYY-MM sequence for usage assignment.")
            print("    Usage: Error generating YYYY-MM sequence for contract duration.")
        else:
            for i, yyyymm in enumerate(all_contract_yyyymm):
                current_month_int = int(yyyymm.split('-')[1])
                if current_month_int in pattern_by_month:
                    # Each contract month takes the table value of the same calendar month.
                    forecast_usage[yyyymm] = pattern_by_month[current_month_int]
                else:
                    # Month absent from the pattern (e.g. an 11-month table):
                    # fall back to cycling through the pattern by position.
                    forecast_usage[yyyymm] = sorted_monthly_pattern[i % num_pattern_months][1]

            notes.append(f"Applied {num_pattern_months}-month usage pattern over contract duration.")
            print(f"    Usage: Assigned usage for {len(forecast_usage)} months.")

    if not forecast_usage: # If no table data, check for annual total as fallback
        print("    Usage: No monthly table data. Checking for Annual Usage fallback.")
        annual_usage_val = None
        annual_pattern = re.compile(r"Annual\s*(?:Historical\s+)?Usage\s*\(?[A-Za-z\s]*\)?\s*[:\-]?\s*([\d,]+)", re.IGNORECASE)
        for section_content in sections.values():
            match = annual_pattern.search(section_content)
            if match:
                try:
                    annual_usage_val = int(match.group(1).replace(',', ''))
                    print(f"    Usage: Found Annual Usage: {annual_usage_val}")
                    break
                except ValueError:
                    # Captured text was only commas; keep looking in later sections.
                    pass

        if annual_usage_val is not None:
            monthly_avg = round(annual_usage_val / 12)
            all_contract_yyyymm = generate_full_yyyymm_sequence(contract_start_date, contract_end_date)
            if all_contract_yyyymm:
                for yyyymm in all_contract_yyyymm:
                    forecast_usage[yyyymm] = monthly_avg
                notes.append(f"Applied annual usage, distributed as {monthly_avg}/month.")
                print(f"    Usage: Distributed annual usage as {monthly_avg}/month for {len(forecast_usage)} months.")
            else:
                notes.append(f"Found annual usage {annual_usage_val} but could not distribute (no YYYY-MM sequence).")

    contract_data_dict['monthly_forecasted_usage'] = dict(sorted(forecast_usage.items())) # Sort by YYYY-MM
    contract_data_dict['_notes'] = notes

In [290]:
# (Ensure pandas as pd is imported at the top)

def _first_matching_column(normalized_columns, candidate_keys):
    """Return the original column label for the first candidate key present, else None."""
    for key in candidate_keys:
        if key in normalized_columns:
            return normalized_columns[key]
    return None


def extract_service_account_details(sections, tables, contract_data_dict, markdown_content_for_fallback=""):
    """
    Collects service addresses, account numbers, meter numbers and rate classes.

    Tables are scanned first: column headers are normalized (lower-cased, with
    spaces, '_', '.', '/' and '#' removed) and matched against known header keys.
    For each of addresses, account numbers and meter numbers that is still empty
    afterwards, a regex search over the section text (or over
    `markdown_content_for_fallback` when `sections` is empty) is used instead.

    Args:
        sections: dict mapping section name -> section text.
        tables: iterable of pandas DataFrames parsed from the document.
        contract_data_dict: dict updated in place. Existing values under
            'service_addresses', 'account_numbers', 'meter_numbers' and
            'rate_class' are kept and merged with new findings.
        markdown_content_for_fallback: full document text, searched only when
            `sections` is empty.

    Returns:
        None. Writes 'service_addresses', 'account_numbers', 'meter_numbers'
        (sorted lists), 'rate_class' (str, list or None) and '_notes' into
        `contract_data_dict`.
    """
    notes = contract_data_dict.get("_notes", [])

    # Seed from anything already stored so repeated calls accumulate instead of resetting.
    service_addresses = set(contract_data_dict.get('service_addresses') or [])
    account_numbers = set(contract_data_dict.get('account_numbers') or [])
    meter_numbers = set(contract_data_dict.get('meter_numbers') or [])

    # Rate class can be a string or list, handle carefully
    current_rate_class = contract_data_dict.get('rate_class')
    if isinstance(current_rate_class, str):
        rate_classes = {current_rate_class}
    elif isinstance(current_rate_class, list):
        rate_classes = set(current_rate_class)
    else:
        rate_classes = set()

    # Header keys are compared against the normalized column names built below.
    addr_keys = ['serviceaddress', 'facilitynameserviceaddress', 'facilityaddress', 'address']
    acc_keys = ['accountnumber', 'utilityaccountnumber', 'account']
    meter_keys = ['meter', 'meternumber', 'deliverypoint']
    rate_class_keys = ['rate', 'rateclass', 'ratecode']

    # --- Table-Based Extraction ---
    for table_idx, df in enumerate(tables or []):
        df_columns_normalized = {
            str(col).lower().replace(' ', '').replace('_', '').replace('.', '').replace('/', '').replace('#', ''): col
            for col in df.columns
        }

        addr_col_orig = _first_matching_column(df_columns_normalized, addr_keys)
        acc_col_orig = _first_matching_column(df_columns_normalized, acc_keys)
        meter_col_orig = _first_matching_column(df_columns_normalized, meter_keys)
        rate_class_col_orig = _first_matching_column(df_columns_normalized, rate_class_keys)

        if meter_col_orig is None:
            # Normalization only removes plain spaces, so a "Meter #" header padded
            # with other whitespace (tabs, newlines) is caught here instead.
            for orig_col_name in df.columns:
                if str(orig_col_name).strip().lower() == "meter #":
                    meter_col_orig = orig_col_name
                    break

        if not any(col is not None for col in (addr_col_orig, acc_col_orig, meter_col_orig, rate_class_col_orig)):
            continue

        print(f"    Details: Processing Table {table_idx} for account/service info (Cols: A:{addr_col_orig}, Acct:{acc_col_orig}, M:{meter_col_orig}, RC:{rate_class_col_orig}).")

        skipped_rows = 0
        last_error = None
        for _, row in df.iterrows():
            try:
                if addr_col_orig is not None and pd.notna(row[addr_col_orig]):
                    addr = str(row[addr_col_orig]).strip()
                    if addr and len(addr) > 5:
                        service_addresses.add(addr)

                if acc_col_orig is not None and pd.notna(row[acc_col_orig]):
                    acc_raw = str(row[acc_col_orig]).strip()
                    acc = re.sub(r'[\s\-]', '', acc_raw)  # Remove spaces and hyphens
                    if acc:
                        account_numbers.add(acc)

                if meter_col_orig is not None and pd.notna(row[meter_col_orig]):
                    mtr = str(row[meter_col_orig]).strip()
                    # Basic validation for meter numbers
                    if mtr and len(mtr) > 2 and any(c.isalnum() for c in mtr):
                        # Skip values that duplicate an already-found account number
                        # (sometimes the meter is listed as the account).
                        cleaned_mtr_for_comp = re.sub(r'[\s\-]', '', mtr)
                        if cleaned_mtr_for_comp not in account_numbers:
                            meter_numbers.add(mtr)

                if rate_class_col_orig is not None and pd.notna(row[rate_class_col_orig]):
                    rc = str(row[rate_class_col_orig]).strip()
                    # Reject long strings and anything that looks like a price or agreement text.
                    if rc and len(rc) < 25 and "price" not in rc.lower() and "$" not in rc and "¢" not in rc and "agreement" not in rc.lower():
                        rate_classes.add(rc)
            except Exception as exc:
                # Best effort: keep going, but record the failure instead of hiding it.
                skipped_rows += 1
                last_error = exc

        if skipped_rows:
            notes.append(
                f"Table {table_idx}: skipped {skipped_rows} row(s) while reading service/account details "
                f"({type(last_error).__name__}: {last_error})."
            )

    # --- Text-Based Fallback Extraction ---
    # Use the full markdown text only when there are no sections to search.
    full_text_search_space = ("\n".join(sections.values()) if sections else markdown_content_for_fallback) or ""

    if not service_addresses:
        addr_patterns = [
            re.compile(r"(?:Facility Address|Service Address|Physical Business Address)\s*[:\-]?\s*(.+)", re.IGNORECASE)
        ]
        for pattern in addr_patterns:
            for match in pattern.finditer(full_text_search_space):
                addr = match.group(1).split('\n')[0].strip()
                if addr and len(addr) > 5:
                    service_addresses.add(addr)
                    break
            if service_addresses:
                break

    if not account_numbers:
        # The optional "(...)" qualifier must be a complete parenthetical; an
        # unanchored "[^)]*" would swallow the separator and the value itself.
        # The captured value is restricted to a single line so it cannot run on
        # into the next label.
        acc_patterns = [
            re.compile(
                r"(?:Account Number\(s\)|Account No|Utility Account Number|ACCOUNT NUMBER)\.?"
                r"\s*(?:\([^)\n]*\))?(?:\s*[:\-]\s*|\s+)(\w[\w\- \t]*)",
                re.IGNORECASE,
            )
        ]
        for pattern in acc_patterns:
            for match in pattern.finditer(full_text_search_space):
                acc = re.sub(r'[\s\-]', '', match.group(1))
                # An account "number" without a single digit is a stray word, not a value.
                if acc and any(ch.isdigit() for ch in acc):
                    account_numbers.add(acc)
                    break
            if account_numbers:
                break

    if not meter_numbers:
        # As with account numbers, the value is captured from one line only.
        meter_patterns = [re.compile(r"Meter\s*#\s*[:\-]?\s*([\w\/][\w\- \t\/]*)", re.IGNORECASE)]
        for pattern in meter_patterns:
            for match in pattern.finditer(full_text_search_space):
                mtr = match.group(1).strip()
                if mtr and len(mtr) > 1 and mtr.lower() != "facility(ies)" and not mtr.lower().startswith("attached exhibit"):
                    # Check if it might be an account number already found
                    cleaned_mtr_for_comp = re.sub(r'[\s\-]', '', mtr)
                    if cleaned_mtr_for_comp not in account_numbers:
                        meter_numbers.add(mtr)
                        break
            if meter_numbers:
                break

    # Rate class from text is harder, tables are primary. Can add patterns if needed.

    contract_data_dict['service_addresses'] = sorted(service_addresses)
    contract_data_dict['account_numbers'] = sorted(account_numbers)
    contract_data_dict['meter_numbers'] = sorted(meter_numbers)

    # A single rate class is stored as a plain string, several as a list.
    rc_list = sorted(rate_classes)
    if len(rc_list) == 1:
        contract_data_dict['rate_class'] = rc_list[0]
    elif len(rc_list) > 1:
        contract_data_dict['rate_class'] = rc_list
    else:
        contract_data_dict['rate_class'] = None

    if not any([service_addresses, account_numbers, meter_numbers, rate_classes]):
        notes.append("No service/account details found.")

    contract_data_dict['_notes'] = notes

In [292]:
# Used to cut a governing-law capture down to just the state name.
_US_STATE_NAMES = (
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "New Hampshire", "New Jersey", "New Mexico", "New York",
    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
    "West Virginia", "Wisconsin", "Wyoming",
)


def _clean_party_name(raw_name):
    """Return the first non-empty line of a name capture, without trailing commas."""
    lines = raw_name.strip().splitlines()
    return lines[0].strip().rstrip(',') if lines else ""


def _search_party_name(text_block, patterns):
    """Return the first acceptable name matched by `patterns` in `text_block`, else None."""
    for pattern in patterns:
        match = pattern.search(text_block)
        if match:
            name_cand = _clean_party_name(match.group("name"))
            # Avoid overly short or generic matches unless they carry a company suffix.
            if len(name_cand) > 6 or "Inc" in name_cand or "LLC" in name_cand:
                return name_cand
    return None


def _trim_to_state_name(raw_capture):
    """Reduce the text captured after 'State of' to the state name alone."""
    collapsed = re.sub(r"\s+", " ", raw_capture).strip()
    lowered = collapsed.lower()
    for state in _US_STATE_NAMES:
        key = state.lower()
        if lowered == key or lowered.startswith(key + " "):
            # Keep the document's own casing; only the trailing clause is dropped.
            return collapsed[:len(state)]
    # Not a recognised US state: keep the first line of the capture as found.
    lines = raw_capture.strip().splitlines()
    return lines[0].strip() if lines else ""


def extract_party_and_governing_law(sections, tables, contract_data_dict, full_markdown_text):
    """
    Extracts Seller Name, Buyer Name (Customer), and Governing Law/State.

    Party names are searched, in order, in the first 2000 characters of the
    document, in any section whose title contains "notices", and in the last
    2000 characters. The governing-law state is searched in a section titled
    "governing law"/"applicable law" when one exists, then in the full text.

    Args:
        sections: dict mapping section name -> section text.
        tables: parsed tables; accepted for signature parity with the other
            extractors and not used here.
        contract_data_dict: dict updated in place.
        full_markdown_text: the complete markdown text of the contract.

    Returns:
        None. Writes 'seller_name', 'buyer_name', 'governing_law_state'
        (each a str or None) and '_notes' into `contract_data_dict`.
    """
    notes = contract_data_dict.get("_notes", [])
    sections = sections or {}
    full_markdown_text = full_markdown_text or ""

    seller_name = None
    buyer_name = None  # This could be legal entity or DBA
    governing_law_state = None

    # --- Search Space ---
    notice_section_text = None
    governing_law_section_text = None
    preamble_text = full_markdown_text[:2000]  # Approx first 2000 chars for preamble

    for section_name, section_content in sections.items():
        lowered_name = section_name.lower()
        if "notices" in lowered_name:
            notice_section_text = section_content
        elif "governing law" in lowered_name or "applicable law" in lowered_name:
            governing_law_section_text = section_content
        # Signature sections are rarely titled consistently, so the tail of the
        # full text is searched for them instead.

    # --- Seller and Buyer Name Extraction ---
    # Ex1: Seller, Tiger, Inc. | BUYER Name: Barcelona Rino LLC
    # Ex2: Business Name (legal contracting entity): barcelona waypointe llc
    # Ex4: Customer Name: Barcelona Wine Bar
    seller_patterns = [
        re.compile(r"Seller\s*[,:]\s*(?P<name>[A-Za-z0-9\s.,&()IncLLC]+?)(?:,?\s*agrees|\n|hereby)", re.IGNORECASE), # "Seller, Tiger, Inc., agrees" or "Seller: Name"
        re.compile(r"\bSELLER\s*:\s*(?P<name>[A-Z0-9\s.,&()INCLLC]+)(?:\n|Suite J\.|Attn:)", re.MULTILINE), # All caps seller
        re.compile(r"between\s+(?P<name>[A-Za-z0-9\s.,&()IncLLC]+?)\s+\(\s*[\"\']Seller[\"\']\s*\)", re.IGNORECASE) # "between Company (Seller)"
    ]
    buyer_patterns = [
        re.compile(r"Buyer\s*[,:]\s*(?P<name>[A-Za-z0-9\s.,&()IncLLC]+?)(?:,?\s*agrees|\n|Facility Address:)", re.IGNORECASE),
        re.compile(r"\bBUYER Name\s*:\s*(?P<name>[A-Za-z0-9\s.,&()IncLLC]+)", re.IGNORECASE),
        re.compile(r"(?:Customer Name|Business Name.*?):\s*(?P<name>[A-Za-z0-9\s.,&()IncLLC]+)", re.IGNORECASE), # Ex2, Ex4
        re.compile(r"and\s+(?P<name>[A-Za-z0-9\s.,&()IncLLC]+?)\s+\(\s*[\"\']Buyer[\"\']\s*\)", re.IGNORECASE), # "and Company (Buyer)"
        re.compile(r"\bBUYER\s*:\s*(?P<name>[A-Z0-9\s.,&()INCLLC]+)(?:\n|By:)", re.MULTILINE) # All caps buyer
    ]

    search_areas_for_parties = [preamble_text]
    if notice_section_text:
        search_areas_for_parties.append(notice_section_text)
    search_areas_for_parties.append(full_markdown_text[-2000:])  # Last 2000 chars for signatures

    # The name character classes include whitespace, so a greedy capture can run
    # onto following lines; _search_party_name keeps only the first line.
    for text_block in search_areas_for_parties:
        if not seller_name:
            seller_name = _search_party_name(text_block, seller_patterns)
            if seller_name:
                print(f"    Parties: Found Seller Name: {seller_name}")
        if not buyer_name:
            buyer_name = _search_party_name(text_block, buyer_patterns)
            if buyer_name:
                print(f"    Parties: Found Buyer Name: {buyer_name}")
        if seller_name and buyer_name:
            break

    # --- Governing Law Extraction ---
    # Ex1: This contract shall be governed by the laws of the State of Colorado.
    # Ex2: This Agreement shall be governed by the laws of the State of Texas
    gov_law_pattern = re.compile(r"governed by the laws of the State of\s+([A-Za-z\s]+)\.?", re.IGNORECASE)

    search_areas_for_law = []
    if governing_law_section_text:
        search_areas_for_law.append(governing_law_section_text)
        print("    GovLaw: Searching in dedicated Governing Law section.")
    search_areas_for_law.append(full_markdown_text)  # Search full doc as fallback

    for text_block in search_areas_for_law:
        match = gov_law_pattern.search(text_block)
        if match:
            # The capture runs on through any trailing clause ("... without regard
            # to ...") and across line breaks, so trim it to the state name.
            state_cand = _trim_to_state_name(match.group(1))
            if state_cand:
                governing_law_state = state_cand
                print(f"    GovLaw: Found Governing Law State: {governing_law_state}")
                break  # Found it

    # Update contract_data_dict
    contract_data_dict['seller_name'] = seller_name
    contract_data_dict['buyer_name'] = buyer_name  # Could be 'customer_name'
    contract_data_dict['governing_law_state'] = governing_law_state

    if not seller_name: notes.append("Seller name not found.")
    if not buyer_name: notes.append("Buyer/Customer name not found.")
    if not governing_law_state: notes.append("Governing law/state not found.")

    contract_data_dict['_notes'] = notes

In [294]:
# Driver cell: runs every extractor over each contract markdown file and writes
# one combined JSON summary. Because this is top-level code, the names assigned
# below (all_extracted_data, current_contract_data, sections, tables, ...) stay
# in the kernel namespace after the cell finishes.
if __name__ == "__main__":
    all_extracted_data = []
    # Counts every file yielded by find_contract_files, including files that
    # are skipped below because they could not be read or were empty.
    file_count = 0

    for md_file_path in find_contract_files(BASE_OUTPUT_DIR):
        print(f"\nProcessing: {md_file_path.parent.name}/{md_file_path.name}")
        file_count += 1
        
        markdown_content = read_markdown_file(md_file_path)
        # read_markdown_file returns None on error; an empty file is skipped too.
        if not markdown_content:
            continue

        # The enclosing "... (Contract)" folder name serves as the contract identifier.
        contract_id = md_file_path.parent.name
        # Result record for this contract; keys starting with "_" are bookkeeping
        # fields that are left out of the printed summary below.
        current_contract_data = {
            "contract_id": contract_id,
            "_source_file": str(md_file_path.relative_to(BASE_OUTPUT_DIR)),
            "_notes": [],
            "seller_name": None, # New
            "buyer_name": None,  # New
            "governing_law_state": None, #New
            "contract_duration": {"start_date": None, "end_date": None},
            "monthly_forecasted_usage": {},
            "contracted_rates": [],
            "unit_of_measurement": None,
            "rate_class": None, 
            "service_addresses": [],
            "account_numbers": [],
            "meter_numbers": []
        }

        # Missing "sections"/"tables" keys fall back to an empty dict / list.
        parsed_structure = parse_markdown_to_structured_data(markdown_content)
        sections = parsed_structure.get("sections", {})
        tables = parsed_structure.get("tables", [])
        
        print(f"  Found {len(sections)} refined sections: {list(sections.keys())}")
        if tables:
            print(f"  Found {len(tables)} tables (via HTML parsing).")
            for idx, table_df in enumerate(tables):
                print(f"    Table {idx} columns: {table_df.columns.tolist()}")
        else:
            print("  No tables found by HTML parsing.")

        # --- Call Extraction Functions ---
        # Return values are ignored: each extractor is expected to write its
        # results into current_contract_data in place.
        # NOTE(review): extract_contract_duration and extract_rates_and_unit are
        # defined outside this view - confirm they follow the same convention.
        extract_contract_duration(sections, tables, current_contract_data)
        extract_rates_and_unit(sections, tables, current_contract_data, markdown_content) 
        extract_monthly_usage(sections, tables, current_contract_data)
        extract_service_account_details(sections, tables, current_contract_data, markdown_content) 
        extract_party_and_governing_law(sections, tables, current_contract_data, markdown_content)
        
        # --- Print Summary of Extracted Data for this Contract ---
        print(f"\n--- Summary for {contract_id} ---")
        for key, value in current_contract_data.items():
            if key != "_notes" and key != "_source_file": # Don't print internal keys here
                 print(f"  {key.replace('_', ' ').title()}: {value}")
        if current_contract_data["_notes"]:
            print(f"  Notes: {current_contract_data['_notes']}")
        print("--- End Summary ---")

        all_extracted_data.append(current_contract_data)

    print(f"\n--- Processed {file_count} contract files. ---")

    # Write all contract records to a single JSON file; a failure to save is
    # reported on stdout rather than raised.
    if all_extracted_data:
        output_summary_file = JSON_OUTPUT_DIR / "hybrid_summary.json"
        try:
            with open(output_summary_file, 'w', encoding='utf-8') as f:
                # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
                json.dump(all_extracted_data, f, indent=2, ensure_ascii=False)
            print(f"Successfully saved summary to {output_summary_file}")
        except Exception as e:
            print(f"Error saving summary file: {e}")
    else:
        print("No data extracted to save.")


Searching for contract folders in: /Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs

Processing: Example 2 (Contract)/Example 2 (Contract).md
  Successfully read 30822 characters from Example 2 (Contract).md.
  Found 13 refined sections: ['CUSTOMER INFORMATION', 'CONTRACT INFORMATION', 'ACKNOWLEDGMENT AND CONSENT', 'FACILITIES/ACCOUNTS', 'SECTION 1. TERMS OF SERVICE', 'SECTION 2. GENERAL TERMS AND CONDITIONS', 'SECTION 3. DEFINITIONS', 'Service Options', 'Customer Information and Authorization', 'Requestor & Billing Information', 'Additional Locations', 'Supplier/Consultant Information (please print):', 'Operating Company Information (please print):']
  Found 8 tables (via HTML parsing).
    Table 0 columns: ['Field', 'Details']
    Table 1 columns: ['Signature', 'Customer', 'ENGIE']
    Table 2 columns: ['NO.', 'FACILITY NAME/ SERVICE ADDRESS', 'CITY, STATE, ZIP', 'UTILITY', 'DELIVERY POINT', 'ACCOUNT NUMBER']
    Table 3 columns: ['1.BA 51823187091', 'SA 211953002'