In [8]:
import json
import os
from pathlib import Path
import re # Import regex module for later use

# Define the base directory where the example folders are located
BASE_OUTPUT_DIR = Path("/Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs")

In [10]:
# List to hold the results for all contracts
all_contract_data = []

def process_contracts(base_dir):
    """
    Finds, reads, and prepares to process contract markdown files.
    """
    print(f"Searching for contract folders in: {base_dir.resolve()}")

    # Iterate through items in the base directory
    for item in base_dir.iterdir():
        # Check if it's a directory and seems like an example folder
        # (Adjust the naming check if needed)
        if item.is_dir() and '(Contract)' in item.name:
            contract_folder = item
            print(f"\nFound contract folder: {contract_folder.name}")

            # --- Find the markdown file ---
            md_files = list(contract_folder.glob('*.md'))

            if not md_files:
                print(f"  Warning: No .md file found in {contract_folder.name}")
                continue
            elif len(md_files) > 1:
                print(f"  Warning: Multiple .md files found in {contract_folder.name}. Using the first one: {md_files[0].name}")

            md_file_path = md_files[0]
            print(f"  Processing markdown file: {md_file_path.name}")

            # --- Read the markdown content ---
            try:
                with open(md_file_path, 'r', encoding='utf-8') as f:
                    markdown_content = f.read()
                # print(f"  Successfully read {len(markdown_content)} characters.")
                # Limit printing content for brevity during development
                # print("-" * 20 + " Start Content Sample " + "-" * 20)
                # print(markdown_content[:500] + "...") # Print first 500 chars
                # print("-" * 20 + " End Content Sample " + "-" * 20)


            except Exception as e:
                print(f"  Error reading file {md_file_path}: {e}")
                continue

            # --- Prepare data structure ---
            # Use the folder name (or part of it) as a preliminary contract ID
            contract_id = contract_folder.name # Or derive a cleaner ID if needed

            contract_data = {
                "contract_id": contract_id,
                "customer_dba_name": None,
                "service_addresses": [],
                "account_numbers": [],
                "meter_numbers": [],
                "contract_duration": {"start_date": None, "end_date": None},
                "rate_class": None,
                "contracted_rates": [],
                "unit_of_measurement": None,
                "monthly_forecasted_usage": {},
                "_source_file": str(md_file_path.relative_to(base_dir)) # Keep track of source
            }

            # --- Placeholder for future extraction steps ---
            # extract_data(markdown_content, contract_data) # We'll build this function

            all_contract_data.append(contract_data)
            print(f"  Prepared basic structure for contract: {contract_id}")

    return all_contract_data

In [12]:
# --- Main execution ---
if __name__ == "__main__":
    if not BASE_OUTPUT_DIR.exists():
        print(f"Error: Base directory '{BASE_OUTPUT_DIR}' does not exist.")
    else:
        extracted_data_list = process_contracts(BASE_OUTPUT_DIR)
        print(f"\nProcessed {len(extracted_data_list)} contract folders.")

        # Optional: Print the basic structures created
        # for data in extracted_data_list:
        #     print(json.dumps(data, indent=2))

        # Next steps will involve populating the fields in 'contract_data'
        # by parsing 'markdown_content'.

Searching for contract folders in: /Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs

Found contract folder: Example 2 (Contract)
  Processing markdown file: Example 2 (Contract).md
  Prepared basic structure for contract: Example 2 (Contract)

Found contract folder: Example 3 (Contract)

Found contract folder: Example 1 (Contract)
  Processing markdown file: Example 1 (Contract).md
  Prepared basic structure for contract: Example 1 (Contract)

Found contract folder: Example 4 (Contract)
  Processing markdown file: Example 4 (Contract).md
  Prepared basic structure for contract: Example 4 (Contract)

Processed 3 contract folders.


In [14]:
from datetime import datetime
from dateutil.relativedelta import relativedelta # For adding months to dates
import pandas as pd # Using pandas for robust date parsing

def normalize_date(date_str):
    """Attempts to parse a date string into YYYY-MM-DD format."""
    if not date_str:
        return None
    try:
        # Use pandas to_datetime which is flexible with formats
        dt = pd.to_datetime(date_str)
        return dt.strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        print(f"  Warning: Could not parse date: {date_str}")
        return None

def calculate_end_date(start_date_str, months):
    """Calculates end date given start date string and number of months."""
    start_date = pd.to_datetime(start_date_str)
    # Subtract one day from the date after adding months to get the last day of the period
    end_date = start_date + relativedelta(months=months) - relativedelta(days=1)
    return end_date.strftime('%Y-%m-%d')


def extract_contract_duration(markdown_content, contract_data):
    """
    Extracts the contract start and end dates from markdown content.
    """
    start_date = None
    end_date = None
    months = None

    # --- Regex Patterns ---
    # Pattern 1: Explicit Start and End Dates (various formats)
    # Covers "Begin: MM/DD/YYYY End: MM/DD/YYYY", "Start Date: ... End Date: ...", etc.
    pattern1 = re.compile(
        r"(?:Begin|Start Date|Effective Date|Start)\s*[:\-]?\s*([\d/.-]+|\w+\s\d{1,2},\s\d{4})\s*"
        r"(?:End|End Date)\s*[:\-]?\s*([\d/.-]+|\w+\s\d{1,2},\s\d{4})",
        re.IGNORECASE
    )

    # Pattern 2: Delivery Period Keyword (similar structure)
    pattern2 = re.compile(
        r"Delivery Period\s*[:\-]?\s*Begin\s*[:\-]?\s*([\d/.-]+|\w+\s\d{1,2},\s\d{4})\s*"
        r"End\s*[:\-]?\s*([\d/.-]+|\w+\s\d{1,2},\s\d{4})",
        re.IGNORECASE
    )

    # Pattern 3: "term of XX Months" potentially near a start date
    pattern3_term = re.compile(r"term of\s+(\d+)\s+Months", re.IGNORECASE)
    pattern3_start = re.compile(
        r"(?:on or after|effective|starting|Begin)\s*[:\-]?\s*([\d/.-]+|\w+\s\d{1,2},\s\d{4})",
        re.IGNORECASE
    )

    # Pattern 4: Specific structure from Example 3 Contract doc (page 39)
    pattern4 = re.compile(r"on or after\s+([\w\s\d,]+)\s+and will continue for a term of\s+(\d+)\s+Months", re.IGNORECASE)

    # Pattern 5: Specific structure from Example 4 Contract doc (page 21)
    pattern5 = re.compile(r"DELIVERY PERIOD\s+Begin\s*[:\-]?\s*([\d/.-]+)\s+End\s*[:\-]?\s*([\d/.-]+)", re.IGNORECASE)


    # --- Search using patterns ---
    match = pattern1.search(markdown_content)
    if not match:
        match = pattern2.search(markdown_content)
    if not match:
        match = pattern5.search(markdown_content) # Check specific pattern 5

    if match:
        start_date = normalize_date(match.group(1).strip())
        end_date = normalize_date(match.group(2).strip())
        print(f"  Duration Found (Pattern 1/2/5): Start={start_date}, End={end_date}")
    else:
        # Try pattern 4
        match4 = pattern4.search(markdown_content)
        if match4:
            start_date = normalize_date(match4.group(1).strip())
            months = int(match4.group(2))
            print(f"  Duration Found (Pattern 4): Start={start_date}, Months={months}")
        else:
            # Try pattern 3 (Term + Separate Start Date) - search near term
            match3_term = pattern3_term.search(markdown_content)
            if match3_term:
                months = int(match3_term.group(1))
                # Search for start date within a reasonable range around the term match
                search_range = markdown_content[max(0, match3_term.start() - 100):match3_term.end() + 100]
                match3_start = pattern3_start.search(search_range)
                if match3_start:
                    start_date = normalize_date(match3_start.group(1).strip())
                    print(f"  Duration Found (Pattern 3): Start={start_date}, Months={months}")
                else:
                    print(f"  Duration Found (Pattern 3): Months={months}, but Start Date not found nearby.")
            else:
                 print("  Contract duration pattern not found.")


    # --- Calculate End Date if only Start and Months are found ---
    if start_date and months and not end_date:
        try:
            end_date = calculate_end_date(start_date, months)
            print(f"  Calculated End Date: {end_date}")
        except Exception as e:
            print(f"  Warning: Could not calculate end date from start {start_date} and {months} months: {e}")


    # --- Update contract_data ---
    if start_date or end_date:
        contract_data['contract_duration']['start_date'] = start_date
        contract_data['contract_duration']['end_date'] = end_date
    elif months:
         # Store months if dates are missing, maybe add a note
         contract_data['contract_duration']['duration_months'] = months
         print(f"  Stored duration in months: {months}")


# (Keep the process_contracts function from the previous step, but add the call)

def process_contracts(base_dir):
    """
    Finds, reads, and processes contract markdown files.
    """
    print(f"Searching for contract folders in: {base_dir.resolve()}")
    processed_files_count = 0

    # Iterate through items in the base directory
    for item in base_dir.iterdir():
        # Check if it's a directory and seems like an example folder
        if item.is_dir() and '(Contract)' in item.name:
            contract_folder = item
            print(f"\nFound contract folder: {contract_folder.name}")

            # --- Find the markdown file ---
            md_files = list(contract_folder.glob('*.md'))

            if not md_files:
                print(f"  Warning: No .md file found in {contract_folder.name}")
                continue
            elif len(md_files) > 1:
                print(f"  Warning: Multiple .md files found in {contract_folder.name}. Using the first one: {md_files[0].name}")

            md_file_path = md_files[0]
            print(f"  Processing markdown file: {md_file_path.name}")

            # --- Read the markdown content ---
            try:
                with open(md_file_path, 'r', encoding='utf-8') as f:
                    markdown_content = f.read()
                if not markdown_content.strip(): # Check if file is empty
                     print("  Warning: Markdown file is empty.")
                     # Still create a basic entry, but don't try to parse
                     contract_id = contract_folder.name
                     contract_data = {
                         "contract_id": contract_id,
                         "customer_dba_name": None, # Initialize all fields
                         "service_addresses": [],
                         "account_numbers": [],
                         "meter_numbers": [],
                         "contract_duration": {"start_date": None, "end_date": None},
                         "rate_class": None,
                         "contracted_rates": [],
                         "unit_of_measurement": None,
                         "monthly_forecasted_usage": {},
                         "_source_file": str(md_file_path.relative_to(base_dir)),
                         "_notes": ["Markdown file was empty or whitespace only"]
                     }
                     all_contract_data.append(contract_data)
                     processed_files_count += 1
                     continue # Skip to next folder

                print(f"  Successfully read {len(markdown_content)} characters.")

            except Exception as e:
                print(f"  Error reading file {md_file_path}: {e}")
                continue

            # --- Prepare data structure ---
            contract_id = contract_folder.name
            contract_data = {
                "contract_id": contract_id,
                "customer_dba_name": None,
                "service_addresses": [],
                "account_numbers": [],
                "meter_numbers": [],
                "contract_duration": {"start_date": None, "end_date": None},
                "rate_class": None,
                "contracted_rates": [],
                "unit_of_measurement": None,
                "monthly_forecasted_usage": {},
                 "_source_file": str(md_file_path.relative_to(base_dir)),
                 "_notes": [] # Add a list for potential notes
            }

            # --- Extract data ---
            extract_contract_duration(markdown_content, contract_data)
            # Add calls to other extraction functions here later
            # extract_usage_forecast(markdown_content, contract_data)
            # extract_unit_and_rate(markdown_content, contract_data)
            # ... etc

            all_contract_data.append(contract_data)
            processed_files_count += 1
            # print(f"  Updated structure for contract: {contract_id}")
            # print(json.dumps(contract_data['contract_duration'], indent=2))


    print(f"\nProcessed {processed_files_count} contract files.")
    return all_contract_data

# --- Main execution ---
if __name__ == "__main__":
    if not BASE_OUTPUT_DIR.exists():
        print(f"Error: Base directory '{BASE_OUTPUT_DIR}' does not exist.")
    else:
        extracted_data_list = process_contracts(BASE_OUTPUT_DIR)

        # Optional: Print the final list of extracted data
        # print("\n--- Final Extracted Data ---")
        # print(json.dumps(extracted_data_list, indent=2))

        # Save to a JSON file
        output_file = BASE_OUTPUT_DIR / "extracted_contracts_summary.json"
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(extracted_data_list, f, indent=2, ensure_ascii=False)
            print(f"\nSuccessfully saved summary to {output_file}")
        except Exception as e:
            print(f"\nError saving summary file: {e}")

Searching for contract folders in: /Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs

Found contract folder: Example 2 (Contract)
  Processing markdown file: Example 2 (Contract).md
  Successfully read 30822 characters.
  Contract duration pattern not found.

Found contract folder: Example 3 (Contract)

Found contract folder: Example 1 (Contract)
  Processing markdown file: Example 1 (Contract).md
  Successfully read 17353 characters.
  Contract duration pattern not found.

Found contract folder: Example 4 (Contract)
  Processing markdown file: Example 4 (Contract).md
  Successfully read 39333 characters.
  Contract duration pattern not found.

Processed 3 contract files.

Successfully saved summary to /Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs/extracted_contracts_summary.json


In [38]:
from collections import defaultdict
import calendar

def generate_year_month_keys(start_date_str, end_date_str):
    """Generates a list of 'YYYY-MM' keys between start and end dates."""
    keys = []
    try:
        start_date = pd.to_datetime(start_date_str)
        end_date = pd.to_datetime(end_date_str)
        current_date = start_date
        while current_date <= end_date:
            keys.append(current_date.strftime('%Y-%m'))
            # Move to the first day of the next month
            current_date = (current_date.replace(day=1) + relativedelta(months=1))
        # Handle edge case where duration might be very short or dates are weird
        if start_date.strftime('%Y-%m') not in keys and start_date <= end_date:
             keys.insert(0, start_date.strftime('%Y-%m'))
        if end_date.strftime('%Y-%m') not in keys and start_date <= end_date:
             # Only add if the month hasn't already been advanced past
             last_key_date = pd.to_datetime(keys[-1]+'-01') if keys else start_date.replace(day=1) - relativedelta(months=1)
             if end_date >= last_key_date + relativedelta(months=1):
                  keys.append(end_date.strftime('%Y-%m'))

    except Exception as e:
        print(f"  Warning: Could not generate date keys from {start_date_str} to {end_date_str}: {e}")
    return keys

def map_month_name_to_number(month_name):
    """Converts month name (e.g., 'Jan', 'January') to month number (1-12)."""
    month_name = month_name.strip().lower()[:3]
    month_map = {name.lower()[:3]: num for num, name in enumerate(calendar.month_abbr) if num > 0}
    return month_map.get(month_name)

# Assume normalize_date, calculate_end_date, pandas (pd), re, etc. are available


def extract_usage_forecast(markdown_content, contract_data):
    """
    Extracts the monthly forecasted usage data, focusing on table structures
    and validating matches. Relies on contract_duration being populated.
    """
    usage_data = defaultdict(int)
    monthly_values_raw = [] # Store tuples of (month_num, value) or (year_month_str, value)
    found_monthly_data = False
    notes = contract_data.get("_notes", [])

    # --- Section Identification (Optional but recommended for large docs) ---
    # Look for lines indicating start of usage sections
    usage_section_headers = [
        "Performance Obligation",
        "Contract Quantity",
        "Monthly Quantity",
        "Baseload Volume per Month" # Added from Example 1
    ]
    # Find lines containing these headers (case-insensitive)
    section_starts = []
    for header in usage_section_headers:
        try:
            # Find all occurrences, store line number or index
             for match in re.finditer(rf"^\s*\#*.*{re.escape(header)}.*$", markdown_content, re.IGNORECASE | re.MULTILINE):
                 section_starts.append(match.start())
        except re.error as e:
            print(f"  Regex error looking for header '{header}': {e}")
            continue # Skip header if regex is invalid

    search_text = markdown_content # Default to searching whole document

    # If sections found, we could potentially narrow down search_text,
    # but parsing blocks accurately can be complex. For now, we'll just use
    # the header presence as a stronger indicator later.
    if section_starts:
        print(f"  Identified potential usage section header(s): {usage_section_headers}")
    else:
        print("  No specific usage section headers found, searching document.")


    # --- Revised Regex Patterns (Stricter Table Focus) ---
    month_pattern_str = r"(?P<month>Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
    # Value pattern: integer, allowing commas
    value_pattern_str = r"(?P<value>[\d,]+)"

    # Pattern 1: Explicit Markdown Table Row | Month | Value |
    # Requires pipes, allows optional spacing.
    table_row_pattern_month_first = re.compile(
        r"^\s*\|\s*" + month_pattern_str + r"\s*\|\s*" + value_pattern_str + r"\s*\|.*$",
        re.IGNORECASE | re.MULTILINE
    )
    # Pattern 2: Explicit Markdown Table Row | Value | Month |
    table_row_pattern_value_first = re.compile(
        r"^\s*\|\s*" + value_pattern_str + r"\s*\|\s*" + month_pattern_str + r"\s*\|.*$",
        re.IGNORECASE | re.MULTILINE
    )
    # Pattern 3: Less strict, potentially space-separated, but month MUST be followed by number relatively closely
    # Looks for MonthName followed by space(s) then Number, typical in simpler lists/tables
    # This is riskier, use carefully
    spaced_pattern_month_first = re.compile(
        r"^\s*" + month_pattern_str + r"\s+" + value_pattern_str + r"\s*$", # Anchored start/end
        re.IGNORECASE | re.MULTILINE
    )


    # --- Search for Usage Data using refined patterns ---
    all_matches = []
    all_matches.extend(table_row_pattern_month_first.finditer(search_text))
    all_matches.extend(table_row_pattern_value_first.finditer(search_text))
    # Only add spaced pattern if pipe patterns found nothing or very little
    if not all_matches:
         print("  Pipe table pattern yielded no results, trying spaced pattern (less reliable).")
         all_matches.extend(spaced_pattern_month_first.finditer(search_text))

    processed_matches_count = 0
    for match in all_matches:
        month_name = match.group("month")
        value_str = match.group("value").replace(',', '')

        # --- VALIDATION ---
        month_num = map_month_name_to_number(month_name)
        value = None
        if month_num:
            try:
                value = int(value_str)
            except (ValueError, TypeError):
                # Value is not a valid integer, ignore this match
                # print(f"  Debug: Invalid value '{value_str}' for month '{month_name}' in match: {match.group(0)}") # Debug line
                continue # Skip to next match

        if month_num and value is not None:
            # Valid match found!
            monthly_values_raw.append((month_num, value))
            found_monthly_data = True
            processed_matches_count += 1
            # print(f"  Validated usage: {month_name} ({month_num}) -> {value}") # Debug line
        #else:
            # print(f"  Debug: Invalid month '{month_name}' or value '{value_str}' in match: {match.group(0)}") # Debug line


    print(f"  Found {processed_matches_count} potential & validated monthly usage data points.")

    # --- Process Found Data (Logic remains similar, but now uses validated data) ---
    if found_monthly_data:
        print("  Processing validated monthly usage data (assigning years).")
        start_date_str = contract_data['contract_duration'].get('start_date')
        end_date_str = contract_data['contract_duration'].get('end_date')

        if start_date_str and end_date_str:
            all_keys = generate_year_month_keys(start_date_str, end_date_str)
            if not all_keys:
                 notes.append("Could not generate date keys for usage mapping despite having start/end dates.")
                 print("  Error: Could not generate date keys for usage mapping.")
            else:
                month_to_value = defaultdict(int)
                # Aggregate potentially multiple tables/locations
                for m, v in monthly_values_raw:
                    month_to_value[m] += v

                num_unique_months_found = len(month_to_value)
                print(f"  Aggregated data for {num_unique_months_found} unique months.")

                if num_unique_months_found == 12:
                    print(f"  Applying 12-month usage pattern across {len(all_keys)} months.")
                    for key in all_keys:
                        year, month_num = map(int, key.split('-'))
                        usage_data[key] = month_to_value.get(month_num, 0)
                elif num_unique_months_found > 0:
                     # Determine sequence based on actual found month numbers
                     month_sequence = sorted(month_to_value.keys())
                     values_in_order = [month_to_value[m] for m in month_sequence]

                     print(f"  Mapping found {num_unique_months_found} months cyclically across {len(all_keys)} month duration.")
                     notes.append(f"Applied pattern for {num_unique_months_found} months over {len(all_keys)} month duration (cyclical if needed).")
                     if len(all_keys) > 0 and num_unique_months_found > 0:
                        for i, key in enumerate(all_keys):
                            usage_data[key] = values_in_order[i % num_unique_months_found] # Cycle through found values
                     else:
                           notes.append(f"Cannot apply cyclical pattern: num_keys={len(all_keys)}, num_months={num_unique_months_found}")

                else: # num_unique_months_found == 0
                    notes.append("Validated matches, but resulted in 0 unique months of data.")
                    print("  Warning: Validated matches, but resulted in 0 unique months.")

        else:
            notes.append("Cannot map monthly usage without contract start/end dates.")
            print("  Warning: Cannot assign years to monthly usage without contract start/end dates.")

    # --- Handle Annual Usage (Fallback - Logic remains the same) ---
    if not found_monthly_data:
        print("  No monthly usage table found. Checking for annual usage...")
        # (Keep the annual usage pattern search logic from before)
        annual_pattern = re.compile(r"Annual\s+(?:Historical\s+)?Usage\s*(?:\(.*\))?\s*[:\-]?\s*([\d,]+)", re.IGNORECASE)
        annual_match = annual_pattern.search(markdown_content)
        if annual_match:
            try:
                annual_value = int(annual_match.group(1).replace(',', ''))
                monthly_estimate = round(annual_value / 12)
                print(f"  Found Annual Usage: {annual_value}. Estimating monthly as: {monthly_estimate}")

                start_date_str = contract_data['contract_duration'].get('start_date')
                end_date_str = contract_data['contract_duration'].get('end_date')

                if start_date_str and end_date_str:
                    all_keys = generate_year_month_keys(start_date_str, end_date_str)
                    if all_keys:
                        for key in all_keys:
                            usage_data[key] = monthly_estimate
                        notes.append(f"Estimated monthly usage ({monthly_estimate}) from annual total ({annual_value}).")
                    else:
                         notes.append(f"Found annual usage ({annual_value}) but could not generate date keys to distribute.")
                else:
                    notes.append(f"Found annual usage ({annual_value}) but cannot distribute without contract start/end dates.")

            except ValueError:
                print(f"  Warning: Could not parse annual usage value: {annual_match.group(1)}")
            except Exception as e:
                print(f"  Error processing annual usage: {e}")
        else:
            print("  No annual usage value found either.")
            notes.append("No monthly or annual usage data found.")

    # --- Final Update ---
    contract_data['monthly_forecasted_usage'] = dict(sorted(usage_data.items())) # Sort by YYYY-MM
    contract_data['_notes'] = notes


# Assume other necessary imports and helper functions are available

# Assume other necessary imports and helper functions are available

def extract_unit_and_rate(markdown_content, contract_data):
    """
    Extracts the contracted rate(s) and the primary unit of measurement.
    Revised pattern for better markdown handling.
    """
    rates = []
    unit = None
    notes = contract_data.get("_notes", [])

    # --- Revised Regex Patterns ---

    # Pattern 1: Handle labels like **Fixed Price**:, Contract price ($/kwh): etc.
    # More specific units, allows optional markdown **, flexible spacing, named groups
    rate_pattern_markdown = re.compile(
        r"\*{0,2}(?:Contract price|Fixed Price)\*{0,2}\s*[:\-(]?\s*" # Label with optional **, :, -, or (
        r"(?P<currency>[\$¢])?"                                     # Optional currency symbol (Group 'currency')
        r"(?P<value>[\d,]+(?:\.\d+)?)"                              # Price value (Group 'value')
        r"\s*(?:\)?\s*(?:per|\/)\s*)?"                              # Optional closing ), separator per or / (optional)
        r"(?P<unit>KWh|MMBTU|Dth|Therm)\b",                         # Specific Unit (Group 'unit'), word boundary
        re.IGNORECASE
    )

    # Pattern 2: Cents pattern (revised for flexibility)
    cents_rate_pattern = re.compile(
        r"(?P<value>[\d,]+(?:\.\d+)?)\s+\(?[Cc]ents?[\/ ]KWh\)?", # Allow optional (), space or /
        re.IGNORECASE
    )

    # Pattern 3: Unit context pattern (remains same, but add named group)
    unit_context_pattern = re.compile(
        r"(?:Volume|Quantity)\s+(?:per\s+Month\s+)?in\s+(?P<unit>Dths|MMBTUs|kWh|Therms)\b",
        re.IGNORECASE
    )


    # --- Search for Rate and Unit ---
    found_rate = False
    processed_matches = set() # Keep track of matched text to avoid duplicates from overlapping patterns

    # Try Pattern 1
    for match in rate_pattern_markdown.finditer(markdown_content):
        match_text = match.group(0)
        if match_text in processed_matches: continue # Skip if already processed
        processed_matches.add(match_text)

        price_str = match.group("value").replace(',', '')
        unit_str = match.group("unit").upper()
        currency_symbol = match.group("currency")

        # Basic unit standardization
        if unit_str == "DTHS": unit_str = "DTH"
        elif unit_str == "THERMS": unit_str = "THERM"
        elif unit_str == "MMBTUS": unit_str = "MMBTU"
        # KWH is fine

        try:
            price_val = float(price_str)
            rate_unit_str = f"$/{unit_str}" # Default to dollars

            if currency_symbol == '¢':
                price_val = price_val / 100.0 # Convert cents to dollars
                print(f"  Found cents rate (pattern 1): {match.group('value')} {currency_symbol}/{unit_str} -> {price_val:.5f} $/ {unit_str}")
            else: # Assume dollars if $ or no symbol
                 print(f"  Found rate (pattern 1): {match.group('value')} {currency_symbol or '$'}/{unit_str}")


            rate_entry = {
                "rate": f"{price_val:.5f}" if currency_symbol == '¢' else str(price_val), # Store consistently
                "unit": rate_unit_str
            }

            # Avoid adding duplicate rates (same value and unit)
            if rate_entry not in rates:
                 rates.append(rate_entry)
            if not unit: unit = unit_str # Assign first found unit as primary
            found_rate = True

        except ValueError:
            print(f"  Warning: Could not parse rate value: {match.group('value')}")
        except Exception as e:
             print(f"  Error processing rate match {match.group(0)}: {e}")


    # Try Pattern 2 (Cents pattern) only if Pattern 1 found nothing
    if not found_rate:
        for match in cents_rate_pattern.finditer(markdown_content):
            match_text = match.group(0)
            if match_text in processed_matches: continue
            processed_matches.add(match_text)

            price_str = match.group("value").replace(',', '')
            unit_str = "KWH" # Pattern implies kWh

            try:
                price_val_cents = float(price_str)
                price_val_dollars = price_val_cents / 100.0
                rate_entry = {
                    "rate": f"{price_val_dollars:.5f}", # Store as dollar value string
                    "unit": f"$/{unit_str}"
                }
                if rate_entry not in rates:
                    rates.append(rate_entry)
                if not unit: unit = unit_str
                found_rate = True
                print(f"  Found cents rate (pattern 2): {price_val_cents} cents/KWh -> {price_val_dollars:.5f} $/KWH")

            except ValueError:
                print(f"  Warning: Could not parse cents rate value: {match.group('value')}")
            except Exception as e:
                 print(f"  Error processing cents rate match {match.group(0)}: {e}")


    # --- Determine Unit if not found via rate ---
    if not unit:
        print("  Unit not found with rate, checking context...")
        match = unit_context_pattern.search(markdown_content)
        if match:
            unit_str_context = match.group("unit").upper()
            # Standardize from context
            if unit_str_context in ["DTHS", "DTH"]: unit = "DTH"
            elif unit_str_context in ["THERMS"]: unit = "THERM"
            elif unit_str_context == "KWH": unit = "KWH"
            elif unit_str_context in ["MMBTUS", "MMBTU"]: unit = "MMBTU"

            if unit:
                 print(f"  Found unit from context: {unit}")
                 notes.append(f"Unit '{unit}' inferred from context (e.g., usage table header).")
            else:
                 print(f"  Found context unit '{match.group('unit')}' but couldn't standardize.")
                 notes.append(f"Could not standardize unit '{match.group('unit')}' found from context.")


    # --- Final Unit Check and Notes ---
    if not unit:
        # Fallback inference based on keywords (keep this logic)
        if "ELECTRIC" in markdown_content.upper():
             unit = "KWH"
             notes.append("Inferred unit KWH based on 'ELECTRIC' keyword in document.")
             print("  Inferred unit KWH based on 'ELECTRIC' keyword.")
        elif "GAS" in markdown_content.upper() or "NATURAL GAS" in markdown_content.upper():
             unit = "MMBTU" # Default guess for gas based on Example 4
             notes.append("Inferred unit MMBTU based on 'GAS' keyword in document (could be THERM/DTH).")
             print("  Inferred unit MMBTU based on 'GAS' keyword (could be THERM/DTH).")
        else:
             notes.append("Could not determine unit of measurement.")
             print("  Warning: Could not determine unit of measurement.")

    if not rates:
        notes.append("Could not find any contracted rate information.")
        print("  Warning: Could not find any contracted rate information.")

    # --- Update contract_data ---
    contract_data['contracted_rates'] = rates
    contract_data['unit_of_measurement'] = unit
    contract_data['_notes'] = notes


# --- process_contracts function remains the same ---
# --- Main execution remains the same ---


In [40]:
# --- Update process_contracts to call the new function ---
# (Make sure this call happens *after* extract_contract_duration)

def process_contracts(base_dir):
    """ Finds, reads, and processes contract markdown files. """
    print(f"Searching for contract folders in: {base_dir.resolve()}")
    processed_files_count = 0
    all_contract_data = [] # Initialize list here

    for item in base_dir.iterdir():
        if item.is_dir() and '(Contract)' in item.name:
            contract_folder = item
            print(f"\nFound contract folder: {contract_folder.name}")
            md_files = list(contract_folder.glob('*.md'))

            if not md_files: # ... (handling for no/multiple md files)
                print(f"  Warning: No .md file found in {contract_folder.name}")
                continue
            md_file_path = md_files[0]
            print(f"  Processing markdown file: {md_file_path.name}")

            try: # ... (reading file content)
                 with open(md_file_path, 'r', encoding='utf-8') as f:
                    markdown_content = f.read()
                 if not markdown_content.strip():
                     print("  Warning: Markdown file is empty.")
                     # Create basic entry
                     contract_id = contract_folder.name
                     contract_data = { # Initialize all fields
                        "contract_id": contract_id,"customer_dba_name": None, "service_addresses": [], "account_numbers": [], "meter_numbers": [],
                        "contract_duration": {"start_date": None, "end_date": None}, "rate_class": None, "contracted_rates": [],
                        "unit_of_measurement": None, "monthly_forecasted_usage": {},
                        "_source_file": str(md_file_path.relative_to(base_dir)), "_notes": ["Markdown file was empty or whitespace only"]
                     }
                     all_contract_data.append(contract_data)
                     processed_files_count += 1
                     continue
                 print(f"  Successfully read {len(markdown_content)} characters.")
            except Exception as e: # ... (error handling)
                print(f"  Error reading file {md_file_path}: {e}")
                continue

            # --- Prepare data structure ---
            contract_id = contract_folder.name
            contract_data = { # Initialize all fields
                "contract_id": contract_id,"customer_dba_name": None, "service_addresses": [], "account_numbers": [], "meter_numbers": [],
                "contract_duration": {"start_date": None, "end_date": None}, "rate_class": None, "contracted_rates": [],
                "unit_of_measurement": None, "monthly_forecasted_usage": {},
                "_source_file": str(md_file_path.relative_to(base_dir)), "_notes": []
            }

            # --- Extract data (Duration first, then Usage) ---
            extract_contract_duration(markdown_content, contract_data)
            extract_usage_forecast(markdown_content, contract_data)
            extract_unit_and_rate(markdown_content, contract_data) # <<< Added call
            # ... add other extraction calls later

            all_contract_data.append(contract_data)
            processed_files_count += 1
            # Optional: print details for debugging
            # print(f"  Forecast for {contract_id}: {contract_data['monthly_forecasted_usage']}")
            # print(f"  Notes for {contract_id}: {contract_data['_notes']}")


    print(f"\nProcessed {processed_files_count} contract files.")
    return all_contract_data

# --- Main execution (remains the same) ---
if __name__ == "__main__":
    if not BASE_OUTPUT_DIR.exists():
        print(f"Error: Base directory '{BASE_OUTPUT_DIR}' does not exist.")
    else:
        extracted_data_list = process_contracts(BASE_OUTPUT_DIR)
        output_file = BASE_OUTPUT_DIR / "extracted_contracts_summary.json"
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(extracted_data_list, f, indent=2, ensure_ascii=False)
            print(f"\nSuccessfully saved summary to {output_file}")
        except Exception as e:
            print(f"\nError saving summary file: {e}")

Searching for contract folders in: /Users/josephndigiovanni/Downloads/UChicago/SP24/Capstone 1/outputs

Found contract folder: Example 2 (Contract)
  Processing markdown file: Example 2 (Contract).md
  Successfully read 30822 characters.
  Found Start Date: 2022-11-01 (using pattern: CONTRACT INFORMATION.*?\n\s*\*{1,2}Start Date:\*{1...)
  Found End Date: 2026-10-31 (using pattern: CONTRACT INFORMATION.*?\n\s*\*{1,2}End Date:\*{1,2...)
  No specific usage section headers found, searching document.
  Pipe table pattern yielded no results, trying spaced pattern (less reliable).
  Found 0 potential & validated monthly usage data points.
  No monthly usage table found. Checking for annual usage...
  No annual usage value found either.
  Unit not found with rate, checking context...
  Inferred unit KWH based on 'ELECTRIC' keyword.

Found contract folder: Example 3 (Contract)

Found contract folder: Example 1 (Contract)
  Processing markdown file: Example 1 (Contract).md
  Successfully read 