# Pre-processing MMFT data for the 4-Growth Platform

## Setup and Imports 

### Import Libraries

In [None]:
from pathlib import Path

import pandas as pd

### Variables

In [None]:
# Where the raw Excel workbooks are read from and where processed CSVs are written
INPUT_DATA_DIR = Path("../data/raw/D3.3/")
OUTPUT_DATA_DIR = Path("../data/processed/D3.3/")

# Scenario sheets carry the same names in the forestry and agriculture workbooks;
# only the baseline sheet is named differently.
_SCENARIO_SHEET_NAMES = {
    "reimagining_progress": "Scenario - Reimagining Progress",
    "fractured_continent": "Scenario - Fractured Continent",
    "corporate_epoch": "Scenario - Corporate Epoch",
}

# Forestry MMFT sheet names (table name -> worksheet name, baseline first)
FORESTRY_MMFT_SHEET_NAMES = {
    "baseline": "Model - Baseline (VZZ)",
    **_SCENARIO_SHEET_NAMES,
}

# Agriculture MMFT sheet names (table name -> worksheet name, baseline first)
AGRICULTURE_MMFT_SHEET_NAMES = {
    "baseline": "Model - Baseline",
    **_SCENARIO_SHEET_NAMES,
}

### Utility Functions

In [None]:
# Markers that open each section (matched as substrings of a cell) and the
# indicator name given to the section they introduce.
_SECTION_MARKERS = {
    "A. Market": "market_potential",
    "B. Addressable": "addressable_market",
    "C. Penetration": "penetration",
    "D. Shipments": "shipments",
    "E. Installed": "installed_base",
    "F. Prices": "prices",
    "G. Revenues": "revenues",
}
# Section and "END" markers are only looked for in this many leading columns.
_MARKER_SEARCH_COLUMNS = 3
# The header row is looked for in this many rows directly below a section marker.
_HEADER_SEARCH_ROWS = 4
# A row needs more than this many non-empty cells to be accepted as the header row.
_HEADER_MIN_CELLS = 3


def _match_section(cells: tuple) -> "str | None":
    """Return the section name of the first string cell containing a marker, else None."""
    for cell in cells:
        if not isinstance(cell, str):
            continue
        for marker, section_name in _SECTION_MARKERS.items():
            if marker in cell:
                return section_name
    return None


def _find_end_row(lead_rows: list, start_row: int, n_rows: int) -> int:
    """Return the first row at or after ``start_row`` holding an "END" cell, else ``n_rows``."""
    for row_idx in range(start_row, len(lead_rows)):
        if any(isinstance(cell, str) and cell.strip() == "END" for cell in lead_rows[row_idx]):
            return row_idx
    return n_rows


def _find_header_row(df_raw: pd.DataFrame, start_row: int, end_row: int) -> int:
    """Return the first sufficiently filled row below the marker; default to the next row."""
    last_candidate = min(start_row + 1 + _HEADER_SEARCH_ROWS, end_row)
    for row_idx in range(start_row + 1, last_candidate):
        if df_raw.iloc[row_idx].count() > _HEADER_MIN_CELLS:
            return row_idx
    return start_row + 1


def _as_year_label(label):
    """Turn float labels strictly between 1900 and 2100 into ints (2020.0 -> 2020)."""
    if isinstance(label, float) and 1900 < label < 2100:  # noqa: PLR2004
        return int(label)
    return label


# Function to split dataframe by sections (A-G markers in the leading columns)
def split_by_sections(df_raw: pd.DataFrame) -> dict[str, pd.DataFrame]:
    """
    Split raw dataframe into separate dataframes for each section (A-G).

    Data has sections stacked vertically. Each section is opened by a marker
    cell found in one of the first three columns:
    - A. Market potential
    - B. Addressable market
    - C. Penetration
    - D. Shipments
    - E. Installed base
    - F. Prices
    - G. Revenues

    A section runs until the next marker row. The last section runs until a
    cell equal to "END" (in the first three columns) or the end of the frame.
    Within a section, the first row (among the four rows below the marker) with
    more than three non-empty cells is used as the header; columns without a
    header or without any data, and fully empty rows, are dropped. Float
    year labels are converted to int.

    A section with no rows below its marker is skipped instead of raising, and
    markers that were not found at all are reported in the printed summary.

    Args:
        df_raw: Sheet contents as read from the workbook, sections stacked.

    Returns:
        dict mapping section names to their respective DataFrames. If a
        section name occurs more than once, the last non-empty one wins.
    """
    n_rows = len(df_raw)
    # Pull the marker columns out once instead of indexing cell by cell
    lead_rows = list(
        df_raw.iloc[:, :_MARKER_SEARCH_COLUMNS].itertuples(index=False, name=None)
    )

    # Rows are scanned top to bottom, so the starts are already in row order
    section_starts = []
    for row_idx, cells in enumerate(lead_rows):
        section_name = _match_section(cells)
        if section_name is not None:
            section_starts.append((row_idx, section_name))

    print(f"  Found {len(section_starts)} section(s): {[s[1] for s in section_starts]}")

    found_names = {name for _, name in section_starts}
    missing = [name for name in _SECTION_MARKERS.values() if name not in found_names]
    if missing:
        print(f"  WARNING: no marker found for section(s): {missing}")

    sections: dict[str, pd.DataFrame] = {}
    for i, (start_row, section_name) in enumerate(section_starts):
        # End row is the start of next section, "END" marker, or end of dataframe
        if i + 1 < len(section_starts):
            end_row = section_starts[i + 1][0]
        else:
            end_row = _find_end_row(lead_rows, start_row, n_rows)

        header_row = _find_header_row(df_raw, start_row, end_row)
        if header_row >= end_row:
            # Nothing below the marker: there is no header row to promote
            print(f"  {section_name}: no rows below the marker, skipped")
            continue

        # Promote the header row to column labels and keep the rows below it
        block = df_raw.iloc[header_row:end_row]
        section_df = block.iloc[1:].reset_index(drop=True)
        section_df.columns = block.iloc[0].to_list()

        # Drop columns with NaN names, columns that are all NaN, and empty rows
        section_df = section_df.loc[:, section_df.columns.notna()]
        section_df = section_df.dropna(axis=1, how="all").dropna(how="all")

        # Convert year columns from float to int (e.g., 2020.0 -> 2020)
        section_df.columns = [_as_year_label(col) for col in section_df.columns]

        sections[section_name] = section_df
        print(f"  {section_name}: {len(section_df)} rows, {len(section_df.columns)} columns")

    return sections

## Data Loading and Pre-processing

### Load Data from Excel Workbooks
**Forestry MMFT Data**

In [None]:
# Read every forestry sheet from the workbook, keyed by table name
forestry_workbook = INPUT_DATA_DIR / "D3.3-Forestry_MMFT_VZZ.xlsx"
forest_data: dict[str, pd.DataFrame] = {}

for table_name, sheet_name in FORESTRY_MMFT_SHEET_NAMES.items():
    # header=2: the third row of the sheet supplies the initial column labels
    sheet_df = pd.read_excel(forestry_workbook, sheet_name=sheet_name, header=2)
    forest_data[table_name] = sheet_df
    print(f"Loaded {sheet_name}: {len(sheet_df)} rows")

Loaded Model - Baseline (VZZ): 1627 rows
Loaded Scenario - Reimagining Progress: 1627 rows
Loaded Scenario - Fractured Continent: 1627 rows
Loaded Scenario - Corporate Epoch: 1627 rows


**Agriculture MMFT Data**

In [None]:
# Read every agriculture sheet from the workbook, keyed by table name.
agriculture_data: dict[str, pd.DataFrame] = {}

for table_name, sheet_name in AGRICULTURE_MMFT_SHEET_NAMES.items():
    # header=None keeps every sheet row as data. Any fixed header offset drops the
    # rows at and above it, and the scenario sheets were losing their first section
    # (market_potential) -- presumably because its marker row sits at or above that
    # offset in those sheets; verify against the workbook. split_by_sections
    # locates the section markers and header rows itself, so no row needs to be
    # consumed here.
    agriculture_data[table_name] = pd.read_excel(
        INPUT_DATA_DIR / "D3.3-Agriculture_MMFT_VZZ.xlsx", sheet_name=sheet_name, header=None
    )
    print(f"Loaded {sheet_name}: {len(agriculture_data[table_name])} rows")

Loaded Model - Baseline: 4781 rows
Loaded Scenario - Reimagining Progress: 4779 rows
Loaded Scenario - Fractured Continent: 4779 rows
Loaded Scenario - Corporate Epoch: 4779 rows


### Clean and Transform Data
**Forestry MMFT Data**

In [None]:
# Split each forestry scenario into its sections, tag every row with the section
# it came from ("Indicator" column) and write one CSV per scenario.
OUTPUT_DATA_DIR.mkdir(parents=True, exist_ok=True)
for scenario, df in forest_data.items():
    print(f"Splitting forestry {scenario} data by sections:")
    forest_sections = split_by_sections(df)

    # Fail loudly: without this check a scenario with no sections would reuse the
    # frame left over from the previous scenario and write it under the wrong name.
    if not forest_sections:
        raise ValueError(f"No sections found in forestry {scenario} data")

    # Stack all sections in one concat; assign() leaves the section frames untouched
    forest_combined = pd.concat(
        [
            section_df.assign(Indicator=section_name)
            for section_name, section_df in forest_sections.items()
        ],
        ignore_index=True,
    )

    forest_combined.to_csv(OUTPUT_DATA_DIR / f"forestry_{scenario}.csv", index=False)

Splitting forestry baseline data by sections:
  Found 7 section(s): ['market_potential', 'addressable_market', 'penetration', 'shipments', 'installed_base', 'prices', 'revenues']
  market_potential: 238 rows, 29 columns
  addressable_market: 238 rows, 29 columns
  penetration: 204 rows, 29 columns
  shipments: 238 rows, 29 columns
  installed_base: 204 rows, 29 columns
  prices: 238 rows, 29 columns
  revenues: 238 rows, 29 columns
Splitting forestry reimagining_progress data by sections:
  Found 7 section(s): ['market_potential', 'addressable_market', 'penetration', 'shipments', 'installed_base', 'prices', 'revenues']
  market_potential: 238 rows, 29 columns
  addressable_market: 238 rows, 29 columns
  penetration: 204 rows, 29 columns
  shipments: 238 rows, 29 columns
  installed_base: 204 rows, 29 columns
  prices: 238 rows, 29 columns
  revenues: 238 rows, 29 columns
Splitting forestry fractured_continent data by sections:
  Found 7 section(s): ['market_potential', 'addressable_mar

**Agriculture MMFT Data**

In [None]:
# Split each agriculture scenario into its sections, tag every row with the section
# it came from ("Indicator" column) and write one CSV per scenario.
# Create the output directory here as well so this cell can run on its own.
OUTPUT_DATA_DIR.mkdir(parents=True, exist_ok=True)
for scenario, df in agriculture_data.items():
    print(f"Splitting agriculture {scenario} data by sections:")
    agriculture_sections = split_by_sections(df)

    # Fail loudly: without this check a scenario with no sections would reuse the
    # frame left over from the previous scenario and write it under the wrong name.
    if not agriculture_sections:
        raise ValueError(f"No sections found in agriculture {scenario} data")

    # Stack all sections in one concat; assign() leaves the section frames untouched
    agriculture_combined = pd.concat(
        [
            section_df.assign(Indicator=section_name)
            for section_name, section_df in agriculture_sections.items()
        ],
        ignore_index=True,
    )

    agriculture_combined.to_csv(OUTPUT_DATA_DIR / f"agriculture_{scenario}.csv", index=False)

Splitting agriculture baseline data by sections:
  Found 7 section(s): ['market_potential', 'addressable_market', 'penetration', 'shipments', 'installed_base', 'prices', 'revenues']
  market_potential: 646 rows, 28 columns
  addressable_market: 646 rows, 27 columns
  penetration: 646 rows, 27 columns
  shipments: 714 rows, 27 columns
  installed_base: 714 rows, 27 columns
  prices: 646 rows, 26 columns
  revenues: 714 rows, 27 columns
Splitting agriculture reimagining_progress data by sections:
  Found 6 section(s): ['addressable_market', 'penetration', 'shipments', 'installed_base', 'prices', 'revenues']
  addressable_market: 646 rows, 27 columns
  penetration: 646 rows, 27 columns
  shipments: 714 rows, 27 columns
  installed_base: 714 rows, 27 columns
  prices: 646 rows, 26 columns
  revenues: 714 rows, 27 columns
Splitting agriculture fractured_continent data by sections:
  Found 6 section(s): ['addressable_market', 'penetration', 'shipments', 'installed_base', 'prices', 'revenues'