In [None]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import sys
import yaml
sys.path.append('../')


from utils.save_gdf_to_gdb import save_gdf_to_gdb

**Validation Sequence:**
1. Geometric validity
2. Administering organization completeness (for COUNTS_TO_MAS records)
3. Agency completeness (for COUNTS_TO_MAS records)
4. Region completeness
5. Vegetation type completeness
6. Ownership completeness
7. ACTIVE status authorization
8. Activity category logic
9. Overall null value thresholds

## Load data sources from config
**Data Sources:**
- Layer: `appended_poly` (polygon features)
- Layer: `appended_point` (point features)
- Layer: `appended_line` (line features)

In [None]:
# Build the config path with os.path.join so it resolves on every OS.
# The previous literal "..\config.yaml" used a Windows-only separator and
# depended on "\c" being passed through as an invalid escape sequence.
with open(os.path.join('..', 'config.yaml'), 'r') as stream:
    config_inputs = yaml.safe_load(stream)

# Location of the file geodatabase holding the appended layers
append_path = config_inputs['appended']['gdb_path']


In [None]:
# Read the three appended layers (polygons, points, lines) from the same
# file geodatabase; each layer is validated by the checks that follow.
enriched_polygons = gpd.read_file(
    append_path,
    layer='appended_poly',
    driver='OpenFileGDB',
)
enriched_points = gpd.read_file(
    append_path,
    layer='appended_point',
    driver='OpenFileGDB',
)
enriched_lines = gpd.read_file(
    append_path,
    layer='appended_line',
    driver='OpenFileGDB',
)

## Geometry Validation Check
This validation function checks that every geometry in the points, lines, and polygons datasets is geometrically valid according to the Simple Features specification. Invalid geometries can include self-intersections, unclosed rings, or other topological errors that could cause issues in downstream spatial analysis.

**Validates:** Points, Lines, and Polygons

**Raises:** AssertionError if any invalid geometries are detected

In [None]:



def check_valid(enriched_points, enriched_lines, enriched_polygons):
    """Assert that every geometry in each layer is valid.

    The layers are checked in the order points, lines, polygons. Each
    layer's ``is_valid`` flags must all be truthy.

    Raises:
        AssertionError: if any layer reports at least one invalid geometry.
    """
    for layer in (enriched_points, enriched_lines, enriched_polygons):
        validity_flags = layer.is_valid
        assert np.all(validity_flags)


In [None]:
# Run the geometry validity check on the loaded layers; an AssertionError here means a layer has invalid geometries.
check_valid(enriched_points, enriched_lines, enriched_polygons)

## Administering Organization Validation
ADMINISTERING_ORG is mandatory when COUNTS_TO_MAS = "YES"

**Validates:** Points, Lines, and Polygons

**Raises:** AssertionError if any COUNTS_TO_MAS records lack an ADMINISTERING_ORG


In [None]:


def check_admin_org_null(enriched_points, enriched_lines, enriched_polygons):
    """Assert that every MAS-counted record names an administering organization.

    For each layer (points, then lines, then polygons) no row may have
    COUNTS_TO_MAS equal to "YES" while ADMINISTERING_ORG is null.

    Raises:
        AssertionError: if any layer contains such a row.
    """
    for layer in (enriched_points, enriched_lines, enriched_polygons):
        counts_to_mas = layer.COUNTS_TO_MAS == "YES"
        org_missing = layer.ADMINISTERING_ORG.isna()
        # A MAS record without an administering org violates the rule
        assert not (counts_to_mas & org_missing).any()

In [None]:
# Run the ADMINISTERING_ORG completeness check for COUNTS_TO_MAS == "YES" records on all three layers.
check_admin_org_null(enriched_points, enriched_lines, enriched_polygons)

## Agency Field Validation
Like the administering organization check, this function enforces that the AGENCY field is populated for all features marked to count toward the Million Acres Strategy (MAS).

AGENCY is mandatory when COUNTS_TO_MAS = "YES"

**Validates:** Points, Lines, and Polygons

**Raises:** AssertionError if any COUNTS_TO_MAS records lack an agency designation

In [None]:

def check_agency_null(enriched_points, enriched_lines, enriched_polygons):
    """Assert that every MAS-counted record has an agency.

    For each layer (points, then lines, then polygons) no row may have
    COUNTS_TO_MAS equal to "YES" while AGENCY is null.

    Raises:
        AssertionError: if any layer contains such a row.
    """
    for layer in (enriched_points, enriched_lines, enriched_polygons):
        counts_to_mas = layer.COUNTS_TO_MAS == "YES"
        agency_missing = layer.AGENCY.isna()
        # A MAS record without an agency violates the rule
        assert not (counts_to_mas & agency_missing).any()

In [None]:
# Run the AGENCY completeness check for COUNTS_TO_MAS == "YES" records on all three layers.
check_agency_null(enriched_points, enriched_lines, enriched_polygons)

##  Region/Vegetation/Ownership Field Validation
Ensures that the Taskforce region, broad vegetation type, and ownership classification are populated for every feature.
This applies to ALL records, not just those marked as COUNTS_TO_MAS.

**Validates:** Points, Lines, and Polygons

**Raises:** AssertionError if any records lack a Taskforce region, broad vegetation type, or ownership assignment

In [None]:
def check_region_null(enriched_points, enriched_lines, enriched_polygons):
    """Assert that REGION is populated on every record of every layer.

    Layers are checked in the order points, lines, polygons.

    Raises:
        AssertionError: if any layer has a null REGION value.
    """
    for layer in (enriched_points, enriched_lines, enriched_polygons):
        assert layer.REGION.notna().all()
    

def check_veg_null(enriched_points, enriched_lines, enriched_polygons):
    """Assert that BROAD_VEGETATION_TYPE is populated on every record of every layer.

    Layers are checked in the order points, lines, polygons.

    Raises:
        AssertionError: if any layer has a null BROAD_VEGETATION_TYPE value.
    """
    for layer in (enriched_points, enriched_lines, enriched_polygons):
        assert layer.BROAD_VEGETATION_TYPE.notna().all()

    
    
def check_ownership_null(enriched_points, enriched_lines, enriched_polygons):
    """Assert that PRIMARY_OWNERSHIP_GROUP is populated on every record of every layer.

    Layers are checked in the order points, lines, polygons.

    Raises:
        AssertionError: if any layer has a null PRIMARY_OWNERSHIP_GROUP value.
    """
    for layer in (enriched_points, enriched_lines, enriched_polygons):
        assert layer.PRIMARY_OWNERSHIP_GROUP.notna().all()

In [None]:
# Every record in every layer must have a REGION value.
check_region_null(enriched_points, enriched_lines, enriched_polygons)

# Every record in every layer must have a BROAD_VEGETATION_TYPE value.
check_veg_null(enriched_points, enriched_lines, enriched_polygons)

In [None]:
# Every record in every layer must have a PRIMARY_OWNERSHIP_GROUP value.
check_ownership_null(enriched_points, enriched_lines, enriched_polygons)

## Active Activity Status Validation
Among records with COUNTS_TO_MAS = "YES", only AGENCY = "CNRA" may have ACTIVITY_STATUS = "ACTIVE".

**Validates:** Points, Lines, and Polygons

**Raises:** AssertionError if non-CNRA agencies have ACTIVE status

In [None]:
def check_active_status(enriched_points, enriched_lines, enriched_polygons):
    """Assert that only CNRA has ACTIVE records among MAS-counted records.

    For each layer (points, lines, polygons) the rows with
    COUNTS_TO_MAS == 'YES' and ACTIVITY_STATUS == 'ACTIVE' are selected and
    the distinct AGENCY values of that selection must be a subset of
    {'CNRA'}. A null AGENCY on such a row counts as a violation.

    A layer with no ACTIVE MAS-counted rows passes: the rule forbids
    non-CNRA ACTIVE records, it does not require ACTIVE records to exist.

    Raises:
        AssertionError: if any layer has an ACTIVE MAS-counted record whose
            AGENCY is not 'CNRA'.
    """
    layers = (
        ("points", enriched_points),
        ("lines", enriched_lines),
        ("polygons", enriched_polygons),
    )
    for label, layer in layers:
        active_mas = layer[(layer.COUNTS_TO_MAS == 'YES') & (layer.ACTIVITY_STATUS == 'ACTIVE')]
        # Set difference (rather than `!= 'CNRA'`) so null agencies are
        # reported as violations regardless of the column's dtype.
        unauthorized = set(active_mas.AGENCY.unique()) - {'CNRA'}
        assert not unauthorized, (
            f"{label}: ACTIVE status found for non-CNRA agencies: "
            f"{sorted(map(str, unauthorized))}"
        )
    

In [None]:
# Run the ACTIVE-status authorization check on all three layers.
check_active_status(enriched_points, enriched_lines, enriched_polygons)

## Activity Category Validation
1. **Completeness Check:** Verifies that ACTIVITY_CAT is populated for all features
2. **Logic Check:** Ensures that when ACTIVITY_CAT = "NOT_DEFINED", the corresponding ACTIVITY_DESCRIPTION must be either "TBD" (To Be Determined) or "NOT_DEFINED"

This prevents inconsistent categorization where defined activities are incorrectly marked as NOT_DEFINED, ensuring data quality in activity classification.


**Validates:** Points, Lines, and Polygons

**Raises:** AssertionError if categorization is missing or logically inconsistent

In [None]:

def check_activity_null(enriched_points, enriched_lines, enriched_polygons):
    """Assert that ACTIVITY_CAT is complete and that NOT_DEFINED is used consistently.

    Two rules are enforced on every layer:

    1. ACTIVITY_CAT has no null values.
    2. Rows whose ACTIVITY_CAT is 'NOT_DEFINED' only carry an
       ACTIVITY_DESCRIPTION of 'TBD' or 'NOT_DEFINED'.

    Raises:
        AssertionError: if either rule is violated in any layer.
    """
    allowed_descriptions = {'TBD', 'NOT_DEFINED'}

    # Rule 1: the category must be populated everywhere
    for layer in (enriched_lines, enriched_points, enriched_polygons):
        assert layer.ACTIVITY_CAT.notna().all()

    # Rule 2: NOT_DEFINED categories may only stem from TBD / NOT_DEFINED descriptions
    for layer in (enriched_points, enriched_lines, enriched_polygons):
        undefined_rows = layer[layer.ACTIVITY_CAT == 'NOT_DEFINED']
        found_descriptions = set(undefined_rows.ACTIVITY_DESCRIPTION.unique())
        assert found_descriptions <= allowed_descriptions


In [None]:
# Run the ACTIVITY_CAT completeness and NOT_DEFINED consistency checks on all three layers.
check_activity_null(enriched_points, enriched_lines, enriched_polygons)

## Null Value Threshold Validation
Ensures overall data quality by limiting null values across critical fields.

This function performs a comprehensive data quality check across 13 essential fields, ensuring that, for each field, fewer than 1% (the default threshold) of the records in each layer have a null value. This is a statistical quality control measure that catches systematic data collection or processing issues.

**Monitored Fields:**
- AGENCY
- ADMINISTERING_ORG
- PRIMARY_OWNERSHIP_GROUP
- COUNTY
- REGION
- ACTIVITY_DESCRIPTION
- ACTIVITY_CAT
- BROAD_VEGETATION_TYPE
- ACTIVITY_STATUS
- ACTIVITY_QUANTITY
- ACTIVITY_UOM (Unit of Measure)
- ACTIVITY_END
- Year_txt

**Parameters:**
- `na_thresh`: Maximum allowable null ratio (default: 0.01 or 1%)

**Validates:** Points, Lines, and Polygons

**Raises:** AssertionError if the null ratio of any field reaches or exceeds the threshold

In [None]:
def thresh_null_ratio(enriched_points, enriched_lines, enriched_polygons, na_thresh=0.01):
    """Assert that the null ratio of each monitored field stays below a threshold.

    For every monitored column and every layer, the share of null values
    (null count divided by the layer's record count) must be strictly less
    than ``na_thresh``.

    Args:
        enriched_points: point layer to check.
        enriched_lines: line layer to check.
        enriched_polygons: polygon layer to check.
        na_thresh: exclusive upper bound on the null ratio (default 0.01).

    Raises:
        AssertionError: if any column's null ratio in any layer is greater
            than or equal to ``na_thresh``.
        KeyError: if a monitored column is missing from a layer.
    """
    cols = [
        "AGENCY",
        "ADMINISTERING_ORG",
        "PRIMARY_OWNERSHIP_GROUP",
        "COUNTY",
        "REGION",
        "ACTIVITY_DESCRIPTION",
        "ACTIVITY_CAT",
        "BROAD_VEGETATION_TYPE",
        "ACTIVITY_STATUS",
        "ACTIVITY_QUANTITY",
        "ACTIVITY_UOM",
        "ACTIVITY_END",
        "Year_txt",
    ]
    layers = (
        ("lines", enriched_lines),
        ("points", enriched_points),
        ("polygons", enriched_polygons),
    )

    for col in cols:
        for label, layer in layers:
            # Column lookup happens first so a missing column is still
            # reported even when the layer has no records.
            null_count = int(layer[col].isna().sum())
            record_count = len(layer)
            if record_count == 0:
                # An empty layer has no nulls; skipping avoids a
                # ZeroDivisionError in the ratio below.
                continue
            null_ratio = null_count / record_count
            assert null_ratio < na_thresh, (
                f"{label}.{col}: null ratio {null_ratio:.4f} "
                f"({null_count}/{record_count}) is not below {na_thresh}"
            )



In [None]:
# Run the null-ratio threshold check with the default na_thresh of 0.01.
thresh_null_ratio(enriched_points, enriched_lines, enriched_polygons)