# Validation Error Analysis - Bridge Failure Prediction

In [1]:
import pandas as pd
from src.validation.data_validation import DataValidator, DataValidationError
# Load the processed feature table that the validation cells below inspect.
# NOTE(review): no parse_dates is passed here, so date-like columns are
# presumably read as plain strings — confirm the validator expects that.
df = pd.read_csv(filepath_or_buffer='../data/processed/features.csv')

## Schema and Required Columns

In [2]:
# Expected dtype string for each column, keyed by column name.
# This mapping is handed to DataValidator and iterated in declaration order
# by the per-column diagnostics, so the key order is kept as-is.
schema = dict(
    structure_id="object",
    inspection_date="datetime64[ns]",
    last_maintenance_date="datetime64[ns]",
    avg_daily_traffic="float64",
    bridge_condition="object",
    failure_within_1yr="int64",
    latitude="float64",
    longitude="float64",
    precipitation="float64",
    avg_temp="float64",
    corrosion_level="float64",
    previous_failures="int64",
    soil_type="object",
    region_code="object",
)

## Find and Analyze Validation Errors

In [3]:
# Validate the loaded frame against the schema: first that the required
# columns exist, then that their types validate. On failure, print a
# column-by-column diagnosis instead of stopping the notebook.
validator = DataValidator(schema=schema)

# Reset up front so a failed run never leaves a stale df_ok behind from an
# earlier execution of this cell in the same kernel.
df_ok = None

try:
    validator.validate_columns(df)
    df_ok = validator.validate_types(df)
    print('Validation PASSED')
except DataValidationError as e:
    print('Validation FAILED:', e)
    # Find which columns are responsible. Each column is checked on its own
    # so that one bad column does not hide problems in the others.
    for col, dtype in schema.items():
        # A missing column is a different problem from a bad dtype; report
        # it as such rather than letting the lookup's KeyError surface as a
        # misleading "type error".
        if col not in df.columns:
            print(f'Column {col} is missing from the data')
            continue
        try:
            # The cast result is discarded; only whether it raises matters.
            df[col].astype(dtype)
        except Exception as ex:
            # Deliberately broad: this is best-effort diagnostics and any
            # conversion failure should be printed, not abort the loop.
            print(f'Column {col} type error: {ex}')

## Nulls and Duplicates

In [4]:
# Count missing values per column and show only the columns that have any.
nulls = df.isna().sum()
has_missing = nulls > 0
print('Nulls per column:', nulls[has_missing], sep='\n')

# Flag rows whose (structure_id, inspection_date) pair already appeared
# earlier in the frame; the first occurrence of each pair is not flagged.
key_columns = ['structure_id', 'inspection_date']
dupes = df.duplicated(subset=key_columns)
print('Duplicate rows:', dupes.sum())