# Validate some predictions

In [15]:
import geopandas as gpd
import pandas as pd
import os.path
from typing import List, Optional, Dict

from veg2hab import validation

## Zuidlaardermeer

### Inlezen van de data

Jammer genoeg bevat de Duurswold habitatkartering geen percentages.

In [16]:
# Read the shapefile from the repository's testing folder and preview the
# first two rows to check which columns are available.
gdf_pred_raw = gpd.read_file(
    "../testing/GR/SGL Zuidlaardermeer 2019.shp"
)
gdf_pred_raw.head(2)

Unnamed: 0,Area,Opm,Datum,_ChkNodig,Habtype1,Perc1,Opp1,Kwal1,Opm1,VvN1,...,Kwal9,Opm9,VvN9,SBB9,_Status9,_VvNdftbl9,_SBBdftbl9,_VgTypInf9,_ChkNodig9,geometry
0,3692.336448,nvt,20190715.0,1,HXXXX,19,70154.39251248693,Onbekend,Er zijn mitsen met nog niet geimplementeerde c...,8bc2a,...,,,,,,,,,,"POLYGON ((242077.412 573207.577, 242082.572 57..."
1,1238.369197,nvt,20190715.0,0,H0000,70,86685.84377139427,,,,...,,,,,,,,,,"POLYGON ((243824.566 573054.300, 243860.398 57..."


In [17]:
# Frequency of each value in the first habitat-type column of the predictions.
gdf_pred_raw["Habtype1"].value_counts()

HXXXX      635
H0000      133
H7140_B     15
Name: Habtype1, dtype: int64

In [18]:
# Directory that holds the reference habitat maps. The original location is an
# absolute path on one developer's machine, so it can be overridden with the
# VEG2HAB_HABITAT_DIR environment variable; when the variable is unset the
# original location is used, which keeps the notebook backward-compatible.
habitat_dir = os.environ.get(
    "VEG2HAB_HABITAT_DIR",
    "/mnt/c/Users/MarkBoer/OneDrive - Spheer AI/General/Projecten/Veg_2_Hab/Data/Habitatkarteringen/Habitattypekaarten Gr",
)

# Read the reference GeoPackage and preview the first two rows.
gdf_true_raw = gpd.read_file(
    os.path.join(habitat_dir, "NaamGebied_Oeverlanden Zuidlaardermeergebied.gpkg")
)
gdf_true_raw.head(2)

Unnamed: 0,NaamGebied,Habitattyp,Jaar,Deelgebied,StikstofGe,Habtype1,Habtype2,Habtype3,Shape_Leng,Shape_Area,Opp_ha,geometry
0,Oeverlanden Zuidlaardermeergebied,H0000,2019,Groningen : Kruishammen tot Meerwijck,,H0000,,,411.040756,3692.339806,0.369234,"MULTIPOLYGON (((242095.272 573036.126, 242086...."
1,Oeverlanden Zuidlaardermeergebied,H0000,2019,Groningen: Leinwijk,,H0000,,,491.252931,1238.37068,0.123837,"MULTIPOLYGON (((243824.566 573054.300, 243860...."


In [19]:
# Fraction of rows with a value in Habtype2: less than 1% of the data
# contains more than one habitat type.
gdf_true_raw["Habtype2"].notna().mean()

0.007662835249042145

In [20]:
# Print the value counts of the first and second habitat-type columns,
# one table after the other.
print(
    gdf_true_raw["Habtype1"].value_counts(),
    gdf_true_raw["Habtype2"].value_counts(),
    sep="\n",
)

H0000      771
H7140_B     12
Name: Habtype1, dtype: int64
H0000      5
H7140_B    1
Name: Habtype2, dtype: int64


### Convert to a single-column dataset

In [22]:
# Parse the raw prediction frame with the project helper, using its default arguments.
# NOTE(review): presumably this collapses the Habtype*/Perc* columns into a single
# habitat -> percentage column (cf. `pred_hab_perc` in the output further down);
# confirm against veg2hab.validation.
gdf_pred = validation.parse_habitat_percentages(gdf_pred_raw)

In [23]:
# The reference map only has Habtype1..3 columns and no percentage columns, so
# no percentage columns are passed and the helper is asked to use its
# "split_equally" strategy for the missing percentages.
gdf_true = validation.parse_habitat_percentages(
    gdf_true_raw,
    percentage_cols=None,
    how_to_handle_missing_percentages="split_equally",
)

### Combine datasets

In [24]:
# Combine the parsed prediction and reference frames with the project's spatial join.
# NOTE(review): how="intersection" presumably keeps only the overlapping geometry of
# both layers; confirm against veg2hab.validation.spatial_join.
gdf_combined = validation.spatial_join(gdf_pred, gdf_true, how="intersection")



In [25]:
# Add correctness information to the combined frame; the preview below shows the
# resulting `percentage_correct` and `oppervlakte_correct` columns.
output_gdf = validation.voeg_correctheid_toe_aan_df(gdf_combined)
output_gdf.head(2)

Unnamed: 0,pred_hab_perc,true_hab_perc,geometry,percentage_correct,oppervlakte_correct
0,"{'HXXXX': 48.0, 'H0000': 52.0}",{'H0000': 100.0},"POLYGON ((242082.572 573196.861, 242081.572 57...",52.0,192001.37087
9,{'HXXXX': 100},{'H0000': 100.0},"POLYGON ((242048.039 573126.390, 242047.638 57...",0.0,0.0


In [26]:
# Write the frame with the correctness columns to a GeoPackage in the testing folder.
output_gdf.to_file("../testing/correctheid_percentage_intersection.gpkg", driver="GPKG")

#### Create confusion matrix

In [30]:
# Confusion matrix over the combined frame using method="area"; in the rendered
# result, rows are predicted habitat types and columns are reference habitat types.
# NOTE(review): the unit of the area values is not visible here; confirm in veg2hab.validation.
validation.bereken_volledige_conf_matrix(gdf_combined, method="area")

true_hab,H0000,H7140_B,HXXXX
pred_hab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
H0000,123.611986,0.035147,0.0
H7140_B,0.314803,3.464846,0.0
HXXXX,686.294541,0.27511,0.0


In [29]:
# Confusion matrix over the combined frame using method="percentage"; in the rendered
# result, rows are predicted habitat types and columns are reference habitat types.
validation.bereken_volledige_conf_matrix(gdf_combined, method="percentage")

true_hab,H0000,H7140_B,HXXXX
pred_hab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
H0000,318.05,0.14,0.0
H7140_B,1.19,8.8,0.0
HXXXX,453.76,1.06,0.0
