# Validate some predictions

In [1]:
import geopandas as gpd
import pandas as pd
import os.path
from typing import List, Optional, Dict

from veg2hab import validation



## Duurswold

### Inlezen van de data

Jammer genoeg bevat de Duurswold-habitatkartering geen percentages.

In [2]:
# Read the Duurswold 2013 shapefile; this frame is used further down as the
# prediction side of the comparison. Preview the first two rows.
gdf_pred = gpd.read_file('../testing/GR/SBB Duurswold2013.shp')
gdf_pred.head(2)

Unnamed: 0,Area,Opm,Datum,_ChkNodig,Habtype1,Perc1,Opp1,Kwal1,Opm1,VvN1,...,Kwal4,Opm4,VvN4,SBB4,_Status4,_VvNdftbl4,_SBBdftbl4,_VgTypInf4,_ChkNodig4,geometry
0,2441.121497,,20130820,1,HXXXX,60,146467.28982207918,Onbekend,Er zijn habitatvoorstellen met mozaiekregels: ...,"['36aa2a', '36aa2a']",...,,,,,,,,,,"POLYGON ((249507.649 580089.327, 249505.080 58..."
1,10237.326199,,20130619,0,H0000,100,1023732.619868842,,,,...,,,,,,,,,,"POLYGON ((244969.210 578456.910, 244953.450 57..."


In [3]:
# Frequency of each habitat type in the first habitat-type column of the predictions.
gdf_pred["Habtype1"].value_counts()

H0000      366
HXXXX      212
H7140_B     28
Name: Habtype1, dtype: int64

In [4]:
# Directory that holds the reference habitat maps. The default is the location
# used when this notebook was first written; set the VEG2HAB_HABITAT_MAPS_DIR
# environment variable to point at your own copy of the data instead of
# editing the notebook.
habitat_maps_dir = os.environ.get(
    "VEG2HAB_HABITAT_MAPS_DIR",
    "/mnt/c/Users/MarkBoer/OneDrive - Spheer AI/General/Projecten/Veg_2_Hab/Data/Habitatkarteringen/Habitattypekaarten Gr",
)

# Read the Duurswold reference map; this frame is used further down as the
# ground-truth side of the comparison. Preview the first two rows.
gdf_true = gpd.read_file(os.path.join(habitat_maps_dir, "NaamGebied_Duurswold.gpkg"))
gdf_true.head(2)

Unnamed: 0,NaamGebied,Habitattyp,Jaar,Deelgebied,StikstofGe,Habtype1,Habtype2,Habtype3,Shape_Leng,Shape_Area,Opp_ha,geometry
0,Duurswold,,2002,Duurswold,,,,,627.054446,6815.521377,0.681552,"MULTIPOLYGON (((251739.828 590172.070, 251758...."
1,Duurswold,H0000,2002,Duurswold,,H0000,,,256.228618,1391.099668,0.13911,"MULTIPOLYGON (((251664.257 589870.352, 251660...."


In [5]:
# Roughly 4% of the rows list more than one habitat type (Habtype2 is filled in).
gdf_true["Habtype2"].notna().mean()

0.04020100502512563

In [6]:
# Print the frequency table of the first and of the second habitat-type column.
for habtype_col in ("Habtype1", "Habtype2"):
    print(gdf_true[habtype_col].value_counts())

H0000      176
H7140_B     19
Name: Habtype1, dtype: int64
H7140_B    5
H0000      3
Name: Habtype2, dtype: int64


### Convert to a single-column dataset

In [7]:
# Parse the per-column habitat types/percentages of the predictions with the
# function's default settings.
# NOTE(review): this overwrites gdf_pred with the parsed result, so re-running
# the cell feeds already-parsed data back in — confirm that is safe, or bind
# the result to a new name.
gdf_pred = validation.parse_habitat_percentages(gdf_pred)



In [8]:

# The reference map has no percentage columns, so none are passed and the
# "select_first" strategy is used for the missing percentages.
gdf_true = validation.parse_habitat_percentages(
    gdf_true,
    percentage_cols=None,
    how_to_handle_missing_percentages="select_first",
)



### Combine datasets

In [9]:
# Combine predictions and reference data geometrically, using the
# "intersection" mode of the project's spatial join.
gdf_combined = validation.spatial_join(gdf_pred, gdf_true, how="intersection")



In [10]:
# Compute correctness per row of the joined frame; the preview shows the
# resulting percentage_correct and oppervlakte_correct columns.
output_gdf = validation.voeg_correctheid_toe_aan_df(gdf_combined)
output_gdf.head(2)

Unnamed: 0,pred_hab_perc,true_hab_perc,geometry,percentage_correct,oppervlakte_correct
0,{'HXXXX': 100},{'H0000': 100},"MULTIPOLYGON (((249073.659 587219.509, 249067....",0.0,0.0
1,"{'HXXXX': 10.0, 'H0000': 10.0}",{'H0000': 100},"MULTIPOLYGON (((249243.436 587268.853, 249242....",10.0,3276.783421


In [11]:

# Write the correctness result to a GeoPackage file in ../testing.
output_gdf.to_file("../testing/correctheid_percentage_intersection.gpkg", driver="GPKG")

#### Create confusion matrix

In [12]:
# Build the full confusion matrix (pred_hab x true_hab) from the joined frame
# with method="area".
# NOTE(review): the unit of the matrix cells is not visible here — confirm.
conf_matrix = validation.bereken_volledige_conf_matrix(gdf_combined, method="area")

In [14]:
# Display the confusion matrix that was already computed and stored in
# conf_matrix, rather than recomputing it with identical arguments.
conf_matrix

true_hab,H0000,H7140_B,HXXXX
pred_hab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
H0000,6.987196,0.298377,0.0
H7140_B,2.632198,1.383328,0.0
HXXXX,6.647128,2.31861,0.0
