In [9]:
import pandas as pd

from veg2hab.vegetatietypen import VvN, SBB, rVvN

# Load only the rVvN, VvN and SBB-code columns from the was-wordt workbook,
# then shorten the "SBB-code" header to "SBB".
wwl = pd.read_excel(
    io="../data/5. Was-wordt-lijst-vegetatietypen-en-habitattypen-09-02-2021.xlsx",
    usecols=["rVvN", "VvN", "SBB-code"],
    engine="openpyxl",
)
wwl = wwl.rename(columns={"SBB-code": "SBB"})

  warn("""Cannot parse header or footer so it will be ignored""")


In [10]:
# Cleanup
def opschonen_waswordtlijst(wwl: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the was-wordt list.

    The steps, in order:

    1. Rename the "SBB-code" column to "SBB" (a no-op if it is already renamed).
    2. Drop rows in which every column is empty.
    3. Split cells holding several comma-separated VvN codes into one row per code.
    4. Replace empty / whitespace-only cells by None.
    5. Run the VvN, SBB and rVvN columns through their class's ``opschonen_series``.
    6. Assert that the SBB and VvN columns pass ``validate_pandas_series``.

    Args:
        wwl: Table with at least the columns "rVvN", "VvN" and "SBB"
            (or "SBB-code").

    Returns:
        The cleaned table. Rows produced by splitting one multi-code VvN cell
        keep the index label of the row they came from, so labels can repeat.

    Raises:
        AssertionError: If the SBB or VvN validation reports a failure.
    """
    opgeschoond = (
        wwl.rename(columns={"SBB-code": "SBB"})
        .dropna(how="all")
        # One list of codes per cell, so explode can give each code its own row.
        .assign(VvN=lambda tabel: tabel["VvN"].str.split(","))
        .explode("VvN")
        # Turn whitespace-only fields into None.
        .replace(r"^\s*$", None, regex=True)
    )

    # Normalise each code column with the cleanup routine of its own type.
    for kolom, vegetatietype in (("VvN", VvN), ("SBB", SBB), ("rVvN", rVvN)):
        opgeschoond[kolom] = vegetatietype.opschonen_series(opgeschoond[kolom])

    # Validate; print_invalid=True is passed so the validator can report bad codes.
    for vegetatietype, kolom, foutmelding in (
        (SBB, "SBB", "Niet alle SBB codes zijn valid"),
        (VvN, "VvN", "Niet alle VvN codes zijn valid"),
    ):
        assert vegetatietype.validate_pandas_series(
            opgeschoond[kolom], print_invalid=True
        ), foutmelding

    # TODO (original author): the rVvN handling is new and still has to be given
    # a logical place in this function. Steps that were left disabled:
    #   - convert every NaN to None: wwl.where(pd.notnull(wwl), None)
    #   - set the rVvN values "Niet overgenomen in Revisie" and
    #     "Niet overgenomen in Revisie (grasland-deel)" to None
    #   - correct the rVvN value "r43A0A1B" to "r43AA1B"

    return opgeschoond

# Overwrite the raw table with its cleaned version. Note that re-running this
# cell passes the already-cleaned frame through the cleanup again.
wwl = opschonen_waswordtlijst(wwl)

In [11]:
# Flag rows whose rVvN code is exactly "r43a0a1b".
# NOTE(review): presumably the lower-cased form of the "r43A0A1B" value that the
# disabled cleanup code wants to correct — confirm.
wwl["rVvN"].eq("r43a0a1b")

0       False
1       False
2       False
3       False
4       False
        ...  
1235    False
1236    False
1237    False
1238    False
1239    False
Name: rVvN, Length: 1256, dtype: bool

In [12]:
# Display the cleaned was-wordt table for a visual check.
wwl

Unnamed: 0,rVvN,VvN,SBB
0,,14rg10,14-l
1,,,7-a
2,,,8b-b
3,,,11/c
4,,,11a-b
...,...,...,...
1235,r46rg8,,43-d
1236,r46rg9,,42-d
1237,r46rg10,,
1238,r46rg11,,43c1j


In [13]:
# Open question: does an rVvN code have to map to both SBB and VvN?
# Count the rows that have an rVvN code, grouped by which of the other two
# code columns are filled in.
_heeft_rvvn = wwl["rVvN"].notna()
_heeft_vvn = wwl["VvN"].notna()
_heeft_sbb = wwl["SBB"].notna()

for _omschrijving, _selectie in (
    ("Aantal rVvN naar enkel SBB", _heeft_rvvn & ~_heeft_vvn & _heeft_sbb),
    ("Aantal rVvN naar enkel VvN", _heeft_rvvn & ~_heeft_sbb & _heeft_vvn),
    ("Aantal rVvN naar zowel SBB als VvN", _heeft_rvvn & _heeft_vvn & _heeft_sbb),
    ("Aantal rVvN naar geen van beide", _heeft_rvvn & ~_heeft_vvn & ~_heeft_sbb),
):
    print(_omschrijving)
    print(len(wwl[_selectie]))

Aantal rVvN naar enkel SBB
110
Aantal rVvN naar enkel VvN
196
Aantal rVvN naar zowel SBB als VvN
771
Aantal rVvN naar geen van beide
157


In [14]:
# Split the rows that have an rVvN code into those that also carry a VvN
# and/or SBB code and those that carry neither.
_heeft_rvvn = wwl["rVvN"].notna()
_heeft_andere_code = wwl["VvN"].notna() | wwl["SBB"].notna()

rijen_met_rVvN_en_iets_anders = wwl[_heeft_rvvn & _heeft_andere_code]
rijen_met_enkel_rVvN = wwl[_heeft_rvvn & ~_heeft_andere_code]

print(rijen_met_enkel_rVvN["rVvN"].unique())

# Show the rVvN-only rows whose rVvN code also occurs in a row that does have
# a VvN and/or SBB code.
_komt_ook_elders_voor = rijen_met_enkel_rVvN["rVvN"].isin(
    rijen_met_rVvN_en_iets_anders["rVvN"]
)
rijen_met_enkel_rVvN[_komt_ook_elders_voor]

['r1rg3' 'r2' 'r4rg3' 'r5ba4' 'r5dg1' 'r5rg9' 'r5rg13' 'r5rg17' 'r6rg1'
 'r6rg4' 'r6rg5' 'r7ab' 'r7ab1' 'r8rg1' 'r8rg11' 'r8rg17' 'r8rg19'
 'r8rg20' 'r8rg21' 'r9ba6' 'r9rg9' 'r9rg11' 'r9rg12' 'r9rg14' 'r10' 'r11'
 'r12' 'r12aa1e' 'r12rg3' 'r12rg10' 'r14' 'r14rg11' 'r14rg12' 'r14rg14'
 'r14rg21' 'r15' 'r16dg1' 'r16rg3' 'r16rg4' 'r16rg6' 'r16rg14' 'r16rg20'
 'r16rg21' 'r16rg22' 'r16rg24' 'r16rg26' 'r17' 'r18' 'r18aa3' 'r18aa3a'
 'r18aa3b' 'r18ab' 'r18ab1' 'r18ac' 'r18ac1' 'r18ac2' 'r18rg4' 'r18rg5'
 'r18rg6' 'r19' 'r19aa1a' 'r19aa1b' 'r19aa5b' 'r20' 'r20dg1' 'r21'
 'r21rg5' 'r22' 'r22a' 'r22aa' 'r22aa1' 'r22rg1' 'r23' 'r24' 'r24rg3'
 'r25' 'r26' 'r26aa3' 'r27' 'r27rg5' 'r27rg6' 'r27rg7' 'r28' 'r29'
 'r29rg1' 'r30' 'r30rg1' 'r30rg2' 'r30rg3' 'r30rg6' 'r30rg7' 'r31' 'r31ab'
 'r31b' 'r31ba' 'r31rg7' 'r31rg8' 'r31rg9' 'r32' 'r32rg3' 'r32rg9'
 'r32rg10' 'r33' 'r33dg2' 'r34' 'r34b' 'r34ba' 'r34ba1' 'r34dg3' 'r35'
 'r36' 'r36aa2' 'r36aa4' 'r36aa4a' 'r36aa4b' 'r37' 'r37a' 'r37aa' 'r37aa2'
 'r37a

Unnamed: 0,rVvN,VvN,SBB
656,r19aa1a,,


In [15]:
# Run the rVvN column through rVvN.opschonen_series once more (the cleanup
# function already applied it to this column) and display the result.
schoon = rVvN.opschonen_series(wwl["rVvN"])
schoon

0          None
1          None
2          None
3          None
4          None
         ...   
1235     r46rg8
1236     r46rg9
1237    r46rg10
1238    r46rg11
1239       None
Name: rVvN, Length: 1256, dtype: object

In [16]:
# Pass every cleaned rVvN value to rVvN.from_string.
# NOTE(review): schoon contains None entries (see its displayed output) —
# confirm that rVvN.from_string accepts None.
ingeladen = schoon.apply(rVvN.from_string)