In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
# Read one layer of a file geodatabase into a GeoDataFrame.
# NOTE(review): file_name is an absolute, user-specific Windows path, so this
# cell only runs on a machine with that exact folder layout — consider a
# project-relative or configurable path.
file_name = r"C:\Users\rhamilton\LiDAR\working\[06]-gdb\nb.gdb"
layer = "nb_south_train_subset"
gdf = gpd.read_file(file_name, layer=layer, driver="FileGDB")

In [3]:
# Preview the first rows of the loaded layer.
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
0,Open water,121,POINT (2282120.928 1090781.570)
1,Open water,121,POINT (2280032.116 1094585.955)
2,Open water,121,POINT (2262984.057 1088554.152)
3,Open water,121,POINT (2203301.426 1138927.607)
4,Open water,121,POINT (2271986.798 1080146.534)


In [4]:
# Class names that are treated as wetland; rows of any other class are dropped.
wetland_classes = [
    "Shallow water",
    "Swamp",
    "Fen",
    "Marsh",
    "Bog",
    "Salt marsh",
]

# Keep only the wetland rows, ordered by class name. The filter and the sort are
# chained into one new frame instead of sorting the filtered slice in place:
# in-place mutation of a boolean-indexed result can trigger pandas'
# SettingWithCopyWarning and makes the cell harder to reason about on re-run.
gdf = (
    gdf[gdf["class_name"].isin(wetland_classes)]
    .sort_values(by="class_name")
)
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
1991,Bog,121,POINT (2331726.572 1145821.905)
4595,Bog,122,POINT (2246024.863 1190900.611)
4594,Bog,122,POINT (2269285.217 1222886.568)
4593,Bog,122,POINT (2242953.246 1093873.249)
4592,Bog,122,POINT (2291245.763 1178796.549)


In [5]:
# Count the rows per class_name value; freq is used below to pick the
# per-class sample size.
freq = gdf['class_name'].value_counts()
freq

class_name
Fen              1227
Shallow water    1177
Bog              1103
Swamp            1074
Salt marsh        969
Marsh             964
Name: count, dtype: int64

In [6]:
# Down-sample every class to a common size so the classes are balanced.

# Fixed seed so the same rows are drawn every time the notebook is re-run.
SAMPLE_SEED = 42

# Target size: the smallest class count rounded DOWN to a multiple of 10.
# round() could round up past the smallest class (e.g. 966 -> 970), which would
# leave that class smaller than the others. If the smallest class has fewer
# than 10 rows the floor would be 0, so fall back to the class size itself.
smallest = int(freq.min())
n = (smallest // 10) * 10 or smallest

dfs = []
for _, df in gdf.groupby("class_name"):
    if len(df) > n:
        print("DF is greater than n")
        dfs.append(df.sample(n, random_state=SAMPLE_SEED))
    else:
        # Already at the target size: keep the group as it is.
        dfs.append(df)

# Stack the per-class samples back into one frame with a fresh 0..N-1 index.
gdf = pd.concat(dfs, ignore_index=True)

DF is greater than n
DF is greater than n
DF is greater than n
DF is greater than n
DF is greater than n
DF is greater than n


In [7]:
# Check the per-class row counts after down-sampling.
gdf["class_name"].value_counts()

class_name
Bog              960
Fen              960
Marsh            960
Salt marsh       960
Shallow water    960
Swamp            960
Name: count, dtype: int64

In [8]:
# Encode the class names as integer codes 1..K. The labels are sorted
# explicitly so each class always receives the same code, independent of the
# row order gdf happens to have at this point.
labels = sorted(gdf["class_name"].unique().tolist())
values = {label: code for code, label in enumerate(labels, start=1)}

# NOTE: class_name is overwritten with the codes, so this cell must not be
# re-run on its own — a second run would build `values` from the codes
# instead of the names. Re-run the cells above first.
gdf["class_name"] = gdf["class_name"].map(values)
gdf = gdf.sort_values(by="class_name", ignore_index=True)
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
0,1,121,POINT (2229765.050 1084580.078)
1,1,122,POINT (2254931.922 1094552.299)
2,1,123,POINT (2331729.216 1114653.960)
3,1,121,POINT (2211940.406 1113782.203)
4,1,121,POINT (2350909.642 1167005.254)


In [9]:
import numpy as np

# Attach one uniform random number in [low, high) to every row.
# presumably used downstream to split the points (e.g. train/test) — TODO confirm

# Fixed seed so the exported `random` values are identical on every re-run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

n = len(gdf)
low = 0
high = 1

uniform_dist = rng.uniform(low, high, n)
gdf = gdf.assign(random=uniform_dist)

In [10]:
# Reduce the frame to the columns that go into the exported shapefile
# (the file itself is written in the next cell).
export_columns = ["class_name", "random", "geometry"]
gdf = gdf[export_columns]
gdf

Unnamed: 0,class_name,random,geometry
0,1,0.503604,POINT (2229765.050 1084580.078)
1,1,0.645717,POINT (2254931.922 1094552.299)
2,1,0.610718,POINT (2331729.216 1114653.960)
3,1,0.337440,POINT (2211940.406 1113782.203)
4,1,0.663592,POINT (2350909.642 1167005.254)
...,...,...,...
5755,6,0.648462,POINT (2271741.395 1108001.024)
5756,6,0.926769,POINT (2326685.150 1175950.219)
5757,6,0.093739,POINT (2351373.483 1156382.254)
5758,6,0.583453,POINT (2334286.214 1178897.105)


In [11]:
from pathlib import Path

# Interim outputs go to <parent of the working directory>/data/interim.
interm = Path(".").absolute().parent / "data/interim"
# Create the folder when it is missing (e.g. on a fresh checkout) so the write
# below does not depend on the folder already existing.
interm.mkdir(parents=True, exist_ok=True)

out_file_name = interm / "nb_south_wetland_only_class_bal.shp"

# Write the balanced, encoded points as an ESRI Shapefile.
gdf.to_file(out_file_name, driver="ESRI Shapefile")

In [12]:
from zipfile import ZipFile


# The archive goes to <parent of the working directory>/data/processed.
processed = Path(".").absolute().parent / "data/processed"
# Create the folder when it is missing so opening the archive for writing
# does not fail on a fresh checkout.
processed.mkdir(parents=True, exist_ok=True)
zip_file_name = processed / "wetland_only.zip"

# Collect every file sharing the shapefile's stem in the interim folder.
# Sorted so the archive's member order does not depend on the filesystem's
# listing order.
files = sorted(interm.glob("nb_south_wetland_only_class_bal.*"))


with ZipFile(zip_file_name, "w") as zipf:
    for file in files:
        # arcname drops the directory part so members sit at the archive root.
        zipf.write(file, arcname=file.name)

In [13]:
# Display the class-name -> integer-code mapping that was applied to class_name.
values

{'Bog': 1,
 'Fen': 2,
 'Marsh': 3,
 'Salt marsh': 4,
 'Shallow water': 5,
 'Swamp': 6}

In [15]:
import json

# Save the class-name -> integer-code lookup as 4-space-indented JSON.
encoded_values = json.dumps(values, indent=4)
with open("../references/wetlands_only.json", 'w') as jfh:
    jfh.write(encoded_values)