In [16]:
import geopandas as gpd
import pandas as pd

In [17]:
# Input: one layer of a local file geodatabase (.gdb).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this location exists.
layer = "nb_south_train_subset"
file_name = r"C:\Users\rhamilton\LiDAR\working\[06]-gdb\nb.gdb"

# Read the layer into a GeoDataFrame using the FileGDB driver.
gdf = gpd.read_file(
    file_name,
    layer=layer,
    driver="FileGDB",
)

In [18]:
# Preview the first rows of the layer that was just read.
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
0,Open water,121,POINT (2282120.928 1090781.570)
1,Open water,121,POINT (2280032.116 1094585.955)
2,Open water,121,POINT (2262984.057 1088554.152)
3,Open water,121,POINT (2203301.426 1138927.607)
4,Open water,121,POINT (2271986.798 1080146.534)


In [19]:
# Class names that count as wetlands; rows of any other class are dropped.
wetland_classes = ["Shallow water", "Swamp", "Fen", "Marsh", "Bog", "Salt marsh"]

# Keep only the wetland rows and order them by class name.
is_wetland = gdf["class_name"].isin(wetland_classes)
gdf = gdf[is_wetland].sort_values(by="class_name")
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
1991,Bog,121,POINT (2331726.572 1145821.905)
4595,Bog,122,POINT (2246024.863 1190900.611)
4594,Bog,122,POINT (2269285.217 1222886.568)
4593,Bog,122,POINT (2242953.246 1093873.249)
4592,Bog,122,POINT (2291245.763 1178796.549)


In [20]:
# Tally how many rows each wetland class has; value_counts() lists the most
# frequent class first.
class_column = gdf["class_name"]
freq = class_column.value_counts()
freq

class_name
Fen              1227
Shallow water    1177
Bog              1103
Swamp            1074
Salt marsh        969
Marsh             964
Name: count, dtype: int64

In [21]:
# Balance the classes by downsampling every class to a common size `n`.
#
# `n` is the smallest class count rounded DOWN to a multiple of 10. Rounding
# to the nearest 10 can overshoot the smallest class (e.g. 966 -> 970), in
# which case that class is left short and the result is not balanced.
smallest = int(freq.min())
n = (smallest // 10) * 10
if n == 0:
    # Fewer than 10 rows in the smallest class: use its exact size instead of
    # sampling every class down to zero rows.
    n = smallest

# Fixed seed so re-running the notebook draws the same subset each time.
sample_seed = 42

dfs = []
for _, df in gdf.groupby("class_name"):
    if len(df) > n:
        print("DF is greater than n")
        dfs.append(df.sample(n, random_state=sample_seed))
    else:
        # Already at (or below) the target size: keep every row.
        dfs.append(df)

# Stack the per-class samples back into one frame with a fresh 0..N-1 index.
gdf = pd.concat(dfs, ignore_index=True)

DF is greater than n
DF is greater than n
DF is greater than n
DF is greater than n
DF is greater than n
DF is greater than n


In [22]:
# Check the per-class row counts after the balancing step.
gdf["class_name"].value_counts()

class_name
Bog              960
Fen              960
Marsh            960
Salt marsh       960
Shallow water    960
Swamp            960
Name: count, dtype: int64

In [23]:
# Encode the class names as integer codes starting at 1.
# The labels are sorted first so that each name always receives the same code,
# independent of the order in which rows currently appear in `gdf`.
labels = sorted(gdf["class_name"].unique().tolist())
# `values` is the name -> code lookup; it stays available after the names in
# the frame have been replaced.
values = {name: code for code, name in enumerate(labels, start=1)}

# Replace names with codes and order the rows by code. A new frame is assigned
# instead of mutating in place, so the cell has no hidden in-place edits.
gdf = gdf.assign(class_name=gdf["class_name"].map(values)).sort_values(
    by="class_name", ignore_index=True
)
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
0,1,121,POINT (2245243.699 1068153.373)
1,1,123,POINT (2275144.188 1067268.690)
2,1,123,POINT (2286713.736 1071089.231)
3,1,123,POINT (2291317.029 1078779.172)
4,1,123,POINT (2288253.939 1071437.433)


In [24]:
import numpy as np

# Attach one uniform random number in [low, high) to every row as column
# `random`.
n = len(gdf)
low = 0
high = 1

# A seeded generator makes the column identical on every run; the module-level
# np.random.uniform call is unseeded and gives different values each time.
rng = np.random.default_rng(42)
uniform_dist = rng.uniform(low, high, n)
gdf = gdf.assign(random=uniform_dist)

In [25]:
# Reduce the frame to the class code, the random value and the geometry;
# all other columns are dropped here. Nothing is written to disk in this cell.
output_columns = ["class_name", "random", "geometry"]
gdf = gdf[output_columns]
gdf

Unnamed: 0,class_name,random,geometry
0,1,0.544263,POINT (2245243.699 1068153.373)
1,1,0.769209,POINT (2275144.188 1067268.690)
2,1,0.303008,POINT (2286713.736 1071089.231)
3,1,0.251294,POINT (2291317.029 1078779.172)
4,1,0.955502,POINT (2288253.939 1071437.433)
...,...,...,...
5755,6,0.365064,POINT (2269546.093 1081095.723)
5756,6,0.252724,POINT (2332198.305 1115512.637)
5757,6,0.771156,POINT (2277081.693 1225907.050)
5758,6,0.674995,POINT (2276425.207 1175277.342)


In [26]:
from pathlib import Path

# Export the frame as an ESRI Shapefile under <parent of cwd>/data/interim.
interm = Path(".").absolute().parent / "data/interim"
# Make sure the target folder exists so the export does not depend on it
# having been created by hand beforehand.
interm.mkdir(parents=True, exist_ok=True)

out_file_name = interm / "nb_south_wetland_only_class_bal.shp"

gdf.to_file(out_file_name, driver="ESRI Shapefile")

In [27]:
from zipfile import ZipFile


# Bundle every file of the exported shapefile (all files sharing its stem in
# the interim folder) into a single archive under <parent of cwd>/data/processed.
processed = Path(".").absolute().parent / "data/processed"
# Make sure the archive's folder exists before opening the zip for writing.
processed.mkdir(parents=True, exist_ok=True)
zip_file_name = processed / "wetland_only.zip"

# Materialise and sort the matches: gives a stable member order in the archive
# and lets us detect the "nothing to zip" case instead of silently writing an
# empty archive.
files = sorted(interm.glob("nb_south_wetland_only_class_bal.*"))
if not files:
    raise FileNotFoundError(
        f"No files matching 'nb_south_wetland_only_class_bal.*' in {interm}"
    )

with ZipFile(zip_file_name, "w") as zipf:
    for file in files:
        # arcname=file.name stores each member without its directory path.
        zipf.write(file, arcname=file.name)