In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
# Load one point layer from the file geodatabase into a GeoDataFrame.
# NOTE(review): the path is an absolute, machine-specific location; the
# notebook will not run elsewhere unless it is changed.
layer = "nb_south_train_subset"
file_name = r"C:\Users\rhamilton\LiDAR\working\[06]-gdb\nb.gdb"
gdf = gpd.read_file(
    file_name,
    driver="FileGDB",
    layer=layer,
)

In [3]:
# Preview the first rows of the freshly loaded layer.
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
0,Open water,121,POINT (2282120.928 1090781.570)
1,Open water,121,POINT (2280032.116 1094585.955)
2,Open water,121,POINT (2262984.057 1088554.152)
3,Open water,121,POINT (2203301.426 1138927.607)
4,Open water,121,POINT (2271986.798 1080146.534)


In [4]:
# Distinct class labels present in the source layer.
gdf["class_name"].unique()

array(['Open water', 'Shallow water', 'Swamp', 'Fen', 'Marsh', 'Bog',
       'Salt marsh', 'Upland'], dtype=object)

In [5]:
# Collapse the detailed source classes into three categories:
# water, wetland and non_wetland.
mapping = {
    "Open water": "water",
    "Shallow water": "wetland",
    "Swamp": "wetland",
    "Fen": "wetland",
    "Marsh": "wetland",
    "Bog": "wetland",
    "Salt marsh": "wetland",
    "Upland": "non_wetland",
}

# Series.map turns every label that is not a key of `mapping` into NaN without
# any warning. That happens when the layer contains a class not listed above,
# and also when this cell is executed a second time (the column then already
# holds the collapsed names). Stop with an explicit error instead of silently
# losing labels. Rows that were already null before mapping are left alone.
mapped = gdf["class_name"].map(mapping)
unmapped = gdf.loc[mapped.isna() & gdf["class_name"].notna(), "class_name"].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"class_name values missing from mapping: {sorted(map(str, unmapped))}"
    )

gdf["class_name"] = mapped
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
0,water,121,POINT (2282120.928 1090781.570)
1,water,121,POINT (2280032.116 1094585.955)
2,water,121,POINT (2262984.057 1088554.152)
3,water,121,POINT (2203301.426 1138927.607)
4,water,121,POINT (2271986.798 1080146.534)


In [6]:
# Number of points per class_name value.
gdf["class_name"].value_counts()

class_name
wetland        6514
non_wetland    1174
water           879
Name: count, dtype: int64

In [7]:
# Cap every class at `n` points so the classes end up balanced.
n = 870
# Fixed seed: without it DataFrame.sample draws a different subset on every
# run, so the exported training points could never be reproduced.
SAMPLE_SEED = 42

dfs = []
for _, df in gdf.groupby("class_name"):
    if len(df) > n:
        print("DF is greater than n")
        dfs.append(df.sample(n, random_state=SAMPLE_SEED))
    else:
        # Classes with at most n points are kept whole.
        dfs.append(df)

DF is greater than n
DF is greater than n
DF is greater than n


In [8]:
# Stack the per-class frames into a single table and renumber the index.
gdf = pd.concat(objs=dfs, ignore_index=True)
gdf

Unnamed: 0,class_name,eco_region,geometry
0,non_wetland,122,POINT (2253675.865 1103145.791)
1,non_wetland,123,POINT (2332192.988 1136079.591)
2,non_wetland,123,POINT (2316045.200 1099326.497)
3,non_wetland,123,POINT (2315951.095 1098561.036)
4,non_wetland,123,POINT (2324592.661 1117987.396)
...,...,...,...
2605,wetland,122,POINT (2242953.246 1093873.249)
2606,wetland,123,POINT (2273574.684 1065671.153)
2607,wetland,123,POINT (2288194.157 1075320.951)
2608,wetland,121,POINT (2212497.119 1113323.670)


In [9]:
# Per-class point counts after the subsampling step.
gdf["class_name"].value_counts()

class_name
non_wetland    870
water          870
wetland        870
Name: count, dtype: int64

In [10]:
# Distinct class labels as a plain list in ascending (alphabetical) order.
labels = sorted(gdf["class_name"].unique().tolist())
labels

['non_wetland', 'water', 'wetland']

In [11]:
import json

# Give each label a 1-based integer code, following the order of `labels`.
values = {label: code for code, label in enumerate(labels, start=1)}
values

{'non_wetland': 1, 'water': 2, 'wetland': 3}

In [12]:
# Replace the text labels with their integer codes from `values`.
# If this cell is executed again the column already holds integers, none of
# which is a key of `values`, and Series.map would quietly turn the whole
# column into NaN. Raise instead so the problem is visible immediately.
encoded = gdf["class_name"].map(values)
not_encoded = gdf.loc[encoded.isna() & gdf["class_name"].notna(), "class_name"].unique()
if len(not_encoded) > 0:
    raise ValueError(
        f"class_name values missing from values: {sorted(map(str, not_encoded))}"
    )

gdf["class_name"] = encoded
# Rebind rather than sort in place; the result is the same rows ordered by
# class code with a fresh 0..N-1 index.
gdf = gdf.sort_values(by="class_name", ignore_index=True)
gdf.head()

Unnamed: 0,class_name,eco_region,geometry
0,1,122,POINT (2253675.865 1103145.791)
1,1,122,POINT (2214022.512 1162464.506)
2,1,123,POINT (2324040.243 1116636.062)
3,1,123,POINT (2324377.382 1118172.868)
4,1,121,POINT (2215932.595 1119426.371)


In [13]:
import numpy as np

# Attach a uniformly distributed random number in [low, high) to every row.
# NOTE(review): presumably used downstream to split the points (e.g. train /
# validation) - confirm with the consumer of the exported file.
RANDOM_COLUMN_SEED = 42  # fixed so the exported column is identical on every run

n = len(gdf)  # note: `n` is rebound here to the number of rows
low = 0
high = 1

# A dedicated, seeded Generator keeps the draw reproducible and independent of
# any other code that touches NumPy's global random state.
rng = np.random.default_rng(RANDOM_COLUMN_SEED)
uniform_dist = rng.uniform(low, high, n)
gdf = gdf.assign(random=uniform_dist)

In [14]:
# Keep only the class code, the random value and the geometry; the other
# columns (e.g. eco_region) are dropped from this point on.
gdf = gdf[["class_name", "random", "geometry"]]
gdf
# TODO: write out a lookup (not implemented here; presumably the label -> code
# dict `values` - confirm).

Unnamed: 0,class_name,random,geometry
0,1,0.422666,POINT (2253675.865 1103145.791)
1,1,0.734175,POINT (2214022.512 1162464.506)
2,1,0.223825,POINT (2324040.243 1116636.062)
3,1,0.323916,POINT (2324377.382 1118172.868)
4,1,0.422439,POINT (2215932.595 1119426.371)
...,...,...,...
2605,3,0.920283,POINT (2337201.747 1124249.397)
2606,3,0.913223,POINT (2264059.661 1073711.489)
2607,3,0.877566,POINT (2304429.791 1127844.292)
2608,3,0.685555,POINT (2324031.273 1108775.943)


In [15]:
from pathlib import Path

# Interim outputs go to <parent of the current working directory>/data/interim.
interm = Path(".").absolute().parent / "data/interim"
# Create the folder first: writing the shapefile fails if the target
# directory does not exist (e.g. on a fresh checkout).
interm.mkdir(parents=True, exist_ok=True)

out_file_name = interm / "nb_south_3_class_bal.shp"

# Writes the .shp together with its sidecar files next to it.
gdf.to_file(out_file_name, driver="ESRI Shapefile")

In [16]:
from zipfile import ZipFile


# Bundle the shapefile and its sidecar files into one archive under
# <parent of the current working directory>/data/processed.
processed = Path(".").absolute().parent / "data/processed"
processed.mkdir(parents=True, exist_ok=True)  # ZipFile does not create folders
zip_file_name = processed / "nb_south_3_class_bal.zip"

# Select only the files that share the archive's base name
# (nb_south_3_class_bal.shp, .shx, .dbf, ...). The previous pattern
# "nb_south_*" also picked up every other nb_south_ dataset lying in the
# interim folder. Materialise and sort the matches so the archive member
# order is deterministic and the list can be inspected or reused.
files = sorted(interm.glob(f"{zip_file_name.stem}.*"))
if not files:
    # Writing an empty archive would hide the fact that the export is missing.
    raise FileNotFoundError(
        f"no files matching {zip_file_name.stem}.* found in {interm}"
    )

with ZipFile(zip_file_name, "w") as zipf:
    for file in files:
        # Store members flat (file name only, no directory components).
        zipf.write(file, arcname=file.name)