## Libraries
This cell imports all the core libraries required in the notebook.

In [1]:
# %pip install numpy pandas shapely pyarrow rasterio tqdm geopandas keras joblib geojson tensorflow
import json

import geojson
import geopandas as gpd
import joblib
import keras
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import rasterio
import rasterio.features
import shapely
from rasterio.mask import mask
from shapely.geometry import mapping, shape
from tqdm import tqdm

2025-10-03 17:42:17.368934: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-03 17:42:17.437941: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-03 17:42:19.172989: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


## Define input file paths
The raster file is from GHS-SMOD: https://human-settlement.emergency.copernicus.eu/download.php?ds=smod

In [2]:
# Label of the area of interest; it is embedded in the names of the files written below.
name = "Churu"
# GHS-SMOD raster tile that gets clipped to the boundary polygon.
tif_tile = "GHS_SMOD_E2030_GLOBE_R2023A_54009_1000_V2_0.tif"
# Boundary of the area of interest (GeoJSON).
state_polygon = "Churu_boundary.geojson"
# Building dataset; the statistics step reads latitude, longitude, area_in_meters and height from it.
parquet_path = "Churu_height_NEW_GHSL_SMOD_elevations.parquet"
# Pre-trained Keras model loaded in the classification step.
model_file_name = r"model/informal_settlements_model_wotrees_50m_all_tiles.keras"

## Extract urban areas
This cell loads the polygon (boundary geojson) and SMOD raster files, reprojects polygon to raster crs, clips raster file to the polygon, extracts urban centers from the raster, and saves as vector polygon reprojected to WGS84(EPSG:4326). 

In [3]:
# Load the boundary polygon(s) of the area of interest.
state_gdf = gpd.read_file(state_polygon)

with rasterio.open(tif_tile) as src:
    # Clip in the raster's own CRS so the polygon lines up with the pixels.
    raster_crs = src.crs
    if state_gdf.crs != raster_crs:
        state_gdf = state_gdf.to_crs(raster_crs)

    # Clip with every boundary feature, not only the first one, so a boundary
    # file made of several features is handled as a whole.
    state_geom = list(state_gdf.geometry)
    out_image, out_transform = mask(src, state_geom, crop=True)

# mask() returns a (bands, rows, cols) array; polygonise the first band as a 2-D array.
# Pixels with value 30 are the ones kept as urban centres.
# To also keep value 23, use: ((out_image[0] == 30) | (out_image[0] == 23))
urban_mask = (out_image[0] == 30).astype("uint8")

# Polygonise the mask and keep only the shapes that cover urban pixels (value 1).
features = [
    {"geometry": shape(geom), "value": value}
    for geom, value in rasterio.features.shapes(urban_mask, transform=out_transform)
    if value == 1
]
if not features:
    # Fail early with a clear message instead of an obscure error from the
    # GeoDataFrame constructor (an empty list has no "geometry" column).
    raise ValueError(
        f"No urban-centre pixels (value 30) found in {tif_tile!r} inside {state_polygon!r}."
    )

urban_gdf = gpd.GeoDataFrame(features, geometry="geometry", crs=state_gdf.crs)

# Store the result in WGS84 (EPSG:4326) for the grid-cutting steps.
urban_gdf = urban_gdf.to_crs(epsg=4326)
urban_gdf.to_file(f"urban_centers_{name}.geojson", driver="GeoJSON")



## CUTTING TO SQUARES
The class provides two methods to divide polygon/geojson into smaller square/rectangular grid cells 

In [4]:
class GridGenerator:
    """Cut polygons into regular rectangular grid cells."""

    def rectangles_inside_polygon(self, polygon, n=None, size=None, tol=0, clip=True, include_poly=False) -> gpd.geoseries.GeoSeries:
        """Build a grid of rectangles over the bounding box of ``polygon``.

        Args:
            polygon: Shapely Polygon or MultiPolygon to cover.
            n: Number of cells per axis; the bounding box is split evenly.
                Mutually exclusive with ``size``.
            size: ``(width, height)`` of one cell, in the units of the polygon's
                coordinates. Mutually exclusive with ``n``.
            tol: Cumulative offset; the i-th cell along an axis is shifted by
                ``i * tol``. ``0`` gives a gap-free grid.
            clip: If True, keep only the cells lying completely within ``polygon``.
            include_poly: If True, append the polygon itself (its parts for a
                MultiPolygon) to the result; useful for visualisation.

        Returns:
            GeoSeries of rectangular cells (plus the polygon when requested).
            With ``clip=True`` the index holds the positions of the kept cells
            in the unclipped grid, so it is not contiguous.
        """
        assert (n is None) != (size is None), "pass exactly one of `n` or `size`"

        # Bounding box of the polygon.
        minx, miny, maxx, maxy = gpd.GeoSeries(polygon).total_bounds

        # Cell edges along each axis.
        if n is not None:
            xa = np.linspace(minx, maxx, n + 1)
            ya = np.linspace(miny, maxy, n + 1)
        else:
            step_x, step_y = size[0], size[1]
            # Pad the upper bound so the last cell reaches the far edge of the box.
            # The padding is capped at one cell: a fixed padding of 1 is a whole
            # degree for lon/lat steps and would create millions of cells that
            # can never lie within the polygon.
            xa = np.arange(minx, maxx + min(1, step_x), step_x)
            ya = np.arange(miny, maxy + min(1, step_y), step_y)

        # Cumulative tolerance offsets, one per edge. Built from an integer range so
        # the length always matches the edges (a float-step arange can be off by one).
        tol_xa = tol * np.arange(len(xa))
        tol_ya = tol * np.arange(len(ya))

        # Lower/upper bound of every cell; both bounds of cell i carry offset i * tol.
        x_lo, x_hi = xa[:-1] + tol_xa[:-1], xa[1:] + tol_xa[:-1]
        y_lo, y_hi = ya[:-1] + tol_ya[:-1], ya[1:] + tol_ya[:-1]

        # Column-major grid: all cells of one x strip, then the next strip.
        grid = gpd.GeoSeries(
            [
                shapely.geometry.box(x0, y0, x1, y1)
                for x0, x1 in zip(x_lo, x_hi)
                for y0, y1 in zip(y_lo, y_hi)
            ]
        )

        # Keep only the cells that lie completely within the polygon.
        if clip:
            grid = gpd.sjoin(
                gpd.GeoDataFrame(geometry=grid),
                gpd.GeoDataFrame(geometry=[polygon]),
                how="inner",
                predicate="within",
            )["geometry"]

        # Useful for visualisation: return the source polygon together with the cells.
        if include_poly:
            poly_parts = (
                list(polygon.geoms)
                if isinstance(polygon, shapely.geometry.MultiPolygon)
                else [polygon]
            )
            grid = pd.concat([grid, gpd.GeoSeries(poly_parts)])
        return grid

    @staticmethod
    def rectangles_inside_geojson(geojson_data, size):
        """
        Generate a grid of rectangles fully inside the polygons of a GeoJSON FeatureCollection.

        Declared as a static method so that it can be called both on the class and
        on an instance (it never used instance state).

        Args:
            geojson_data (dict): GeoJSON FeatureCollection; Polygon and MultiPolygon
                features are used, other geometry types and null geometries are skipped.
            size (tuple): (width, height) of each rectangle.

        Returns:
            gpd.GeoSeries: Rectangles lying within at least one of the polygons.
                Interior rings (holes) are honoured.
        """
        dx, dy = size
        all_polygons = []

        # Extract all polygons/multipolygons from GeoJSON
        for feature in geojson_data.get("features", []):
            geometry = feature.get("geometry")
            if not geometry:
                # GeoJSON allows features with a null geometry.
                continue
            geom_type = geometry["type"]
            coords = geometry["coordinates"]

            # First ring is the exterior, any further rings are holes.
            if geom_type == "Polygon":
                all_polygons.append(shapely.geometry.Polygon(coords[0], coords[1:]))
            elif geom_type == "MultiPolygon":
                all_polygons.extend(
                    shapely.geometry.Polygon(part[0], part[1:]) for part in coords
                )

        # Combine into a single GeoDataFrame
        poly_gdf = gpd.GeoDataFrame(geometry=all_polygons)

        # Generate rectangles over each polygon's bounding box; cells on the far
        # edges are truncated to the box.
        rectangles = []
        for poly in all_polygons:
            minx, miny, maxx, maxy = poly.bounds
            for x in np.arange(minx, maxx, dx):
                for y in np.arange(miny, maxy, dy):
                    rectangles.append(
                        shapely.geometry.box(x, y, min(x + dx, maxx), min(y + dy, maxy))
                    )

        # Build GeoDataFrame of all rectangles
        rect_gdf = gpd.GeoDataFrame(geometry=rectangles)

        # Clip using spatial join (returns only those within any polygon)
        clipped = gpd.sjoin(rect_gdf, poly_gdf, how="inner", predicate="within")

        return clipped["geometry"]

The function reads a GeoJSON file or shapefile, extracts the bounding-box coordinates of each polygon, and returns a GeoDataFrame (i.e. it prepares the polygon data with both geometry and bounding-box metadata).

In [5]:
def prepare_data(file):
    """Read a vector file and return its geometries together with their bounding boxes.

    Args:
        file: Path (or anything ``gpd.read_file`` accepts) of the GeoJSON/shapefile.

    Returns:
        GeoDataFrame with one row per input geometry and the columns ``id``
        (``"<minx>:<miny>"``), ``geometry``, ``min_lon``, ``max_lon``, ``min_lat``
        and ``max_lat``. The CRS of the input file is carried over. The lon/lat
        column names assume x is longitude and y is latitude.
    """
    gdf = gpd.read_file(file)

    records = []
    for polygon in gdf["geometry"]:
        # Bounds are (minx, miny, maxx, maxy).
        minx, miny, maxx, maxy = polygon.bounds
        records.append(
            {
                "id": str(minx) + ":" + str(miny),
                "geometry": polygon,
                "min_lon": minx,
                "max_lon": maxx,
                "min_lat": miny,
                "max_lat": maxy,
            }
        )

    # Name the columns explicitly so an empty input still yields a valid frame with a
    # geometry column, and pass the CRS on instead of silently dropping it.
    columns = ["id", "geometry", "min_lon", "max_lon", "min_lat", "max_lat"]
    bounds_gdf = gpd.GeoDataFrame(records, columns=columns, geometry="geometry", crs=gdf.crs)

    return bounds_gdf

The function takes a Shapely polygon and converts it into a GeoJSON Feature string.

In [6]:
def convert_polygon(polygon):
    """Serialise a Shapely geometry as a GeoJSON Feature string.

    Args:
        polygon: Shapely geometry (anything ``shapely.geometry.mapping`` accepts).

    Returns:
        str: JSON text of a GeoJSON Feature wrapping the geometry; it can be
        pasted into tools such as geojson.io.
    """
    # `mapping` is imported in the notebook's import cell, so no local import is needed.
    feature = geojson.Feature(geometry=mapping(polygon))
    return geojson.dumps(feature)

Creates an instance for the GridGenerator class defined earlier, points to the urban areas file, and defines the grid cell size for the squares

In [7]:
# Helper used below to cut each polygon into grid cells.
grid = GridGenerator()
# GeoJSON file with the polygon(s) to cut (same file name as the urban-centre export).
file = f"urban_centers_{name}.geojson"
# Kind of data being processed: formal / informal / real.
classification = "real"

# Cell edge length in metres, as an int: 50 or 100. Any other size needs a matching
# grid step added in the cutting step.
cut_size = 50

In [8]:
# Read the chosen area (GeoJSON) that will be cut into squares.
polygons = gpd.read_file(file)
# Collects the grid cells produced for all polygons in the cutting step.
all_grids = []
polygons

Unnamed: 0,value,geometry
0,1.0,"POLYGON ((75.37144 28.65487, 75.36006 28.62937..."
1,1.0,"POLYGON ((74.48493 28.45943, 74.47749 28.44244..."
2,1.0,"POLYGON ((74.94019 28.31506, 74.93648 28.30657..."
3,1.0,"POLYGON ((74.61767 28.0944, 74.61401 28.08592,..."
4,1.0,"POLYGON ((74.45805 27.72138, 74.45086 27.70444..."


In [9]:
# Cut every polygon into square cells; the cell size is chosen with `cut_size`.

# Grid step passed as (x, y) `size` for each supported cell size in metres.
# The values are in the coordinate units of the polygons (degrees for the
# EPSG:4326 file written by the urban-area extraction).
GRID_STEP = {
    50: (0.000451369, 0.00045121),
    100: (0.000902738, 0.00090242),
}
if cut_size not in GRID_STEP:
    # Without this check an unsupported size would reuse a stale grid or raise NameError.
    raise ValueError(
        f"Unsupported cut_size {cut_size!r}; expected one of {sorted(GRID_STEP)}"
    )
grid_step = GRID_STEP[cut_size]

# Start from an empty list so that re-running this cell does not append duplicates.
all_grids = []
total = len(polygons)

for idx, polygon in enumerate(polygons["geometry"]):
    progress = f"{idx + 1}/{total}"
    print(f"Processing polygon {progress}...")

    inside_grid = grid.rectangles_inside_polygon(polygon=polygon, size=grid_step)
    all_grids.extend(list(inside_grid))

# Carry the CRS of the source polygons over so the written file is georeferenced.
gdf = gpd.GeoDataFrame(geometry=all_grids, crs=polygons.crs)
cut_file_name = f'{file}_cut_{cut_size}m.geojson'
gdf.to_file(cut_file_name, driver='GeoJSON')

Processing polygon 1/5...
Processing polygon 2/5...
Processing polygon 3/5...
Processing polygon 4/5...
Processing polygon 5/5...


  write(


In [10]:
# Read the grid cells back from the file written by the cutting step.
cut_file_name = f'{file}_cut_{cut_size}m.geojson'
grid_gdf = gpd.read_file(cut_file_name)

In [11]:
# Add the bounding box of every grid cell as the leading columns minx, miny, maxx, maxy.
BBOX_COLUMNS = ["minx", "miny", "maxx", "maxy"]

# One vectorised call instead of four row-wise apply() passes.
cell_bounds = grid_gdf.geometry.bounds

# assign() overwrites existing columns, so this cell can be re-run safely
# (DataFrame.insert raises when the column already exists).
grid_gdf = grid_gdf.assign(**{col: cell_bounds[col] for col in BBOX_COLUMNS})
grid_gdf = grid_gdf[BBOX_COLUMNS + [col for col in grid_gdf.columns if col not in BBOX_COLUMNS]]

# Row order is left untouched (a sort whose result is discarded has no effect).
# Number of cells per grid column, i.e. per distinct maxx value.
grid_gdf["maxx"].value_counts()

maxx
74.962658    94
74.962206    94
74.964012    94
74.964463    94
74.963109    94
             ..
75.403391     2
74.437420     1
74.937381     1
74.531133     1
74.463879     1
Name: count, Length: 636, dtype: int64

- Loads the building dataset from parquet and reads the essential columns (should match the exact names in the dataset to avoid errors since Pandas is case sensitive).
- Runs a function to compute statistics (building count, average area, maximum building area, average height) per grid cell.
- merges computed buildings statistics into a GeoDataFrame. 

In [12]:
# Load only the building columns needed for the per-cell statistics. Column names are
# case-sensitive; the disabled line below is the variant for upper-case column names.
#buildings_df = pd.read_parquet(parquet_path, columns=['LATITUDE', 'LONGITUDE', 'AREA_IN_METERS', 'HEIGHT'])
buildings_df = pd.read_parquet(parquet_path, columns=['latitude', 'longitude', 'area_in_meters', 'height'])

# Disabled alternatives: build a GeoDataFrame from a POLYGON_COORDINATES column (WKT or WKB).
#buildings_gdf = gpd.GeoDataFrame(buildings_df, geometry=buildings_df['POLYGON_COORDINATES'].apply(shapely.wkt.loads))
#buildings_gdf = gpd.GeoDataFrame(buildings_df, geometry=gpd.GeoSeries.from_wkb(buildings_df["POLYGON_COORDINATES"]), crs="EPSG:4326")
# Show which columns were loaded.
print(buildings_df.columns)

def compute_building_stats(cell_miny, cell_maxy, universe):
    """Summarise the buildings whose latitude falls inside one grid cell.

    Args:
        cell_miny: Lower latitude bound of the cell (inclusive).
        cell_maxy: Upper latitude bound of the cell (inclusive).
        universe: DataFrame of candidate buildings with the columns ``latitude``,
            ``area_in_meters`` and ``height``; the caller has already restricted
            it to the cell's longitude range.

    Returns:
        dict with ``count``, ``avg_area``, ``max_area`` and ``avg_height``;
        all four are 0 when no building falls inside the band. Missing values
        are skipped by the aggregations.
    """
    latitudes = universe.latitude
    in_band = universe[(latitudes >= cell_miny) & (latitudes <= cell_maxy)]

    building_count = len(in_band)
    if building_count == 0:
        return {"count": 0, "avg_area": 0, "max_area": 0, "avg_height": 0}

    areas = in_band["area_in_meters"]
    heights = in_band["height"]
    return {
        "count": building_count,
        "avg_area": areas.mean(skipna=True),
        "max_area": areas.max(skipna=True),
        "avg_height": heights.mean(skipna=True),
    }

# Compute the building statistics of every grid cell.
# Cells are visited in row order; buildings are first narrowed to the cell's longitude
# strip, and that slice is reused for as long as consecutive cells share the strip.
STATS_COLUMNS = ["count", "avg_area", "max_area", "avg_height"]

results = []
cached_strip = None  # (minx, maxx) of the strip currently held in slice_df
slice_df = None
cell_bounds = zip(grid_gdf["minx"], grid_gdf["miny"], grid_gdf["maxx"], grid_gdf["maxy"])
for minx, miny, maxx, maxy in tqdm(cell_bounds, total=len(grid_gdf), desc="Computing building stats"):
    # Key the cache on both bounds: equal maxx alone does not guarantee equal minx.
    if (minx, maxx) != cached_strip:
        slice_df = buildings_df[(minx <= buildings_df.longitude) & (buildings_df.longitude <= maxx)]
        cached_strip = (minx, maxx)
    results.append(compute_building_stats(miny, maxy, slice_df))

# Plain DataFrame (the statistics have no geometry), indexed like the grid so the join
# lines up row by row whatever the grid's index is.
stats_df = pd.DataFrame(results, columns=STATS_COLUMNS, index=grid_gdf.index)
# Drop statistics left over from an earlier run so that re-running does not duplicate columns.
grid_gdf = grid_gdf.drop(columns=STATS_COLUMNS, errors="ignore").join(stats_df)

print(grid_gdf.head())

Index(['latitude', 'longitude', 'area_in_meters', 'height'], dtype='object')


Computing building stats: 100%|██████████| 32760/32760 [00:16<00:00, 1944.97it/s]


        minx       miny       maxx       maxy  \
0  75.360511  28.629439  75.360963  28.629890   
1  75.360511  28.629890  75.360963  28.630341   
2  75.360963  28.629439  75.361414  28.629890   
3  75.360963  28.629890  75.361414  28.630341   
4  75.360963  28.630341  75.361414  28.630793   

                                            geometry  count  avg_area  \
0  POLYGON ((75.36096 28.62944, 75.36096 28.62989...      0       0.0   
1  POLYGON ((75.36096 28.62989, 75.36096 28.63034...      0       0.0   
2  POLYGON ((75.36141 28.62944, 75.36141 28.62989...      0       0.0   
3  POLYGON ((75.36141 28.62989, 75.36141 28.63034...      0       0.0   
4  POLYGON ((75.36141 28.63034, 75.36141 28.63079...      0       0.0   

   max_area  avg_height  
0       0.0         0.0  
1       0.0         0.0  
2       0.0         0.0  
3       0.0         0.0  
4       0.0         0.0  


This cell prepares data using the same steps applied in the model training stage:
- drops non-feature columns to retain only statistical attributes
- fills missing values with 0
- normalizes the features as done during training 

In [13]:
# Work on a copy: later cells add prediction columns to real_df, and with a plain alias
# those columns would also appear in grid_gdf and break a re-run of this cell.
real_df = grid_gdf.copy()

# Statistical attributes used as model features, in the order they are fed to the scaler.
FEATURE_COLUMNS = ["count", "avg_area", "max_area", "avg_height"]
# Normalisation caps: a value is divided by its cap and anything above the cap becomes 1.
AREA_CAP = 3000
HEIGHT_CAP = 100

# Select the features explicitly (instead of dropping the known non-feature columns)
# so that unexpected extra columns can never reach the model.
real_df_adjusted = pd.DataFrame(real_df[FEATURE_COLUMNS])

real_df_cleaned = real_df_adjusted.fillna(0)

# Normalize data
real_df_cleaned["avg_area_norm"] = (real_df_cleaned["avg_area"] / AREA_CAP).clip(upper=1)
real_df_cleaned["max_area_norm"] = (real_df_cleaned["max_area"] / AREA_CAP).clip(upper=1)
real_df_cleaned["avg_height_norm"] = (real_df_cleaned["avg_height"] / HEIGHT_CAP).clip(upper=1)

# Keep the raw count plus the three normalised columns.
real_df_cleaned_adj = real_df_cleaned.drop(columns=["avg_area", "max_area", "avg_height"])
print(real_df_cleaned_adj)

       count  avg_area_norm  max_area_norm  avg_height_norm
0          0            0.0            0.0              0.0
1          0            0.0            0.0              0.0
2          0            0.0            0.0              0.0
3          0            0.0            0.0              0.0
4          0            0.0            0.0              0.0
...      ...            ...            ...              ...
32755      0            0.0            0.0              0.0
32756      0            0.0            0.0              0.0
32757      0            0.0            0.0              0.0
32758      0            0.0            0.0              0.0
32759      0            0.0            0.0              0.0

[32760 rows x 4 columns]


In [14]:
# Distribution of each model feature across all grid cells.
for feature_name in ("count", "avg_area_norm", "max_area_norm", "avg_height_norm"):
    print(real_df_cleaned_adj[feature_name].describe())

count    32760.000000
mean         4.865537
std          5.475527
min          0.000000
25%          0.000000
50%          3.000000
75%          8.000000
max         33.000000
Name: count, dtype: float64
count    32760.000000
mean         0.019426
std          0.035564
min          0.000000
25%          0.000000
50%          0.015460
75%          0.025668
max          1.000000
Name: avg_area_norm, dtype: float64
count    32760.000000
mean         0.037413
std          0.052382
min          0.000000
25%          0.000000
50%          0.031808
75%          0.053273
max          1.000000
Name: max_area_norm, dtype: float64
count    32760.000000
mean         0.035150
std          0.026471
min          0.000000
25%          0.000000
50%          0.045000
75%          0.052843
max          0.225000
Name: avg_height_norm, dtype: float64


In [15]:
# Disabled: load the list of feature columns saved at training time.
#feature_columns = joblib.load("model/feature_columns.pkl")
#print(feature_columns)

# Disabled: reorder/fill the columns so that they match the training data.
#new_df = real_df_cleaned_adj.reindex(columns=feature_columns, fill_value=0)  # Ensure same structure

print(real_df_cleaned_adj.head(10))

# Load the scaler saved with joblib.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load scaler files from a trusted source.
scaler = joblib.load("model/scaler.save")
print(scaler)

# Apply the saved scaler as-is (transform only, no re-fitting).
# NOTE(review): presumably the column order must match the one used when the scaler
# was fitted; the reindex step above is disabled, so confirm.
new_scaled = scaler.transform(real_df_cleaned_adj)
new_scaled

   count  avg_area_norm  max_area_norm  avg_height_norm
0      0            0.0            0.0              0.0
1      0            0.0            0.0              0.0
2      0            0.0            0.0              0.0
3      0            0.0            0.0              0.0
4      0            0.0            0.0              0.0
5      0            0.0            0.0              0.0
6      0            0.0            0.0              0.0
7      0            0.0            0.0              0.0
8      0            0.0            0.0              0.0
9      0            0.0            0.0              0.0
StandardScaler()


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


array([[-0.86933781, -0.59985466, -0.81350394, -0.98895246],
       [-0.86933781, -0.59985466, -0.81350394, -0.98895246],
       [-0.86933781, -0.59985466, -0.81350394, -0.98895246],
       ...,
       [-0.86933781, -0.59985466, -0.81350394, -0.98895246],
       [-0.86933781, -0.59985466, -0.81350394, -0.98895246],
       [-0.86933781, -0.59985466, -0.81350394, -0.98895246]],
      shape=(32760, 4))

In [16]:
# Load the pre-trained model and run it on the scaled features.
model = keras.models.load_model(model_file_name)
pred_probs = model.predict(new_scaled)

# Predicted probabilities as percentages rounded to two decimals, flattened to 1-D.
percentages = (pred_probs * 100).round(2).flatten()  

# Binary class: 1 where the predicted probability exceeds 0.6, else 0.
pred_classes = (pred_probs > 0.6).astype(int)  

[1m   1/1024[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m57s[0m 57ms/step

2025-10-03 17:55:10.987068: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m1024/1024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 681us/step


In [17]:
# Attach the model output to the grid cells.
# The prediction arrays are flattened to one value per cell before they are stored as
# columns (presumably the model returns shape (n, 1) - the length check below guards this).
confidence_values = np.asarray(pred_probs).reshape(-1)
class_labels = np.asarray(pred_classes).reshape(-1)
if len(confidence_values) != len(real_df):
    raise ValueError(
        f"Expected one prediction per grid cell, got {len(confidence_values)} "
        f"values for {len(real_df)} cells."
    )

# Class 0 is reported as "formal", anything else as "informal".
predicted_classes = ["formal" if label == 0 else "informal" for label in class_labels]
real_df["prediction"] = predicted_classes
real_df["confidence"] = confidence_values

real_data_adjusted_df = real_df
real_data_adjusted_df

Unnamed: 0,minx,miny,maxx,maxy,geometry,count,avg_area,max_area,avg_height,prediction,confidence
0,75.360511,28.629439,75.360963,28.629890,"POLYGON ((75.36096 28.62944, 75.36096 28.62989...",0,0.0,0.0,0.0,formal,1.805664e-35
1,75.360511,28.629890,75.360963,28.630341,"POLYGON ((75.36096 28.62989, 75.36096 28.63034...",0,0.0,0.0,0.0,formal,1.805664e-35
2,75.360963,28.629439,75.361414,28.629890,"POLYGON ((75.36141 28.62944, 75.36141 28.62989...",0,0.0,0.0,0.0,formal,1.805664e-35
3,75.360963,28.629890,75.361414,28.630341,"POLYGON ((75.36141 28.62989, 75.36141 28.63034...",0,0.0,0.0,0.0,formal,1.805664e-35
4,75.360963,28.630341,75.361414,28.630793,"POLYGON ((75.36141 28.63034, 75.36141 28.63079...",0,0.0,0.0,0.0,formal,1.805664e-35
...,...,...,...,...,...,...,...,...,...,...,...
32755,74.492487,27.702937,74.492938,27.703388,"POLYGON ((74.49294 27.70294, 74.49294 27.70339...",0,0.0,0.0,0.0,formal,1.805664e-35
32756,74.492487,27.703388,74.492938,27.703839,"POLYGON ((74.49294 27.70339, 74.49294 27.70384...",0,0.0,0.0,0.0,formal,1.805664e-35
32757,74.492487,27.703839,74.492938,27.704290,"POLYGON ((74.49294 27.70384, 74.49294 27.70429...",0,0.0,0.0,0.0,formal,1.805664e-35
32758,74.492938,27.703388,74.493389,27.703839,"POLYGON ((74.49339 27.70339, 74.49339 27.70384...",0,0.0,0.0,0.0,formal,1.805664e-35


In [18]:
# This part will be modified.

# Disabled filter: keep only the cells predicted as "informal".
#real_data_filtered_df = real_data_adjusted_df[
#    real_data_adjusted_df["prediction"] == "informal"
#]
# For now every cell is kept (same object as real_data_adjusted_df, not a copy).
real_data_filtered_df = real_data_adjusted_df
real_data_filtered_df

Unnamed: 0,minx,miny,maxx,maxy,geometry,count,avg_area,max_area,avg_height,prediction,confidence
0,75.360511,28.629439,75.360963,28.629890,"POLYGON ((75.36096 28.62944, 75.36096 28.62989...",0,0.0,0.0,0.0,formal,1.805664e-35
1,75.360511,28.629890,75.360963,28.630341,"POLYGON ((75.36096 28.62989, 75.36096 28.63034...",0,0.0,0.0,0.0,formal,1.805664e-35
2,75.360963,28.629439,75.361414,28.629890,"POLYGON ((75.36141 28.62944, 75.36141 28.62989...",0,0.0,0.0,0.0,formal,1.805664e-35
3,75.360963,28.629890,75.361414,28.630341,"POLYGON ((75.36141 28.62989, 75.36141 28.63034...",0,0.0,0.0,0.0,formal,1.805664e-35
4,75.360963,28.630341,75.361414,28.630793,"POLYGON ((75.36141 28.63034, 75.36141 28.63079...",0,0.0,0.0,0.0,formal,1.805664e-35
...,...,...,...,...,...,...,...,...,...,...,...
32755,74.492487,27.702937,74.492938,27.703388,"POLYGON ((74.49294 27.70294, 74.49294 27.70339...",0,0.0,0.0,0.0,formal,1.805664e-35
32756,74.492487,27.703388,74.492938,27.703839,"POLYGON ((74.49294 27.70339, 74.49294 27.70384...",0,0.0,0.0,0.0,formal,1.805664e-35
32757,74.492487,27.703839,74.492938,27.704290,"POLYGON ((74.49294 27.70384, 74.49294 27.70429...",0,0.0,0.0,0.0,formal,1.805664e-35
32758,74.492938,27.703388,74.493389,27.703839,"POLYGON ((74.49339 27.70339, 74.49339 27.70384...",0,0.0,0.0,0.0,formal,1.805664e-35


In [19]:
# Duplicate the geometry under a "polygon" column (read when the GeoJSON features are
# built), then write the per-cell results to a Parquet file named after the area.
real_data_filtered_df["polygon"] = real_data_filtered_df["geometry"]
real_data_filtered_df.to_parquet(f"{name}_neuralNetwork_informal_detection.parquet")

Builds a list of GeoJSON features from the model's prediction results and colours each grid cell by its confidence (confidence > 0.6: green; confidence ≤ 0.6: blue).

In [20]:
# Turn every grid cell into a GeoJSON feature, coloured by the model's confidence.

color_map = {
    "high": "#00cd00", # green
    "mid": "#ff4d4d", # red
    "low": "#1591EA", # blue
}

# Walk the three needed columns in parallel, one tuple per grid cell.
cell_values = zip(
    real_data_filtered_df["confidence"],
    real_data_filtered_df["prediction"],
    real_data_filtered_df["polygon"],
)

features = []
for cell_confidence, cell_class, cell_polygon in cell_values:
    # Green above the 0.6 threshold, blue otherwise.
    if cell_confidence > 0.6:
        color = color_map["high"]
    else:
        color = color_map["low"]

    features.append(
        {
            "type": "Feature",
            "properties": {
                "confidence": cell_confidence,
                "class": cell_class,
                "fill": color,
                "stroke": color,
            },
            "geometry": mapping(cell_polygon),
        }
    )

In [21]:
# Save the features to a GeoJSON file.
import json
# Wrap the features in a FeatureCollection.
geojson_dict = {"type": "FeatureCollection", "features": features}

# NOTE(review): the "50" in the file name is fixed and does not follow `cut_size`;
# the upload cell refers to this exact file name.
output_file = f"{name}_neuralNetwork_50_infromal_detection.geojson"
with open(output_file,
    "w",
) as f:
    json.dump(geojson_dict, f, indent=2)

In [None]:
# Optional - upload the result files to a COS bucket.
# Read notebook configuration
import json

# The configuration is a JSON document; fill in the endpoint, API key and output bucket.
# The disabled line below reads it interactively instead of storing it in the notebook.
#config_str = getpass.getpass('Enter your prepared config: ')
config_str = '''

{

    "COS_ENDPOINT_URL": "",
    "COS_AUTH_ENDPOINT_URL": "",
    "COS_APIKEY": "",
    "OUTPUT_BUCKET": ""

}

'''
config = json.loads(config_str)

import ibm_boto3
from ibm_botocore.client import Config

# S3-style client authenticated with the API key from the configuration.
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config["COS_ENDPOINT_URL"])
# Best-effort upload: the loop sits inside the try, so the first failure is printed
# and the remaining files are not attempted.
try:
    parquet_file = f"{name}_neuralNetwork_informal_detection.parquet"
    geojson_file = f"{name}_neuralNetwork_50_infromal_detection.geojson"

    # Upload both files; the object key is the local file name.
    for output_file in [parquet_file, geojson_file]:
        cos_client.upload_file(Filename=output_file, Bucket=config["OUTPUT_BUCKET"], Key=output_file)
        print(f"{output_file} successfully uploaded")
        
except Exception as e:
    print("Upload failed:", e)


Churu_neuralNetwork_informal_detection.parquet successfully uploaded
Churu_neuralNetwork_50_infromal_detection.geojson successfully uploaded


In [25]:
# Optional - check files in bucket
# Bucket name (custom output bucket)
# NOTE(review): this is a literal, not config["OUTPUT_BUCKET"]; keep the two in sync.
bucket_name = "onboarding-bucket-s"

# list_objects_v2 returns one page of keys per call, so follow the continuation token
# until the listing is complete; otherwise large buckets are silently truncated.
object_keys = []
list_kwargs = {"Bucket": bucket_name}
while True:
    response = cos_client.list_objects_v2(**list_kwargs)
    object_keys.extend(obj["Key"] for obj in response.get("Contents", []))

    next_token = response.get("NextContinuationToken")
    if not response.get("IsTruncated") or not next_token:
        break
    list_kwargs["ContinuationToken"] = next_token

if object_keys:
    print(f"Files in bucket '{bucket_name}':")
    for key in object_keys:
        print(" -", key)
else:
    print(f"No files found in bucket '{bucket_name}'.")

Files in bucket 'onboarding-bucket-s':
 - Churu_buildings.parquet
 - Churu_detailed.json
 - Churu_height_NEW.parquet
 - Churu_height_NEW_GHSL_SMOD.parquet
 - Churu_height_NEW_GHSL_SMOD_elevations.parquet
 - Churu_neuralNetwork_50_infromal_detection.geojson
 - Churu_neuralNetwork_informal_detection.parquet
 - Churu_overview.json
 - Churu_tile_-JHrpnwlsB0.tif
 - Churu_tile_-KL47QoJpaY.tif
 - Churu_tile_-KymsFZ0x0U.tif
 - Churu_tile_02anevWbkGQ.tif
 - Churu_tile_07yg36MiT1M.tif
 - Churu_tile_0CJKHih6724.tif
 - Churu_tile_0aVDjbwC8ic.tif
 - Churu_tile_0dXj1zB2734.tif
 - Churu_tile_0kvbmfnsaNE.tif
 - Churu_tile_132qayxlKpI.tif
 - Churu_tile_15x1nnRoIrM.tif
 - Churu_tile_1BKBvA73s4w.tif
 - Churu_tile_1U5BOyN8CbA.tif
 - Churu_tile_1VXYkm9xW7M.tif
 - Churu_tile_1jVbQlZqgmo.tif
 - Churu_tile_1ovLetjRVwc.tif
 - Churu_tile_2L_UKlL8VAI.tif
 - Churu_tile_2MLZEZ6EMVM.tif
 - Churu_tile_2Uks7tWdnag.tif
 - Churu_tile_2YNDxTMyVRc.tif
 - Churu_tile_3RyxytdiqWk.tif
 - Churu_tile_3ZTjsLU_qJw.tif
 - Churu_t