# Sampling

The satellite imagery data used in this project was obtained from Sentinel-2 using Google Earth Engine (GEE). 
You can find the code used for data extraction here: [GEE Sentinel-2 Extraction Code](https://code.earthengine.google.com/c11abab413d190dc51fb51bb76540713) and [GEE Sentienl-2 Augmentation code](https://code.earthengine.google.com/4b8c43c9e7234ece1257f80c8d7cf56b?hl=fr)

After downloading, the imagery was divided into 256x256 pixel tiles using QGIS to facilitate processing and annotation.

In the following sections, we will continue with the labelling process and prepare the datasets for training, validation, and testing.


## Setup 

In [None]:
pip install scikit-image

In [2]:
import pandas as pd
import geopandas as gpd
import rasterio
import numpy as np
import os
import glob
from skimage.feature import graycomatrix, graycoprops
import geopandas as gpd
from rasterio.features import rasterize
from shapely.geometry import box
from pathlib import Path
import shapely.wkt
import matplotlib.pyplot as plt
from rasterio.merge import merge
import math



## Overview

In [None]:
# County, Prefecture, Tiff, Tiles data
data = [
    {"County": "Shuangfeng", "Prefecture": "Loudi", "Tiff": 4, "Tiles": 725},
    {"County": "Baihe", "Prefecture": "Ankang", "Tiff": 2, "Tiles": 475},
    {"County": "Hanbin", "Prefecture": "Ankang", "Tiff": 9, "Tiles": 1872},
    {"County": "Yiyuan", "Prefecture": "Zibo", "Tiff": 4, "Tiles": 806},
    {"County": "Cao", "Prefecture": "Heze", "Tiff": 4, "Tiles": 891},
    {"County": "Yuechi", "Prefecture": "Guang'an", "Tiff": 4, "Tiles": 729},
    {"County": "Jingyang", "Prefecture": "LiangshanYi", "Tiff": 4, "Tiles": 725},
    {"County": "LanpingBaiandPumi", "Prefecture": "NujiangLisu", "Tiff": 6, "Tiles": 1440},
    {"County": "Yongshan", "Prefecture": "Zhaotong", "Tiff": 6, "Tiles": 1700},
    {"County": "Yanjin", "Prefecture": "Xinxiang", "Tiff": 4, "Tiles": 420},
    {"County": "Yuanling", "Prefecture": "Huaihua", "Tiff": 9, "Tiles": 2068},
    {"County": "Sheqi", "Prefecture": "Nanyang", "Tiff": 2, "Tiles": 380},
    {"County": "Gushi", "Prefecture": "Xinyang", "Tiff": 6, "Tiles": 1118},
    {"County": "Xuanhua", "Prefecture": "Zhangjiakou", "Tiff": 4, "Tiles": 1368},
    {"County": "Daming", "Prefecture": "Handan", "Tiff": 4, "Tiles": 529},
    {"County": "Heshui", "Prefecture": "Qingyang", "Tiff": 6, "Tiles": 1672},
]

df = pd.DataFrame(data)

In [None]:
print(df['Tiles'].sum())

In [None]:
total_sample_tiles_1000 = 1000 
total_sample_tiles_2000 = 2000
total_sample_tiles_5000 = 5000
total_sample_tiles_10000 = 10000

# Create a new DataFrame for proportional sampling
df_proportion = df.copy()
df_proportion['Proportion'] = df_proportion['Tiles'] / df_proportion['Tiles'].sum()
df_proportion['SampledTiles_1000'] = (df_proportion['Proportion'] * total_sample_tiles_1000).round().astype(int)
df_proportion['SampledTiles_2000'] = (df_proportion['Proportion'] * total_sample_tiles_2000).round().astype(int)
df_proportion['SampledTiles_5000'] = (df_proportion['Proportion'] * total_sample_tiles_5000).round().astype(int)
df_proportion['SampledTiles_10000'] = (df_proportion['Proportion'] * total_sample_tiles_10000).round().astype(int)

print(df_proportion[['County', 'Tiles', 'Proportion', 'SampledTiles_1000','SampledTiles_2000','SampledTiles_5000','SampledTiles_10000']])


## Find positive Sample

In [None]:
focus_counties = pd.read_csv('../../focus_counties.csv')

focus_counties

In [None]:
chinapv_vectorized_2020 = gpd.read_file('../../data/data_processed/chinapv_vectorized_2020.geojson')


In [None]:
# Adjust county names for special cases before matching
chinapv_vectorized_2020_adj = chinapv_vectorized_2020.copy()
chinapv_vectorized_2020_adj['County'] = chinapv_vectorized_2020_adj.apply(
    lambda row: 'Ankang' if row['County'].lower() == 'hanbin' else (
        'Qingyang' if row['County'].lower() == 'heshui' else row['County']
    ),
    axis=1
)

positive_samples= chinapv_vectorized_2020_adj[
    chinapv_vectorized_2020_adj[['Province', 'Prefecture', 'County']].apply(
        lambda row: ((row['Province'], row['Prefecture'], row['County'])) in 
        list(zip(focus_counties['province'], focus_counties['prefecture'], focus_counties['county'])), axis=1)
]

In [None]:
positive_samples.head()

In [None]:
positive_samples.info()

In [None]:
positive_samples_augmented= chinapv_vectorized_2020[
    chinapv_vectorized_2020[['Province', 'Prefecture']].apply(
        lambda row: ((row['Province'], row['Prefecture'])) in 
        list(zip(focus_counties['province'], focus_counties['prefecture'])), axis=1)
]

In [None]:
positive_samples_augmented.head()

In [None]:
positive_samples_augmented.info()

In [None]:
# Group by "Prefecture" and count the number of solar panel samples per prefecture

# Count the number of samples per prefecture in each DataFrame
nb_prefecture = positive_samples.groupby("Prefecture")['geometry'].count().rename('nb_positive')
nb_prefecture_augmented = positive_samples_augmented.groupby("Prefecture")['geometry'].count().rename('nb_augmented')

# Combine the counts into a single DataFrame
comparaison_augmented = pd.concat([nb_prefecture, nb_prefecture_augmented], axis=1)

# Replace NaN values with 0
comparaison_augmented = comparaison_augmented.fillna(0)

# Calculate the difference
comparaison_augmented['difference'] = comparaison_augmented['nb_augmented'] - comparaison_augmented['nb_positive']

comparaison_augmented

In [None]:
comparaison_augmented.sum()

In [None]:
# Ensure 'Tiles' column is added correctly by aligning index
comparaison_augmented['Tiles'] = df.groupby("Prefecture")['Tiles'].sum()

# Calculate the proportion of tiles per prefecture
comparaison_augmented['Proportion'] = comparaison_augmented['Tiles'] / comparaison_augmented['Tiles'].sum()

# Select only the 'Proportion' and 'nb_augmented' columns for display
comparaison_augmented = comparaison_augmented[['Proportion', 'nb_augmented']]

total_sample = comparaison_augmented['nb_augmented'].sum()
comparaison_augmented['sample_proportional'] =  (comparaison_augmented['Proportion'] * total_sample).round().astype(int)

comparaison_augmented

The difference between the number of samples and the number of proportional samples is sometimes quite large.
For example, some prefectures like Zhangjiakou are oversampled, while others like Ankang are poorly sampled.
To address this imbalance:
- For oversampled prefectures such as Zhangjiakou, we can keep only the urban samples, since our goal is to improve performance on residential/rooftop solar panels.
- For under-sampled prefectures, we can apply data augmentation or transformations to increase their sample size.


In [None]:
# For prefecture 'Zhangjiakou', keep only urban=1 for counties that are not the focus county
focus_county = focus_counties.loc[focus_counties['prefecture'] == 'Zhangjiakou', 'county'].iloc[0]

print('Focus county for Zhangjiakou:', focus_county)

# Use positive_samples_augmented for filtering, not positive_samples
mask = ~(
    (positive_samples_augmented['Prefecture'] == 'Zhangjiakou') &
    (positive_samples_augmented['County'] != focus_county) &
    (positive_samples_augmented['urban'] != 1)
)
positive_samples_correction = positive_samples_augmented[mask]

In [None]:
positive_samples_correction

In [None]:
nb_prefecture = positive_samples.groupby("Prefecture")['geometry'].count().rename('nb_positive')
nb_prefecture_augmented = positive_samples_augmented.groupby("Prefecture")['geometry'].count().rename('nb_augmented')
nb_prefecture_augmented_correction = positive_samples_correction.groupby("Prefecture")['geometry'].count().rename('nb_augmented_correction')

# Combine the counts into a single DataFrame
comparaison_correction = pd.concat([nb_prefecture, nb_prefecture_augmented, nb_prefecture_augmented_correction], axis=1)

# Replace NaN values with 0
comparaison_correction = comparaison_correction.fillna(0)

# Calculate the difference
comparaison_correction['difference'] = comparaison_correction['nb_augmented_correction'] - comparaison_correction['nb_positive']

comparaison_correction

In [None]:
comparaison_correction.sum()

In [None]:
# Ensure 'Tiles' column is added correctly by aligning index
comparaison_correction['Tiles'] = df.groupby("Prefecture")['Tiles'].sum()

# Calculate the proportion of tiles per prefecture
comparaison_correction['Proportion'] = comparaison_correction['Tiles'] / comparaison_correction['Tiles'].sum()

# Select only the 'Proportion' and 'nb_augmented_correction' columns for display
comparaison_correction = comparaison_correction[['Proportion', 'nb_augmented_correction']]

total_sample = comparaison_correction['nb_augmented_correction'].sum()
comparaison_correction['sample_proportional'] = (comparaison_correction['Proportion'] * total_sample).round().astype(int)

comparaison_correction


The sample distribution is still unbalanced across prefectures, but it is improved compared to before.
Applying further transformations such as data augmentation will also help to balance the dataset.

In [None]:
positive_samples_correction.to_csv('../../data/positive_samples.csv', index=False, encoding="utf-8")

## Clean Tiles

When clipping the satellite images in Google Earth Engine (GEE) to county boundaries, some tiles may fall completely outside the county area. As a result, all the pixels in these tiles are set to NaN (masked). 
These fully-masked tiles do not contain any valid data and should be removed from further analysis to ensure data quality and avoid errors in subsequent processing steps.


In [None]:
# Path to the root directory containing all county folders
root_dir = r"D:\solar-data-china\tiles"

# List all county folders (exclude any folder with 'Augmented' in the name, case-insensitive)
county_folders = [
    d for d in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, d)) and 'augmented' not in d.lower()
]

mosaic_images = []
county_names = []

for county in county_folders:
    county_path = os.path.join(root_dir, county)
    pattern = f"*{county}*.tif"
    # Recursively find all .tif files for this county
    tiles = glob.glob(os.path.join(county_path, '**', pattern), recursive=True)
    print(f"{len(tiles)} tiles found for {county}")

    if len(tiles) == 0:
        print(f"Warning: No tiles found for county {county} in {county_path}")
        continue

    # Open all tiles for reading
    src_files_to_mosaic = [rasterio.open(fp) for fp in tiles]

    # Merge the rasters (rasterio handles geometry and CRS)
    mosaic, out_trans = merge(src_files_to_mosaic)

    # Get the profile (metadata) from one tile for later use
    out_meta = src_files_to_mosaic[0].meta.copy()

    # Close all opened raster files to avoid resource warnings
    for src in src_files_to_mosaic:
        src.close()

    # Store the first band of the mosaic and county name for grid display
    mosaic_images.append(mosaic[0])
    county_names.append(county)

# Display all mosaics in a 4-column grid

n_files = len(mosaic_images)
n_cols = 4
n_rows = int(np.ceil(n_files / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows))
axes = axes.flatten()

for idx, (img, name) in enumerate(zip(mosaic_images, county_names)):
    ax = axes[idx]
    im = ax.imshow(img, cmap='gray')
    ax.set_title(f"{name}", fontsize=10)
    ax.axis('off')

# Hide any unused subplots
for j in range(idx + 1, len(axes)):
    axes[j].axis('off')

plt.tight_layout()
plt.show()


356 tiles found for Baihe
508 tiles found for Cao
297 tiles found for Daming
717 tiles found for Gushi
1872 tiles found for Hanbin
784 tiles found for Heshui


MemoryError: Unable to allocate 9.18 GiB for an array with shape (23, 11097, 9655) and data type float32

In [44]:
# Path to the root directory containing all folders (e.g., Baihe, Cao Augmented, Daming, etc.)
root_dir = r"D:\solar-data-china\tiles"

In [45]:
for dirpath, dirnames, filenames in os.walk(root_dir):
    nb_nan = 0
    for file in filenames:
        if not file.endswith('.tif'):
            continue
        file_path = os.path.join(dirpath, file)
        try:
            with rasterio.open(file_path) as src:
                tile_img = src.read(1)
                if np.isnan(tile_img).all():
                    nb_nan += 1
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    print("Number of completely NaN tiles in", dirpath, ":", nb_nan)


Number of completely NaN tiles in D:\solar-data-china\tiles : 0
Number of completely NaN tiles in D:\solar-data-china\tiles\Augmented : 0
Number of completely NaN tiles in D:\solar-data-china\tiles\Baihe : 119
Number of completely NaN tiles in D:\solar-data-china\tiles\Cao : 383
Number of completely NaN tiles in D:\solar-data-china\tiles\Daming : 232
Number of completely NaN tiles in D:\solar-data-china\tiles\Gushi : 401
Number of completely NaN tiles in D:\solar-data-china\tiles\Hanbin : 971
Number of completely NaN tiles in D:\solar-data-china\tiles\Heshui : 888
Number of completely NaN tiles in D:\solar-data-china\tiles\Jingyang : 355
Number of completely NaN tiles in D:\solar-data-china\tiles\LanpingBaiandPumi : 501
Number of completely NaN tiles in D:\solar-data-china\tiles\Sheqi : 89
Number of completely NaN tiles in D:\solar-data-china\tiles\Shuangfeng : 324
Number of completely NaN tiles in D:\solar-data-china\tiles\Xuanhua : 597
Number of completely NaN tiles in D:\solar-data-

In [46]:
deleted_count = 0
deleted_files = []

for dirpath, dirnames, filenames in os.walk(root_dir):
    folder_deleted = 0
    for file in filenames:
        if not file.endswith('.tif'):
            continue
        
        file_path = os.path.join(dirpath, file)

        try:
            # Open the file *read-only*
            with rasterio.open(file_path) as src:
                arr = src.read(1)  # first band
                nodata = src.nodata

                if arr.size == 0:
                    is_empty = True
                else:
                    if nodata is not None:
                        is_empty = np.all((arr == nodata) | np.isnan(arr))
                    else:
                        is_empty = np.isnan(arr).all()

            if is_empty:
                os.remove(file_path)
                deleted_count += 1
                deleted_files.append(file_path)
                folder_deleted += 1
                print(f"Deleted: {file_path}")

        except rasterio.errors.RasterioIOError as e:
            print(f"Could not open {file_path}: {e}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    if folder_deleted > 0:
        print(f"\nDeleted {folder_deleted} empty image(s) in folder: {dirpath}\n")

print(f"\nDeleted {deleted_count} empty `.tif` files in total.\n")


Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_01.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_02.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_03.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_04.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_05.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_06.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_07.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_08.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_09.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_14.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_15.tif
Deleted: D:\solar-data-china\tiles\Baihe\s2_Baihe-0000000000-0000000000_01_16.tif
Deleted: D:\sola

## Calculate additional indices

At this stage, I have downloaded my satellite image data from Google Earth Engine (GEE) and used QGIS to cut the large images into smaller tiles for further analysis. 

The next step is to calculate additional spectral indices for each tile, such as NDVI, NDBI, mNDWI, and NSPI, which are commonly used in remote sensing to extract information about vegetation, built-up areas, water bodies and soil properties.

Here is what each band represents:
- B1 to B7: These are the original spectral bands from the satellite imagery (e.g., Landsat or Sentinel), each capturing reflectance in different parts of the electromagnetic spectrum.
- NDVI (Normalized Difference Vegetation Index): Highlights vegetation by comparing the near-infrared (NIR) and red bands. High NDVI values indicate healthy vegetation.
- NDBI (Normalized Difference Built-up Index): Used to identify built-up (urban) areas by comparing the shortwave infrared (SWIR) and NIR bands.
- mNDWI (Modified Normalized Difference Water Index): Enhances open water features by using green and SWIR bands.
- NSPI_1 and NSPI_2 (Normalized Soil and Plant Indices): These are custom indices designed to highlight specific soil or plant characteristics, depending on the band combinations used.

Calculating these indices allows for more effective classification and analysis of land cover types in each tile. 

Note: All of these indices could have been calculated directly within GEE before downloading the data, which would have streamlined the workflow. However, in this case, I am performing the calculations locally after tiling in QGIS.


In [None]:
# Function for normalized difference index
def norm_diff(band1, band2):
    bottom = band1 + band2
    bottom[bottom == 0] = 1e-10  # avoid division by zero
    return (band1 - band2) / bottom

def quantize(arr, levels=64):
    arr_min = np.nanmin(arr)
    arr_max = np.nanmax(arr)
    scaled = ((arr - arr_min) / (arr_max - arr_min) * (levels - 1)).astype(np.uint8)
    return scaled

def calc_sum_average(arr):
    # Compute GLCM (grey level co-occurrence matrix)
    glcm = graycomatrix(arr, distances=[1], angles=[0], levels=64, symmetric=True, normed=True)
    # sum average is sum over (i+j)*p(i,j)
    i, j = np.ogrid[0:64, 0:64]
    sum_avg = np.sum((i + j) * glcm[:, :, 0, 0])
    return sum_avg


# Loop through each folder in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue
    print(f"Processing folder: {folder_name}")

    # Find all .tif files in this folder
    tiff_files = glob.glob(os.path.join(folder_path, "*.tif"))
    print(f" Found {len(tiff_files)} tiff files.")

    files_processed = 0
    files_skipped = 0
    files_error = 0

    for tiff_path in tiff_files:
        tiff_name = os.path.basename(tiff_path)
        try:
            with rasterio.open(tiff_path) as src:
                bands = src.read()  # shape = (n_bands, height, width)
                profile = src.profile

            # Check if there are at least 11 bands
            if bands.shape[0] < 11:
                # print(f"Skipping {tiff_name}: not enough bands ({bands.shape[0]})")
                files_skipped += 1
                continue

            # Extract needed bands (adjust indices accordingly)
            B1 = bands[0]
            B2 = bands[1]
            B3 = bands[2]
            B4 = bands[3]
            B5 = bands[4]
            B6 = bands[5]
            B7 = bands[6]
            B8 = bands[7]
            B9 = bands[8]
            B11 = bands[9]
            B12 = bands[10]

            # Calculate indices
            NDVI = norm_diff(B8, B4)
            NDBI = norm_diff(B11, B8)
            mNDWI = norm_diff(B3, B11)
            NSPI_1 = norm_diff(B11, B12)
            NSPI_2 = norm_diff(B11, B9)

            # Prepare list of bands and indices for texture calculation
            textures_input = {
                'B2': B2,
                'B6': B6,
                'B7': B7,
                'NDVI': NDVI,
                'mNDWI': mNDWI,
                'NDBI': NDBI
            }

            textures = {}
            for name, arr in textures_input.items():
                arr_q = quantize(arr)
                sum_avg = calc_sum_average(arr_q)
                textures[name + '_sumAvg'] = sum_avg

            # Create new stack with original bands + indices 
            new_bands = np.stack([B1, B2, B3, B4, B5, B6, B7, NDVI, NDBI, mNDWI, NSPI_1, NSPI_2])

            profile.update(count=len(new_bands), dtype=rasterio.float32)

            # Output file path
            out_name = tiff_name
            output_folder = folder_name + '_with_indices'
            output_folder_path = os.path.join(root_dir, output_folder)
            os.makedirs(output_folder_path, exist_ok=True)
            out_path = os.path.join(output_folder_path, out_name)

            with rasterio.open(out_path, 'w', **profile) as dst:
                for i, band in enumerate(new_bands, start=1):
                    dst.write(band.astype(rasterio.float32), i)

            files_processed += 1

        except Exception as e:
            # print(f"Error processing {tiff_name}: {e}")
            files_error += 1

    print(f"Finished folder: {folder_name} | Processed: {files_processed} | Skipped: {files_skipped} | Errors: {files_error}")

print("Processing complete.")


## Annotation

In [None]:
solar = gpd.read_file('../../data/positive_samples.csv')

# parse WKT strings explicitly
solar['geometry'] = solar['geometry'].apply(shapely.wkt.loads)

# drop Z coordinate
solar['geometry'] = solar['geometry'].apply(lambda g: shapely.wkt.loads(g.wkt))


solar = solar.set_geometry('geometry')
solar.crs = "EPSG:4326"


### Piwel-wise labelling and saving metadata

In [None]:
# storage
records = []
counts = []

# Loop through each folder
for folder_name in os.listdir(root_dir):
    if not folder_name.endswith("_with_indices"):
        continue

    folder_path = os.path.join(root_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue

    county = folder_name.replace("_with_indices", "")
    nb_partial_with = 0
    nb_partial_without = 0

    tiles = [f for f in os.listdir(folder_path) if f.endswith('.tif')]
    for file in tiles:
        tile_path = os.path.join(folder_path, file)
        with rasterio.open(tile_path) as src:
            tile_crs = src.crs
            bounds = src.bounds
            shape = (src.height, src.width)
            transform = src.transform

            # reproject solar to tile CRS
            solar_tile = solar.to_crs(tile_crs)
            tile_geom = box(*bounds)
            solar_clipped = solar_tile[solar_tile.intersects(tile_geom)]

            if len(solar_clipped) == 0:
                 # no solar panels → mask of zeros
                mask = np.zeros(shape, dtype=np.uint8)
                label = 0 #image_wise label 0 = no PV in this tiles 
                nb_partial_without += 1
            else:
                mask = rasterize(
                    [(geom, 1) for geom in solar_clipped.geometry],
                    out_shape=shape,
                    transform=transform,
                    fill=0,
                    dtype=np.uint8
                ) # all pixels inside solar panels polygons = 1 others=0
                label = 1 #PV tiles 
                nb_partial_with += 1

            # save mask
            mask_folder_name = folder_name.replace("_with_indices", "_mask")
            mask_dir = os.path.join(os.path.dirname(folder_path), mask_folder_name)
            os.makedirs(mask_dir, exist_ok=True)
            tile_base = os.path.splitext(file)[0]
            mask_path = os.path.join(mask_dir, f"{tile_base}.tif")
            with rasterio.open(
                mask_path, 'w',
                driver='GTiff',
                height=shape[0],
                width=shape[1],
                count=1,
                dtype=mask.dtype,
                crs=tile_crs,
                transform=transform
            ) as dst:
                dst.write(mask, 1)

            # record
            records.append({
                'tile_path': tile_path,
                'mask_path': mask_path,
                'county': county,
                'label': label
            })

    counts.append({
        'county': county,
        'with_solar': nb_partial_with,
        'without_solar': nb_partial_without,
        'total': nb_partial_with + nb_partial_without
    })

df = pd.DataFrame(records)
df.to_csv("tiles_metadata.csv", index=False)
print("Metadata CSV saved as tiles_metadata.csv")

counts_df = pd.DataFrame(counts)
print("Counts by county:\n")
print(counts_df)


In [None]:
df = df[(df['label'] == 1) & (df['county'] != "Augmented")]

# Plot 3  examples (6 tiles total), each row: left = original, right = mask
num_examples = 3
num_cols = 2  # left: original, right: mask

num_examples = min(num_examples, len(df))


fig, axs = plt.subplots(num_examples, num_cols, figsize=(12, 6 *num_examples))

for i in range(num_examples):
    row = df.iloc[i]
    # Read the image tile
    with rasterio.open(row['tile_path']) as src:
        tile_img = src.read(1)
    # Read the mask
    with rasterio.open(row['mask_path']) as src:
        mask_img = src.read(1)
    # Left: original
    ax_left = axs[i, 0] if num_examples > 1 else axs[0]
    ax_left.imshow(tile_img, cmap='gray')
    ax_left.set_title(f"Tile {i}: {row['county']} (Label: 1) - Original")
    ax_left.axis('off')
    # Right: mask
    ax_right = axs[i, 1] if num_examples > 1 else axs[1]
    ax_right.imshow(mask_img, cmap='jet')
    ax_right.set_title(f"Tile {i}: {row['county']} (Label: 1) - Mask")
    ax_right.axis('off')

# Hide any unused subplots
for j in range(num_examples, num_examples):
    for c in range(num_cols):
        axs[j, c].axis('off')

plt.tight_layout()
plt.show()


### Balanced dataset

For each prefecture, we will take all the positive samples. To make the training dataset balanced across prefectures, we will select a proportional number of non-PV tiles relative to the total number of tiles.

In [None]:
# Exclude 'Augmented' from proportional calculation
counts_no_aug = counts_df[counts_df['county'] != 'Augmented']

total_tiles = counts_no_aug['total'].sum()
total_positive = counts_no_aug['with_solar'].sum()
total_negative = counts_no_aug['without_solar'].sum()

# Calculate proportion only for non-augmented counties
counts_df['proportion'] = counts_df.apply(
    lambda row: row['total'] / total_tiles if row['county'] != 'Augmented' else None, axis=1
)

# Calculate proportional columns for non-augmented, but for 'Augmented' just copy with_solar/without_solar
counts_df['proportional_pv'] = counts_df.apply(
    lambda row: int(round(total_positive * row['proportion'])) if row['county'] != 'Augmented' else row['with_solar'],
    axis=1
)

counts_df['proportional_nopv'] = counts_df.apply(
    lambda row: int(round(total_negative * row['proportion'])) if row['county'] != 'Augmented' else row['without_solar'],
    axis=1
)

# For nb_nopv: for each county, take the minimum between the available 'without_solar' panels and twice the proportional number of non-PV tiles (scaled by the overall PV/non-PV ratio). 
# This ensures that for each county, we do not select more non-PV tiles than are available, and we limit the number to avoid excessive class imbalance.
ratio = counts_df['with_solar'].sum() / counts_df['without_solar'].sum()
counts_df['nb_nopv'] = counts_df.apply(
    lambda row: (min(row['without_solar'], 2 * int(round(row['proportional_nopv'] * ratio)))) if row['county'] != 'Augmented' else 0,
    axis=1
)


# If the number of PV tiles is smaller than the proportional PV needed, mark True so the model knows to perform augmentation during training
counts_df['transformation'] = counts_df.apply(
    lambda row: (row['with_solar'] < row['proportional_pv']) if row['county'] != 'Augmented' else False,
    axis=1
)

counts_df

In [None]:
counts_df.sum()

In [None]:
counts_df.to_csv("tiles_proportion.csv", index=False)