In [1]:
# Root folder holding the per-country datasets -- replace with your data directory.
DATADIR = "/content/drive/MyDrive/solar_PV/data/"
# Project source folder, given relative to the notebook's working directory.
SRCDIR = "src/"

Import packages

In [None]:
import os
import pandas as pd

In [None]:
# Shared accumulator: every "Process <country>" cell below appends its
# image/mask path pairs to these two parallel lists.
data = {"image_path": [], "mask_path": []}
# Build the CSV output folder from SRCDIR instead of repeating the "src/" literal.
save_path = os.path.join(SRCDIR, "dataset_csv")

Process China for segmentation

In [None]:
data_path = os.path.join(DATADIR, "China")

# os.listdir() returns entries in arbitrary order. Sorting keeps the row order
# of `data` stable between runs, which the seeded train/val split relies on.
for folder in sorted(os.listdir(data_path)):
  folder_path = os.path.join(data_path, folder)
  if not os.path.isdir(folder_path):
    # A stray file next to the sub-folders would make os.listdir() raise.
    continue
  for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith("_label.bmp"):
      continue
    # A mask "<name>_label.bmp" is paired with the image "<name>.bmp" beside it.
    mask_path = os.path.join(folder_path, file_name)
    image_path = os.path.join(folder_path, file_name.replace("_label.bmp", ".bmp"))
    assert os.path.exists(image_path), "Missing image for mask {}".format(mask_path)
    data["image_path"].append(image_path)
    data["mask_path"].append(mask_path)
# Reports the running total held in `data`, not only the pairs added by this cell.
print("Collected {} image-mask pairs".format(len(data["image_path"])))

Collected 16259 image-mask pairs


Process Denmark for segmentation

In [None]:
data_path = os.path.join(DATADIR, "Denmark")

new_pair_count = 0
# Sorted listings keep the order of `data` reproducible; os.listdir() order is arbitrary.
for folder in sorted(os.listdir(data_path)):
  if not os.path.isdir(os.path.join(data_path, folder)):
    # Ignore stray files sitting next to the sub-folders.
    continue
  masks_dir = os.path.join(data_path, folder, "mask")
  for file_name in sorted(os.listdir(masks_dir)):
    mask_path = os.path.join(masks_dir, file_name)
    # NOTE(review): this swaps every "mask" substring of the full path, not just
    # the "mask" directory component -- confirm no other path part contains it.
    image_path = mask_path.replace("mask", "positive")
    assert os.path.exists(image_path), "Missing image for mask {}".format(mask_path)
    data["image_path"].append(image_path)
    data["mask_path"].append(mask_path)
    new_pair_count += 1
print("Collected {} image-mask pairs".format(new_pair_count))

Collected 880 image-mask pairs


Process France for segmentation

In [None]:
from tqdm import tqdm

data_path = os.path.join(DATADIR, "France")

new_pair_count = 0
# Sorted listings keep the order of `data` reproducible; os.listdir() order is arbitrary.
for folder in sorted(os.listdir(data_path)):
  # Only the "*_all" folders are scanned for masks.
  if not folder.endswith("_all"):
    continue
  masks_dir = os.path.join(data_path, folder, "mask")
  file_names = sorted(os.listdir(masks_dir))
  for file_name in tqdm(file_names):
    mask_path = os.path.join(masks_dir, file_name)
    # NOTE(review): this swaps every "mask" substring of the full path, not just
    # the "mask" directory component -- confirm no other path part contains it.
    image_path = mask_path.replace("mask", "img")
    assert os.path.exists(image_path), "Missing image for mask {}".format(mask_path)
    data["image_path"].append(image_path)
    data["mask_path"].append(mask_path)
    new_pair_count += 1
print("Collected {} image-mask pairs".format(new_pair_count))

100%|██████████| 13303/13303 [00:14<00:00, 906.87it/s]
100%|██████████| 7685/7685 [00:08<00:00, 894.02it/s] 

Collected 20988 image-mask pairs





Check the number of US large images

In [None]:
from PIL import Image
from tqdm import tqdm

data_path = os.path.join(DATADIR, "US")

new_pair_count = 0
for folder in os.listdir(data_path):
  if folder.endswith("_all"):
    city_dir = os.path.join(data_path, folder)
    masks_dir = os.path.join(city_dir, "mask_patches_new_x4")
    # .tif files directly inside the city folder are the untiled large images.
    large_files = [entry for entry in os.listdir(city_dir) if entry.endswith(".tif")]
    # Every entry of the mask-patch folder is counted as one tiled image.
    file_names = os.listdir(masks_dir)
    print("The number of large images is {} in {}, before tiling, it's {}".format(len(file_names), folder, len(large_files)))

The number of large images is 310 in Fresno_all, before tiling, it's 412
The number of large images is 18 in Modesto_all, before tiling, it's 20
The number of large images is 50 in Oxnard_all, before tiling, it's 75
The number of large images is 85 in Stockton_all, before tiling, it's 94


Tile US large images into patches (new, 1*4)

In [None]:
import os
import json


from PIL import Image, ImageDraw
from tqdm import tqdm

# Folder with one city's large .tif images. Derived from DATADIR so the data
# root is configured in a single place; point it at another "<city>_all" folder
# to tile a different city.
SOURCE_DIR = os.path.join(DATADIR, "US", "Fresno_all")
# NOTE(review): later cells read "mask_patches_new_x4" / "original_patches_new_x4",
# while this cell writes "mask_patches_x4" / "original_patches_x4" -- confirm
# which folder names are intended before re-running.
SAVE_MASK_DIR = os.path.join(SOURCE_DIR, "mask_patches_x4/")
SAVE_ORIGINAL_DIR = os.path.join(SOURCE_DIR, "original_patches_x4/")
# GeoJSON file with the annotated polygons for the US images.
POLYGONPATH = os.path.join(DATADIR, "US", "Polygons", "SolarArrayPolygons.geojson")


def patchify_one_image(img, target_size=128, save_dir=""):
    """Cut ``img`` into square tiles and save each tile as a PNG.

    Tiles are taken on a regular grid of step ``target_size`` starting at the
    origin, converted to RGB and written to ``save_dir`` as ``"<x>_<y>.png"``,
    where ``x`` and ``y`` are the tile's offsets along the width and height.
    The last tile in each direction may reach past the image edge; how that
    area is filled is left to ``img.crop``.

    Args:
        img: image object exposing ``size`` and ``crop``.
        target_size: side length of a tile, also the grid step.
        save_dir: folder the tiles are written into.
    """
    width, height = img.size
    x_offsets = range(0, width, target_size)
    y_offsets = range(0, height, target_size)
    for x in x_offsets:
        for y in y_offsets:
            tile = img.crop((x, y, x + target_size, y + target_size)).convert("RGB")
            out_path = os.path.join(save_dir, f"{x}_{y}.png")
            tile.save(out_path)

os.makedirs(SAVE_MASK_DIR, exist_ok=True)
os.makedirs(SAVE_ORIGINAL_DIR, exist_ok=True)

if __name__ == "__main__":
    # Load the polygon annotations and group them by the image they belong to.
    with open(POLYGONPATH) as f:
        polygons = json.load(f)

    image_to_polygons = {}
    for polygon in polygons["features"]:
        image_id = polygon["properties"]["image_name"]
        image_to_polygons.setdefault(image_id, []).append(polygon)

    # List the folder once. Stripping only the trailing ".tif" guarantees that
    # image_id + ".tif" is an existing file, so the folder does not have to be
    # re-listed for every image.
    all_image_ids = [
        file_name[:-len(".tif")]
        for file_name in os.listdir(SOURCE_DIR)
        if file_name.endswith(".tif")
    ]
    for image_id in tqdm(all_image_ids, desc="Extracting masks"):
        print(image_id)
        # The context manager releases the file handle once the image is tiled.
        with Image.open(os.path.join(SOURCE_DIR, image_id + ".tif")) as img:
            # Mask patches are written only for images that have polygons.
            if image_id in image_to_polygons:
                # Rasterise all polygons of this image onto one blank canvas.
                mask = Image.new("L", img.size, 0)
                draw = ImageDraw.Draw(mask)
                for polygon in image_to_polygons[image_id]:
                    coords = polygon["properties"]["polygon_vertices_pixels"]
                    draw.polygon([(x, y) for x, y in coords], outline=255, fill=255)

                save_mask_dir = os.path.join(SAVE_MASK_DIR, image_id)
                os.makedirs(save_mask_dir, exist_ok=True)
                patchify_one_image(mask, save_dir=save_mask_dir)

            # Original patches are written for every image, with or without polygons.
            save_original_dir = os.path.join(SAVE_ORIGINAL_DIR, image_id)
            os.makedirs(save_original_dir, exist_ok=True)
            patchify_one_image(img, save_dir=save_original_dir)

Process US datasets for segmentation

In [None]:
from PIL import Image
from tqdm import tqdm
import glob
import os
from multiprocessing import Pool

# Gather every PNG found two levels below a "mask_patches_new_x4" folder of
# any "*_all" folder under the US data directory.
data_path = os.path.join(DATADIR, "US")
pattern = os.path.join(
    data_path,
    "*_all",
    "mask_patches_new_x4",
    "*",
    "*.png",
)
mask_files = glob.glob(pattern)


def process_mask(mask_patch_path):
    """Return the (image, mask) path pair for a usable mask patch, else ``None``.

    The image path is derived by replacing every "mask" substring of
    ``mask_patch_path`` with "original". ``None`` is returned when that image
    file does not exist, or when the mask contains no non-zero pixel.

    Args:
        mask_patch_path: path of one mask patch file.

    Returns:
        ``(image_patch_path, mask_patch_path)`` or ``None``.
    """
    image_patch_path = mask_patch_path.replace("mask", "original")
    if not os.path.exists(image_patch_path):
        return None
    # Close the file as soon as it has been inspected: this function runs once
    # per patch, and an unclosed handle per call adds up quickly.
    with Image.open(mask_patch_path) as mask:
        # getbbox() is None when the mask has no non-zero pixel at all.
        if mask.getbbox() is None:
            return None
    return (image_patch_path, mask_patch_path)

if __name__ == '__main__':
    # Check all mask patches in parallel; rejected patches come back as None.
    with Pool() as pool:
        results = list(tqdm(pool.imap_unordered(process_mask, mask_files), total=len(mask_files)))

    # imap_unordered yields results in completion order, which differs from run
    # to run. Sorting the kept pairs makes the order of `data` reproducible,
    # which the seeded train/val split relies on.
    pairs = sorted(result for result in results if result is not None)

    new_pair_count = len(pairs)
    for image_patch_path, mask_patch_path in pairs:
        data["image_path"].append(image_patch_path)
        data["mask_path"].append(mask_patch_path)

    print("Collected {} image-mask pairs".format(new_pair_count))


100%|██████████| 733529/733529 [2:39:36<00:00, 76.60it/s]


Collected 15614 image-mask pairs


China + Denmark + France + US datasets

In [None]:
# Size of the shared accumulator after all country cells have run.
print(f"The total number of collected image-mask pairs is {len(data['image_path'])}")

The total number of collected image-mask pairs is 0


Create fine-tuning and validation datasets for segmentation

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Fraction of the pairs held out for validation
val_r = 0.1

# Load the collected pairs into a DataFrame. Exact duplicate rows (which appear
# when a collection cell is run twice) are dropped so that the same pair cannot
# end up in both the train and the validation set.
all_dataframe = pd.DataFrame(data).drop_duplicates().reset_index(drop=True)
if all_dataframe.empty:
    raise ValueError("No image-mask pairs collected; run the dataset collection cells first.")

# Split the dataset into train and validation sets (seeded for reproducibility)
train_dataframe, val_dataframe = train_test_split(all_dataframe, test_size=val_r, random_state=42)

# Save path, built from SRCDIR instead of repeating the "src/" literal
save_path = os.path.join(SRCDIR, "dataset_csv")
os.makedirs(save_path, exist_ok=True)

# Save train and validation sets as CSV files
train_dataframe.to_csv(os.path.join(save_path, "train.csv"), index=False)
val_dataframe.to_csv(os.path.join(save_path, "val.csv"), index=False)

# Print the total number of train and validation data
print("The total number of train data is {}".format(len(train_dataframe)))
print("The total number of validation data is {}".format(len(val_dataframe)))

The total number of train data is 34314
The total number of validation data is 3813


Make classification dataset

In [None]:
# Classification CSVs get their own sub-folder; built from SRCDIR instead of
# repeating the "src/" literal.
save_path = os.path.join(SRCDIR, "dataset_csv", "classification")
os.makedirs(save_path, exist_ok=True)

All segmentation samples are positive samples for classification

In [None]:
# Folder holding the segmentation train.csv / val.csv, built from SRCDIR
# instead of repeating the "src/" literal.
positive_data_path = os.path.join(SRCDIR, "dataset_csv")
positive, negative = [], []
for file_name in ['train.csv', 'val.csv']:
  df = pd.read_csv(os.path.join(positive_data_path, file_name))
  # Every image listed in the segmentation splits is a positive sample.
  positive.extend(df['image_path'].tolist())


Add US negative samples for classification

In [None]:
from PIL import Image
from tqdm import tqdm

data_path = os.path.join(DATADIR, "US")

# Membership tests against the `positive` list rescan it for every patch.
# A set gives the same answer with a constant-time lookup.
positive_lookup = set(positive)

# Sorted listings keep the order of `negative` reproducible; os.listdir() order is arbitrary.
for folder in sorted(os.listdir(data_path)):
  if not folder.endswith("_all"):
    continue
  image_dir = os.path.join(data_path, folder, "original_patches_new_x4")
  file_names = sorted(os.listdir(image_dir))
  for file_name in tqdm(file_names):
    # Each entry of image_dir is itself a folder of patch files.
    image_path = os.path.join(image_dir, file_name)
    for image_file in sorted(os.listdir(image_path)):
      if not image_file.endswith(".png"):
        continue
      image_patch_path = os.path.join(image_path, image_file)
      # Any patch not already recorded as positive is a negative sample.
      if image_patch_path not in positive_lookup:
        negative.append(image_patch_path)

100%|██████████| 410/410 [17:47<00:00,  2.60s/it]
100%|██████████| 20/20 [00:50<00:00,  2.54s/it]
100%|██████████| 75/75 [02:45<00:00,  2.21s/it]
100%|██████████| 94/94 [03:44<00:00,  2.39s/it]


Add Denmark negative samples for classification (China and France don't have negative samples)

In [None]:
# Add Denmark data. os.path.join avoids the "//" that DATADIR + "/Denmark"
# produces when DATADIR already ends with a slash.
denmark_path = os.path.join(DATADIR, "Denmark")
for file_name in sorted(os.listdir(denmark_path)):
  if not os.path.isdir(os.path.join(denmark_path, file_name)):
    # Ignore stray files sitting next to the sub-folders.
    continue
  image_dir = os.path.join(denmark_path, file_name, "negative")
  # Every file in a "negative" folder is a negative sample.
  negative_image_paths = [os.path.join(image_dir, image_name) for image_name in sorted(os.listdir(image_dir))]
  negative.extend(negative_image_paths)
print("The number of positive data is {}".format(len(positive)))
print("The number of negative data is {}".format(len(negative)))

The number of positive data is 38127
The number of negative data is 951264


Create fine tuning and validation CSV for classification

In [None]:
from sklearn.model_selection import train_test_split

# Recompute the output folder here instead of trusting whatever `save_path`
# currently holds. The segmentation split also writes train.csv / val.csv, so a
# stale `save_path` would silently overwrite those files.
save_path = os.path.join(SRCDIR, "dataset_csv", "classification")
os.makedirs(save_path, exist_ok=True)

# Label 1 marks positive samples, label 0 marks negative samples.
all_classification = {
    "image_path": positive + negative,
    "label": [1] * len(positive) + [0] * len(negative)
}

# Keep the full labelled list alongside the two splits.
all_classification_df = pd.DataFrame(all_classification)
all_classification_df.to_csv(os.path.join(save_path, "classification.csv"), index=False)

# Fraction of the samples held out for validation
val_r = 0.1

train_dataframe, val_dataframe = train_test_split(all_classification_df, test_size=val_r, random_state=42)

train_dataframe.to_csv(os.path.join(save_path, "train.csv"), index=False)
val_dataframe.to_csv(os.path.join(save_path, "val.csv"), index=False)

print("The total number of train data is {}".format(len(train_dataframe)))
print("The total number of val data is {}".format(len(val_dataframe)))

The total number of train data is 890451
The total number of val data is 98940
