In [3]:
from pathlib import Path
import rasterio
import numpy as np

# reading in geotiff file as numpy array
def read_tif(file: Path):
    """Read a GeoTIFF and return its pixels in channels-last layout.

    Args:
        file: Location of the raster. A ``Path`` is expected; a plain string
            is accepted as well and converted.

    Returns:
        A tuple ``(arr, transform, crs)``. ``arr`` is the raster data
        re-ordered from rasterio's (bands, height, width) to
        (height, width, bands); ``transform`` and ``crs`` are the dataset's
        ``transform`` and ``crs`` attributes, passed through untouched.

    Raises:
        FileNotFoundError: If ``file`` does not exist.
    """
    # Coerce so callers may pass a str; Path(Path(...)) is a no-op.
    file = Path(file)

    if not file.exists():
        raise FileNotFoundError(f'File {file} not found')

    with rasterio.open(file) as dataset:
        arr = dataset.read()  # (bands X height X width)
        transform = dataset.transform
        crs = dataset.crs

    # Move the band axis last so the array is indexed as [row, col, band].
    return arr.transpose((1, 2, 0)), transform, crs


# writing an array to a geo tiff file
def write_tif(file: Path, arr, transform, crs):
    """Write a channels-last array to a GeoTIFF.

    Args:
        file: Destination path (``Path`` or str). Missing parent directories
            are created.
        arr: Array of shape (height, width, bands). A 2-D (height, width)
            array is accepted and written as a single band.
        transform: Georeferencing transform handed to ``rasterio.open``.
        crs: Coordinate reference system handed to ``rasterio.open``.
    """
    file = Path(file)

    # parents=True creates every missing level; exist_ok=True avoids the
    # check-then-create race of a separate exists() test.
    file.parent.mkdir(parents=True, exist_ok=True)

    if arr.ndim == 2:
        # Promote a single-band image to (height, width, 1).
        arr = arr[:, :, None]

    height, width, bands = arr.shape
    with rasterio.open(
            file,
            'w',
            driver='GTiff',
            height=height,
            width=width,
            count=bands,
            dtype=arr.dtype,
            crs=crs,
            transform=transform,
    ) as dst:
        for i in range(bands):
            # rasterio band indexes are 1-based.
            dst.write(arr[:, :, i], i + 1)


In [5]:
# Year tag used in the folder names and file names below.
year = 2020

# NOTE(review): machine-specific absolute path -- adjust when running on
# another machine.
data_folder = Path('C:/Users/shafner/slum_extent_mapping/land_cover_classification_v3/data')
# Per-year input folders: Sentinel-2 patches and the matching label rasters.
sentinel2_folder = data_folder.joinpath(f'sentinel2_{year}')
labels_folder = data_folder.joinpath(f'labels_{year}')

def get_patch_ids(folder):
    """Return the sorted patch ids of all ``.tif`` files below ``folder``.

    A patch id is the last two dash-separated fields of the file stem, e.g.
    ``labels_Pan2020-0000001024-0000002048.tif`` -> ``0000001024-0000002048``.

    Args:
        folder: Directory (``Path``) that is searched recursively.

    Returns:
        List of id strings in ascending order.
    """
    # Only regular .tif files count: the bare '**/*' pattern would also
    # yield sub-directories and other files, producing ids with no raster
    # behind them.
    ids = [
        '-'.join(f.stem.split('-')[-2:])
        for f in folder.glob('**/*.tif')
        if f.is_file()
    ]
    # glob order depends on the filesystem; sort for a reproducible order.
    return sorted(ids)

# Patch ids come from the label folder; every id is expected to have a
# Sentinel-2 patch with the same id (read_tif raises if one is missing).
patch_ids = get_patch_ids(labels_folder)

# Per-class sample blocks are gathered in lists and joined once at the end;
# concatenating inside the loop would re-copy the growing arrays each time.
feature_chunks = []
label_chunks = []
for patch_id in patch_ids:

    print(patch_id)

    s2_file = sentinel2_folder / f's2_Pan{year}-{patch_id}.tif'
    features_raw, _, _ = read_tif(s2_file)
    label_file = labels_folder / f'labels_Pan{year}-{patch_id}.tif'
    labels_raw, _, _ = read_tif(label_file)

    # read_tif returns (height, width, bands). Drop only the band axis so a
    # patch with height or width 1 keeps its 2-D shape; this raises if the
    # label raster has more than one band.
    label_map = np.squeeze(labels_raw, axis=2)

    # Raster class values 1..6 are stored as zero-based labels 0..5.
    for label in range(1, 7):
        class_bool = label_map == label
        if not class_bool.any():
            continue

        # Boolean mask over (height, width) -> (n_pixels, n_bands).
        class_features = features_raw[class_bool]

        # Keep only pixels without a NaN in any band.
        isvalid = ~np.isnan(class_features).any(axis=1)
        class_features = class_features[isvalid]

        feature_chunks.append(class_features)
        label_chunks.append(
            np.full((len(class_features), 1), fill_value=label - 1)
        )

if not feature_chunks:
    raise ValueError(f'No labelled samples found in {labels_folder}')

features_prep = np.concatenate(feature_chunks, axis=0)
labels_prep = np.concatenate(label_chunks, axis=0)

print(features_prep.shape, labels_prep.shape)
features_output_file = data_folder / f'features_Pan{year}.npy'
labels_output_file = data_folder / f'labels_Pan{year}.npy'

np.save(str(features_output_file), features_prep)
np.save(str(labels_output_file), labels_prep)



0000000000-0000000000
0000000000-0000001024
0000000000-0000002048
0000000000-0000003072
0000001024-0000000000
0000001024-0000001024
0000001024-0000002048
0000001024-0000003072
0000002048-0000000000
0000002048-0000001024
0000002048-0000002048
0000002048-0000003072
0000003072-0000000000
0000003072-0000001024
0000003072-0000002048
0000003072-0000003072
(89292, 76) (89292, 1)


In [7]:
# Year tag used in the folder names and file names below.
year = 2020

# NOTE(review): machine-specific absolute path -- adjust when running on
# another machine.
data_folder = Path('C:/Users/shafner/slum_extent_mapping/land_cover_classification_v3/data')
# Per-year input folders: Sentinel-2 patches and the matching label rasters.
sentinel2_folder = data_folder.joinpath(f'sentinel2_{year}')
labels_folder = data_folder.joinpath(f'labels_{year}')

def get_patch_ids(folder):
    """Return the sorted patch ids of all ``.tif`` files below ``folder``.

    A patch id is the last two dash-separated fields of the file stem, e.g.
    ``labels_Pan2020-0000001024-0000002048.tif`` -> ``0000001024-0000002048``.

    Args:
        folder: Directory (``Path``) that is searched recursively.

    Returns:
        List of id strings in ascending order.
    """
    # Only regular .tif files count: the bare '**/*' pattern would also
    # yield sub-directories and other files, producing ids with no raster
    # behind them.
    ids = [
        '-'.join(f.stem.split('-')[-2:])
        for f in folder.glob('**/*.tif')
        if f.is_file()
    ]
    # glob order depends on the filesystem; sort for a reproducible order.
    return sorted(ids)

# Patch ids come from the label folder; every id is expected to have a
# Sentinel-2 patch with the same id (read_tif raises if one is missing).
patch_ids = get_patch_ids(labels_folder)

# Zero-based label left out of this export. The output files carry the
# `_nobuilding` suffix, so this is presumably the building class --
# TODO confirm against the label legend.
excluded_label = 1

# Per-class sample blocks are gathered in lists and joined once at the end;
# concatenating inside the loop would re-copy the growing arrays each time.
feature_chunks = []
label_chunks = []
for patch_id in patch_ids:

    print(patch_id)

    s2_file = sentinel2_folder / f's2_Pan{year}-{patch_id}.tif'
    features_raw, _, _ = read_tif(s2_file)
    label_file = labels_folder / f'labels_Pan{year}-{patch_id}.tif'
    labels_raw, _, _ = read_tif(label_file)

    # read_tif returns (height, width, bands). Drop only the band axis so a
    # patch with height or width 1 keeps its 2-D shape; this raises if the
    # label raster has more than one band.
    label_map = np.squeeze(labels_raw, axis=2)

    # Raster class values 1..6 are stored as zero-based labels 0..5.
    for label in range(1, 7):
        new_label = label - 1
        if new_label == excluded_label:
            # Skip before building the mask; nothing of this class is kept.
            continue

        class_bool = label_map == label
        if not class_bool.any():
            continue

        # Boolean mask over (height, width) -> (n_pixels, n_bands).
        class_features = features_raw[class_bool]

        # Keep only pixels without a NaN in any band.
        isvalid = ~np.isnan(class_features).any(axis=1)
        class_features = class_features[isvalid]

        feature_chunks.append(class_features)
        label_chunks.append(
            np.full((len(class_features), 1), fill_value=new_label)
        )

if not feature_chunks:
    raise ValueError(f'No labelled samples found in {labels_folder}')

features_prep = np.concatenate(feature_chunks, axis=0)
labels_prep = np.concatenate(label_chunks, axis=0)

print(features_prep.shape, labels_prep.shape)
features_output_file = data_folder / f'features_Pan{year}_nobuilding.npy'
labels_output_file = data_folder / f'labels_Pan{year}_nobuilding.npy'

np.save(str(features_output_file), features_prep)
np.save(str(labels_output_file), labels_prep)



0000000000-0000000000
0000000000-0000001024
0000000000-0000002048
0000000000-0000003072
0000001024-0000000000
0000001024-0000001024
0000001024-0000002048
0000001024-0000003072
0000002048-0000000000
0000002048-0000001024
0000002048-0000002048
0000002048-0000003072
0000003072-0000000000
0000003072-0000001024
0000003072-0000002048
0000003072-0000003072
(67617, 76) (67617, 1)
