# Image Processing
Découpage des images Sentinel-2, attribution des labels aux images, enregistrement des images sur le disque, calcul de la moyenne et de l'écart-type des images

* Romain Capocasale
* IADeforestation
* HES-SO MASTER

# Import

In [2]:
import rasterio
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import shapefile
import affine
import statistics
import geopandas as gpd
import spacv
import pandas as pd

from sklearn.model_selection import train_test_split
from pyproj import Transformer
from shapely.geometry.point import Point
from shapely import wkt
from sklearn.model_selection import StratifiedShuffleSplit

from IAdeforestation.preprocessing import *
from IAdeforestation.tools import *



# Load Shapefile

In [2]:
# Root folder of the label shapefiles.
LABELS_PATH = 'labels'

# Every shapefile sits in a folder carrying its own name:
# labels/<name>/<name>.shp
SHAPEFILES_PATHS = [
    os.path.join(LABELS_PATH, shapefile_name, shapefile_name + '.shp')
    for shapefile_name in (
        'central_highlands_1_other',
        'central_highlands_2_test',
        'central_highlands_2_other',
    )
]

# Presumably the EPSG code of the shapefiles (the name is spelled "ESPG");
# it is not referenced anywhere in this cell.
SHAPEFILE_ESPG = 4326

# Project helper applied to all shapefiles at once; its result is handed to
# split_export_img as the `points` argument further down.
points = process_shapefile(SHAPEFILES_PATHS)

# Create output image tree

In [3]:
# Root folder receiving the exported crops, with one sub-folder per value of
# range(-1, 33), i.e. '-1' through '32'.
OUTPUT_DIR = 'spring_images_32'

# makedirs(..., exist_ok=True) creates the root and every missing sub-folder,
# so re-running after a partial or interrupted setup still yields the complete
# tree. Previously all sub-folders were skipped as soon as the root existed.
for i in range(-1, 33):
    os.makedirs(os.path.join(OUTPUT_DIR, str(i)), exist_ok=True)

# Get all Sentinel images

In [4]:
# Folder whose entries are each processed as one Sentinel image.
SENTINEL_IMAGES_PATH = 'SentinelImages'

# os.listdir returns entries in arbitrary order. The export loop numbers the
# images by their position in this list (img_prefix), so sort the entries to
# give every image the same prefix on every run and on every machine.
paths = sorted(os.listdir(SENTINEL_IMAGES_PATH))

In [5]:
# Display the entries found in the Sentinel images folder.
paths

['a', 'b', 'c', 'd', 'e']

# Split and export images on disk

In [9]:
# Collector handed to split_export_img (which presumably appends one record per
# exported crop); it is later turned into a table with the columns
# 'path', 'label' and 'geometry'.
geo_paths = []

TARGET_IMG_SIZE = 32  # Crop size passed to split_export_img (width and height of a crop)
# Number of splits of the large image. Caution: the large image must be larger
# than TARGET_IMG_SIZE * NUMBER_OF_SPLIT.
NUMBER_OF_SPLIT = 343

# enumerate() provides the image number used as file prefix, replacing the
# hand-maintained counter.
for i, sentinel_image_path in enumerate(paths):
    image_dir = os.path.join(SENTINEL_IMAGES_PATH, sentinel_image_path)
    raster_paths = get_raster_paths(image_dir)  # Get each raster path
    raster_dict = load_raster_img(raster_paths)  # Load each raster
    image_dict = resample_bands(raster_dict)

    # Stack all bands into one array; the band order is the iteration order of
    # image_dict.
    final_img = np.asarray(list(image_dict.values()))

    # The 'B02' raster is passed next to the stacked image (presumably as the
    # georeferencing source for the crops - confirm in split_export_img).
    split_export_img(final_img,
                     raster_dict['B02'],
                     points,
                     img_prefix=str(i),
                     crop_size=TARGET_IMG_SIZE,
                     split_size=NUMBER_OF_SPLIT,
                     export_folder=OUTPUT_DIR,
                     geo_paths=geo_paths)
    print(f"{sentinel_image_path} ready")

a ready
b ready
c ready
d ready
e ready


In [8]:
# Folder in which the train / test CSV files of the following cells are written.
SAVED_DATASET_PATH = "datasets"

## Save datasets on disk

In [None]:
# to_csv does not create missing folders, so make sure the destination exists.
os.makedirs(SAVED_DATASET_PATH, exist_ok=True)

# One row per record collected in geo_paths: image path, label and geometry.
all_crops = gpd.GeoDataFrame(pd.DataFrame(geo_paths, columns=['path', 'label', 'geometry']))

# The row index is written too (pandas default), exactly as before, so the
# file layout read back by the following cells is unchanged.
all_crops.to_csv(os.path.join(SAVED_DATASET_PATH, 'start_all.csv'))

# Stratified (non-spatial) train / test split of the dataset

In [6]:
# Reload the full dataset and convert the WKT strings of the 'geometry' column
# back into shapely geometries.
dataset_csv = os.path.join('datasets', 'start_all.csv')
all_data = gpd.GeoDataFrame(pd.read_csv(dataset_csv))
all_data['geometry'] = all_data['geometry'].apply(wkt.loads)

# Binary relabelling: label 2 becomes class 0, every other label becomes class 1.
is_label_two = all_data['label'] == 2
all_data.loc[~is_label_two, 'label'] = 1
all_data.loc[is_label_two, 'label'] = 0

# One stratified 80 / 20 shuffle split. No random_state is set, so the split
# differs from one run to the next.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
train_index, test_index = next(sss.split(all_data['path'].to_numpy(), all_data['label']))
train_set = all_data.iloc[train_index]
test_set = all_data.iloc[test_index]

# Write both subsets without the row index.
for subset, file_name in ((train_set, "strat_train.csv"), (test_set, "strat_test.csv")):
    subset.to_csv(os.path.join(SAVED_DATASET_PATH, file_name), index=False)

In [38]:
# Raw label ids treated as "culture".
# Original note: 6 rubber, 25 cayenne pepper, 16 cassava, 23 intercrop.
# NOTE(review): ids 25, 16 and 23 are mentioned in that note but are not part
# of the list - confirm that they were left out on purpose.
culture_list = [
    1, 2, 6,
    15, 18, 19, 17,
]

# Raw label ids treated as "no culture".
# Original note: 11 other tree, 27 pine trees.
# NOTE(review): id 11 is mentioned in that note but is not part of the list.
no_culture_list = [
    4, 9, 10,
    24, 27,
]

## Coffee vs other
### 64x64

In [10]:
# Load the dataset and parse the WKT 'geometry' column into shapely objects.
# NOTE(review): this cell reads 'strat_all.csv' while other cells read
# 'start_all.csv' - confirm which file name is the intended one.
source_csv = os.path.join('datasets', 'strat_all.csv')
all_data = gpd.GeoDataFrame(pd.read_csv(source_csv))
all_data['geometry'] = all_data['geometry'].apply(wkt.loads)

# Binary relabelling: label 2 (presumably coffee, going by the section title)
# becomes class 1, every other label becomes class 0.
is_label_two = all_data['label'] == 2
all_data.loc[~is_label_two, 'label'] = 0
all_data.loc[is_label_two, 'label'] = 1

# Single stratified split keeping 20 % of the rows for the test set; the split
# is not seeded and therefore changes between runs.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
train_index, test_index = next(sss.split(all_data['path'].to_numpy(), all_data['label']))
train_set = all_data.iloc[train_index]
test_set = all_data.iloc[test_index]

train_csv = os.path.join(SAVED_DATASET_PATH, "10strat_train.csv")
test_csv = os.path.join(SAVED_DATASET_PATH, "10strat_test.csv")
train_set.to_csv(train_csv, index=False)
test_set.to_csv(test_csv, index=False)

### 32x32

In [None]:
# Load the 32x32 dataset and parse the WKT 'geometry' column into shapely objects.
all_data = gpd.GeoDataFrame(pd.read_csv(os.path.join('datasets', 'strat_all_32.csv')))
all_data['geometry'] = all_data['geometry'].apply(wkt.loads)

# Keep only the rows whose raw label belongs to one of the two lists. The
# explicit copy makes the label writes below act on an independent frame
# instead of a slice of the loaded one (avoids SettingWithCopyWarning).
all_data = all_data[all_data['label'].isin(culture_list + no_culture_list)].copy()

# Fix: the masks used to be built from `train_data`, a name that is never
# defined in this notebook (NameError, or a misaligned mask if it survived
# from an earlier session). They must come from the frame being relabelled.
is_label_two = all_data['label'] == 2
all_data.loc[~is_label_two, 'label'] = 1
all_data.loc[is_label_two, 'label'] = 0
# NOTE(review): here label 2 maps to class 0, whereas the 64x64 cell of this
# section maps it to class 1 - confirm which polarity is intended.

# Single stratified 80 / 20 split (unseeded, so it differs between runs).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
train_index, test_index = next(sss.split(all_data['path'].to_numpy(), all_data['label']))
train_set = all_data.iloc[train_index]
test_set = all_data.iloc[test_index]

train_set.to_csv(os.path.join(SAVED_DATASET_PATH, "start_train_32.csv"), index=False)
test_set.to_csv(os.path.join(SAVED_DATASET_PATH, "start_test_32.csv"), index=False)

## Culture vs no-culture
### 64x64

In [56]:
# Load the dataset and parse the WKT 'geometry' column into shapely objects.
all_data = gpd.GeoDataFrame(pd.read_csv(os.path.join('datasets', 'start_all.csv')))
all_data['geometry'] = all_data['geometry'].apply(wkt.loads)

# Keep only rows labelled with an id from one of the two lists; copy so the
# relabelling writes to an independent frame (avoids SettingWithCopyWarning).
all_data = all_data[all_data['label'].isin(culture_list + no_culture_list)].copy()

# Both masks are computed BEFORE any label is overwritten. The target classes
# (0 and 1) share values with the raw ids (1 is in culture_list), so in-place
# relabelling used to be correct only thanks to the order of the two statements.
is_culture = all_data['label'].isin(culture_list)
is_no_culture = all_data['label'].isin(no_culture_list)
all_data.loc[is_culture, 'label'] = 0
all_data.loc[is_no_culture, 'label'] = 1

# Single stratified 80 / 20 split (unseeded, so it differs between runs).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
train_index, test_index = next(sss.split(all_data['path'].to_numpy(), all_data['label']))
train_set = all_data.iloc[train_index]
test_set = all_data.iloc[test_index]

train_set.to_csv(os.path.join(SAVED_DATASET_PATH, "strat_train_culture.csv"), index=False)
test_set.to_csv(os.path.join(SAVED_DATASET_PATH, "strat_test_culture.csv"), index=False)

### 32x32

In [59]:
# Load the dataset and parse the WKT 'geometry' column into shapely objects.
# NOTE(review): this 32x32 cell reads the same 'start_all.csv' as the 64x64
# cell - confirm that a dedicated 32x32 file is not expected here.
all_data = gpd.GeoDataFrame(pd.read_csv(os.path.join('datasets', 'start_all.csv')))
all_data['geometry'] = all_data['geometry'].apply(wkt.loads)

# Keep only rows labelled with an id from one of the two lists; copy so the
# relabelling writes to an independent frame (avoids SettingWithCopyWarning).
all_data = all_data[all_data['label'].isin(culture_list + no_culture_list)].copy()

# Compute both masks before overwriting any label: the target classes (0 and 1)
# share values with the raw ids, so the result must not depend on write order.
is_culture = all_data['label'].isin(culture_list)
is_no_culture = all_data['label'].isin(no_culture_list)
all_data.loc[is_culture, 'label'] = 0
all_data.loc[is_no_culture, 'label'] = 1

# Single stratified 80 / 20 split (unseeded, so it differs between runs).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
train_index, test_index = next(sss.split(all_data['path'].to_numpy(), all_data['label']))
train_set = all_data.iloc[train_index]
test_set = all_data.iloc[test_index]

train_set.to_csv(os.path.join(SAVED_DATASET_PATH, "strat_train_culture_32.csv"), index=False)
# Fix: the test set used to be written to "strat_train_culture_32.csv", which
# overwrote the train set and never produced a test file.
test_set.to_csv(os.path.join(SAVED_DATASET_PATH, "strat_test_culture_32.csv"), index=False)

## Compute Mean/Std of images dataset
The per-band mean and standard deviation computed by the code below are used to normalize the images with z-score normalization (z-norm).

In [28]:
# Folder whose entries are each processed as one Sentinel image.
SENTINEL_IMAGES_PATH = 'SentinelImages'

# This cell rebinds the notebook-wide `paths`. os.listdir returns entries in
# arbitrary order, so sort them to keep the processing order (and any
# position-based numbering derived from `paths`) identical on every run.
paths = sorted(os.listdir(SENTINEL_IMAGES_PATH))
paths

['a', 'b', 'c', 'd', 'e']

In [29]:
# Number of pixel values drawn, without replacement, per band and per image.
NB_SELECTED_VAL = 100000

# Band name -> 1-D array of sampled pixel values; starts empty and grows by
# NB_SELECTED_VAL values for every processed image.
dict_val = {
    band: np.array([])
    for band in ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07',
                 'B08', 'B09', 'B10', 'B11', 'B12', 'B8A']
}

for path in paths:
    raster_paths = get_raster_paths(os.path.join(SENTINEL_IMAGES_PATH, path))  # Get each raster path
    raster_dict = load_raster_img(raster_paths)  # Load each raster
    image_dict = resample_bands(raster_dict)

    # Stack all bands into one array; band order is the iteration order of image_dict.
    final_img = np.asarray(list(image_dict.values()))

    # NOTE(review): bands are matched by position - final_img[i] is assumed to
    # be the band named by the i-th key of dict_val. Confirm that image_dict
    # iterates in that same order (B01 ... B12, B8A).
    for i, band_name in enumerate(dict_val):
        # ravel() gives a flat view where possible; flatten() always copied the
        # whole band just to sample from it. The values sampled from are identical.
        sample = np.random.choice(final_img[i].ravel(), NB_SELECTED_VAL, replace=False)
        dict_val[band_name] = np.append(dict_val[band_name], sample)
    print(path)

a
b
c
d
e


In [30]:
# Per-band mean of the sampled values (same key order as dict_val).
dict_mean = {
    band_name: statistics.mean(sampled_values)
    for band_name, sampled_values in dict_val.items()
}

# Per-band sample standard deviation (statistics.stdev) of the same values.
dict_std = {
    band_name: statistics.stdev(sampled_values)
    for band_name, sampled_values in dict_val.items()
}

In [34]:
# Display the per-band means, in the key order of dict_mean.
dict_mean.values()

dict_values([1279.534254, 1016.734146, 925.27579, 793.929164, 1073.835362, 1909.174038, 2299.416608, 2270.341238, 739.972412, 14.35029, 1872.530084, 1055.580112, 2581.31964])

In [35]:
# Display the per-band standard deviations, in the key order of dict_std.
dict_std.values()

dict_values([217.06847657849937, 236.49447129038893, 254.3062726895201, 383.2039520109347, 368.1552150776269, 508.3797488499248, 648.9503962237852, 672.6241209212196, 250.63210405653427, 9.4263874644246, 805.0923719290897, 632.9663115986274, 746.86130213707])