# Train Pixel Level Annotation Model

### This notebook uses pixel level annotations to train a Random Forest Classifier to predict labels

We assume pixel-level annotations are available, as produced by the "../data/annotations/transform_polygon_annotations_to_pixels.ipynb" notebook. Feature selection and grid search for optimal parameters have been done in a separate notebook ("Coepelduynen/make_train_model_on_annotations_coepelduynen.ipynb"), and those outcomes are taken as given in this notebook.

Change the variables in the "Set Variables" cell below as desired and then run the entire notebook to get cross-validation results as well as a final model trained on all data.

Date: 2024-01-12\
Author: Pieter Kouyzer, Michael de Winter

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local project code are picked up.
%load_ext autoreload
%autoreload 2

In [1]:
%matplotlib notebook
import pandas as pd
import geopandas as gpd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pprint
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from training.train import train_imbalanced_model, cross_validation_balance_on_date
from training.utils import get_cross_validation_results_filepath, get_model_filepath
from training.metric_calculation import calculate_average_metrics, get_metrics
from sklearn.metrics import f1_score
from tif_model_iterator import tif_kernel_iterator
from filenames.file_name_generator import OutputFileNameGenerator
import glob
import contextlib
import io
import numpy as np
import requests

# Directory holding the pixel-annotation dataframes, located relative to the parent
# of the current working directory. Backslashes are normalised to forward slashes so
# the resulting string looks the same on Windows and POSIX systems.
_parent_of_cwd = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = _parent_of_cwd.replace("\\", "/") + "/data/annotations/annotations_pixel_dataframes/"


def download_file(url, local_path, timeout=60):
    """Download ``url`` and store the response body at ``local_path``.

    The body is streamed to a temporary ``<local_path>.part`` file and only
    renamed into place once the transfer has completed. An interrupted download
    therefore never leaves a truncated file behind that a later
    ``os.path.isfile(local_path)`` check would mistake for a complete one.

    Parameters
    ----------
    url : str
        Address of the file to fetch with an HTTP GET request.
    local_path : str
        Destination path; missing parent directories are created.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection raises instead of
        blocking the notebook forever. Defaults to 60 seconds.

    Returns
    -------
    bool
        ``True`` when the file was written, ``False`` when the server answered
        with a status code other than 200 (a message is printed in that case).
    """
    partial_path = f"{local_path}.part"

    target_dir = os.path.dirname(local_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    try:
        # stream=True avoids holding the whole (potentially large) file in memory.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Only a plain 200 counts as success, as before.
            if response.status_code != 200:
                print("Failed to download file:", response.status_code)
                return False

            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        # Atomic rename: the final path only ever holds a complete file.
        os.replace(partial_path, local_path)
    finally:
        # Clean up the leftover partial file if anything went wrong mid-transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"file has been downloaded to {local_path}")
    return True



In [2]:
# Display the resolved annotations directory as a quick sanity check.
data_dir

'c:/repos/satellite-images-nso-datascience/data/annotations/annotations_pixel_dataframes/'

In [2]:
# Set Variables
location = "Nieuwkoopse_plassen"
satellite_constellation = "Superview"


def _ensure_annotation_file(filename, url):
    """Return the path of ``filename`` inside ``data_dir``, downloading it from ``url`` when absent."""
    local_path = data_dir + filename
    if not os.path.isfile(local_path):
        print("Downloading file")
        download_file(url, local_path)
    return local_path


# Stays None when the location / constellation combination below is not supported,
# which lets the final check fail loudly instead of leaving `df` undefined (or stale).
annotated_pixels_filepath = None

if location == "Voornes Duin":

    if satellite_constellation == "PNEO":
        annotated_pixels_filepath = _ensure_annotation_file(
            "Voornes_Duin_PNEO_2024-01-29_pixel_annotations.parquet",
            "https://e34a505986aa74678a5a0e0f.blob.core.windows.net/satellite-images-nso/Voornes_Duin/annotations_pixel_dataframes/2023_PNEO/Voornes_Duin_PNEO_2024-01-29_pixel_annotations.parquet",
        )
        # Fix: this branch previously only set the path and never loaded the data.
        df = pd.read_parquet(annotated_pixels_filepath)

    #TODO: Missing Superview here!


elif location == "Coepelduynen":
    if satellite_constellation == "PNEO":
        annotated_pixels_filepath = _ensure_annotation_file(
            "annotaties_coepelduynen_to_pixel_2023.parquet",
            "https://e34a505986aa74678a5a0e0f.blob.core.windows.net/satellite-images-nso/coepelduynen/annotations_pixel_dataframes/annotations_2023_PNEO/annotaties_coepelduynen_to_pixel_2023.parquet",
        )
        df = pd.read_parquet(annotated_pixels_filepath)

    elif satellite_constellation == "Superview":
        annotated_pixels_filepath = _ensure_annotation_file(
            "annotaties_coepelduynen_to_pixel.csv",
            "https://e34a505986aa74678a5a0e0f.blob.core.windows.net/satellite-images-nso/coepelduynen/annotations_pixel_dataframes/annotations_2019_2022_Superview/annotaties_coepelduynen_to_pixel.csv",
        )
        # Fix: this is a CSV file, so it cannot be read with pd.read_parquet.
        # NOTE(review): default read_csv options are assumed to fit this file — confirm
        # (separator, index column).
        df = pd.read_csv(annotated_pixels_filepath)


elif location == "Schippersgat":
    if satellite_constellation == "PNEO":
        # Fix: download to the file name that is actually read below. Previously the
        # download went to "PNEO_waterplanten_annotations.parquet" while the
        # "..._schippersgat.parquet" file was loaded.
        annotated_pixels_filepath = _ensure_annotation_file(
            "PNEO_waterplanten_annotations_schippersgat.parquet",
            "https://e34a505986aa74678a5a0e0f.blob.core.windows.net/satellite-images-nso/Schippersgat/PNEO_waterplanten_annotations.parquet",
        )
        df = pd.read_parquet(annotated_pixels_filepath)

    elif satellite_constellation == "Superview":
        # NOTE(review): this local file name is shared with the Nieuwkoopse_plassen
        # Superview file although the download URLs differ; a file cached for one
        # location would silently be reused for the other — confirm and rename if needed.
        annotated_pixels_filepath = _ensure_annotation_file(
            "Superview_waterplanten_annotations.parquet",
            "https://e34a505986aa74678a5a0e0f.blob.core.windows.net/satellite-images-nso/Schippersgat/Superview_waterplanten_annotations.parquet",
        )
        df = pd.read_parquet(annotated_pixels_filepath)


elif location == "Nieuwkoopse_plassen":
    if satellite_constellation == "PNEO":
        # Same file-name fix as for Schippersgat PNEO: download to the name that is read.
        annotated_pixels_filepath = _ensure_annotation_file(
            "PNEO_waterplanten_annotations_schippersgat.parquet",
            "https://e34a505986aa74678a5a0e0f.blob.core.windows.net/satellite-images-nso/Schippersgat/PNEO_waterplanten_annotations.parquet",
        )
        # The Schippersgat annotations are combined with the Nieuwkoopse Plassen ones.
        # The second file has no download fallback and must already be present locally.
        df = pd.concat(
            [
                pd.read_parquet(annotated_pixels_filepath),
                pd.read_parquet(data_dir + "PNEO_waterplanten_annotations_Nieuwkoopse_Plassen.parquet"),
            ]
        )

    elif satellite_constellation == "Superview":
        # NOTE(review): local file name is shared with the Schippersgat Superview file,
        # see the note in that branch.
        annotated_pixels_filepath = _ensure_annotation_file(
            "Superview_waterplanten_annotations.parquet",
            "https://e34a505986aa74678a5a0e0f.blob.core.windows.net/satellite-images-nso/Nieuwkoopse_Plassen/Superview_waterplanten_annotations.parquet",
        )
        df = pd.read_parquet(annotated_pixels_filepath)

        # TODO: for a still unknown reason NDWI and NDVI are swapped in this data when
        # it is read in, so they are swapped back here. The right-hand side is
        # evaluated completely before either column is overwritten.
        df["ndvi"], df["ndwi"] = df["ndwi"].copy(), df["ndvi"].copy()


if annotated_pixels_filepath is None:
    raise ValueError(
        f"No annotation data configured for location={location!r} "
        f"and satellite_constellation={satellite_constellation!r}"
    )


In [6]:
# Optimal features and Random Forest hyper-parameters per location / constellation.
# These come from the separate feature-selection / grid-search notebook and are
# taken as given here.

# Hyper-parameters shared by every configuration except Nieuwkoopse_plassen Superview.
_BASE_RF_PARAMETERS = {
    "n_estimators": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 1,
    "bootstrap": False,
}

if location == "Voornes Duin":
    selected_features = ['r', 'g', 'b', 'n', 'e', 'd', 'ndvi', 're_ndvi', 'height']
    # Fix: max_features="auto" is rejected by scikit-learn >= 1.3. For classifiers it
    # was an alias of "sqrt", so the resulting model configuration is unchanged.
    optimal_parameters = {**_BASE_RF_PARAMETERS, "max_features": "sqrt"}

elif location == "Coepelduynen":
    selected_features = ['r', 'g', 'b', 'n', 'e', 'd', 'ndvi', 're_ndvi']
    optimal_parameters = dict(_BASE_RF_PARAMETERS)

elif location == "Schippersgat" and satellite_constellation == "Superview":
    selected_features = ['r', 'g', 'b', 'i', 'ndvi', 'height']
    optimal_parameters = dict(_BASE_RF_PARAMETERS)

elif location == "Schippersgat" and satellite_constellation == "PNEO":
    selected_features = ['r', 'g', 'b', 'n', 'e', 'd', 'ndvi', 're_ndvi']
    optimal_parameters = dict(_BASE_RF_PARAMETERS)

elif location == "Nieuwkoopse_plassen" and satellite_constellation == "PNEO":
    selected_features = ['r', 'g', 'b', 'n', 'e', 'd', 'ndvi', 're_ndvi']
    optimal_parameters = dict(_BASE_RF_PARAMETERS)

elif location == "Nieuwkoopse_plassen" and satellite_constellation == "Superview":
    selected_features = ['r', 'g', 'b', 'i', 'ndvi', 'ndwi']
    optimal_parameters = {
        "n_estimators": 40,
        "min_samples_split": 15,
        "min_samples_leaf": 5,
        "bootstrap": True,
    }

else:
    # Fail here rather than with a NameError (or stale values) further down.
    raise ValueError(
        f"No features / parameters configured for location={location!r} "
        f"and satellite_constellation={satellite_constellation!r}"
    )

In [5]:
# Normalise the mis-capitalised spelling "GRound" so those rows are counted as "Ground".
df['label'] = df['label'].str.replace("GRound","Ground" )

In [6]:
# Show the number of annotated pixels per label; this gives an indication of
# how (im)balanced the classes are.
df['label'].value_counts()

label
Water          3814785
Ground         2542204
Waterplants     235613
Name: count, dtype: int64

In [20]:
# Downsample Water if necessary
#df = pd.concat([df[df["label"] == "Water"].sample(1719330, random_state=1), df[df["label"] == "Ground"], df[df["label"] == "Waterplants"]])

### Cross Validation

We do cross-validation, where the folds are decided by the 'date' column. This prevents pixels from the same image from ending up in both the train and test datasets. We display the metrics averaged over the folds and write the results to a pickle.

In [7]:
# Estimator and feature scaler that are passed to the date-based cross-validation below.
# NOTE(review): no random_state is given to the classifier itself, so results may differ
# between runs unless the training helper seeds it — confirm.
model = RandomForestClassifier(**optimal_parameters)
scaler = StandardScaler()

In [8]:
# Cross-validation settings per location as (cv, sampling_type_boundary).
# `cv` is the number of folds requested from the date-based cross-validation;
# `sampling_type_boundary` is forwarded unchanged to the training helpers
# (presumably the class-size threshold that decides how rebalancing is done —
# verify in training.train).
_CV_SETTINGS = {
    "Voornes Duin": (5, 100000),
    "Coepelduynen": (4, 898609),
    "Schippersgat": (2, 898609),
    "Nieuwkoopse_plassen": (7, 898609),
}

if location not in _CV_SETTINGS:
    # Previously an unknown location silently left these variables undefined or stale.
    raise ValueError(f"No cross-validation settings configured for location={location!r}")

# The same seed is used for every location.
random_state = 1337
cv, sampling_type_boundary = _CV_SETTINGS[location]

In [9]:
# Keep only the part of `date` before the first underscore
# (e.g. "20190302_113613" becomes "20190302").
df['date'] = df['date'].str.split('_').str[0]

In [10]:
# Run the cross-validation whose folds are built from the 'date' column, using the
# model, scaler and settings configured in the cells above.
results = cross_validation_balance_on_date(
    data=df,
    model=model,
    scaler=scaler,
    features=selected_features,
    cv=cv,
    random_state=random_state,
    sampling_type_boundary=sampling_type_boundary,
)

---------fold: 1
Picked hold out dates: 
['20190302', '20190629']
Oversampling to rebalance dataset


2024/06/25 12:17:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a94fa82273474862a5b5e540e8326f03', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/25 12:18:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '35bc2809a2e3408ab190eda484de08bc', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting model




<IPython.core.display.Javascript object>

Calculating train metrics
Calculating test metrics
{'Ground': {'precision': 0.9858749205340115, 'recall': 0.7865622672013441, 'f1-score': 0.8750121223341886, 'support': 126182}, 'Water': {'precision': 0.9682926521602019, 'recall': 0.9856621895785717, 'f1-score': 0.9769002182586397, 'support': 389390}, 'Waterplants': {'precision': 0.6962643316114401, 'recall': 0.9830779835894243, 'f1-score': 0.815178810145021, 'support': 44971}}
---------fold: 2
Picked hold out dates: 
['20220318', '20220705']
Oversampling to rebalance dataset


2024/06/25 12:48:31 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0e1752aed0e840c8985684463abd73a9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/25 12:49:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '591ba1aa4cca4e8884c898dc826f62f6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting model




<IPython.core.display.Javascript object>

Calculating train metrics
Calculating test metrics
{'Ground': {'precision': 0.9422789415342637, 'recall': 0.9789170323213116, 'f1-score': 0.9602486343071317, 'support': 169331}, 'Water': {'precision': 0.9538601260961871, 'recall': 0.9999173462640512, 'f1-score': 0.9763458730445778, 'support': 326664}, 'Waterplants': {'precision': 0.9955766710353866, 'recall': 0.21352025578862302, 'f1-score': 0.3516273687255894, 'support': 28461}}
---------fold: 3
Picked hold out dates: 
['20220803', '20190826']
Oversampling to rebalance dataset


2024/06/25 13:21:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8c4ef09b215f4311acf456813ee7444e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/25 13:21:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e2473612ef154b868202e2260d072263', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting model




<IPython.core.display.Javascript object>

Calculating train metrics
Calculating test metrics
{'Ground': {'precision': 0.9273441522530941, 'recall': 0.7022383133226059, 'f1-score': 0.7992434959316188, 'support': 484740}, 'Water': {'precision': 0.8765671065142739, 'recall': 0.9997317459731113, 'f1-score': 0.9341070189286437, 'support': 1136982}, 'Waterplants': {'precision': 0.5849358974358975, 'recall': 0.03274717387403553, 'f1-score': 0.06202209005947323, 'support': 44584}}
---------fold: 4
Picked hold out dates: 
['20221012', '20210423']
Oversampling to rebalance dataset


2024/06/25 13:43:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '50272e94bf6e4668a7f2ddc76ae04753', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/25 13:44:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3b7c92785aa44879b2a1149d91a8d29f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting model




<IPython.core.display.Javascript object>

Calculating train metrics
Calculating test metrics
{'Ground': {'precision': 0.9560874997684266, 'recall': 0.9456000162868108, 'f1-score': 0.9508148396830303, 'support': 491195}, 'Water': {'precision': 0.9490166172617803, 'recall': 0.9854021124152288, 'f1-score': 0.9668671679197997, 'support': 704691}, 'Waterplants': {'precision': 0.042767295597484274, 'recall': 0.0015161649944258639, 'f1-score': 0.0029285099052540915, 'support': 22425}}
---------fold: 5
Picked hold out dates: 
['20200713', '20210617']
Oversampling to rebalance dataset


2024/06/25 14:11:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '38c75bbfe61741e69763e55598ba8950', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/25 14:12:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1a387c4a00a9499f9c01b871c0290cc0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting model




<IPython.core.display.Javascript object>

Calculating train metrics
Calculating test metrics
{'Ground': {'precision': 0.9937102775006744, 'recall': 0.9075591489654303, 'f1-score': 0.9486828549523216, 'support': 349034}, 'Water': {'precision': 0.9910944918694187, 'recall': 0.9970055434567412, 'f1-score': 0.9940412302453828, 'support': 444488}, 'Waterplants': {'precision': 0.5899994215975476, 'recall': 0.9820685007341083, 'f1-score': 0.7371435281790014, 'support': 41547}}
---------fold: 6
Picked hold out dates: 
['20210614', '20220811']
Oversampling to rebalance dataset


2024/06/25 14:42:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7f4e1314bc9c40c9870b8a39dc7b6c60', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/25 14:43:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '022f34f63eec461f90b273cfc2c52e2b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting model




<IPython.core.display.Javascript object>

Calculating train metrics
Calculating test metrics
{'Ground': {'precision': 0.7344342386738303, 'recall': 0.990044921577585, 'f1-score': 0.8432956512222705, 'support': 735504}, 'Water': {'precision': 0.9774280890094182, 'recall': 0.4281749822653109, 'f1-score': 0.5954885319049298, 'support': 528625}, 'Waterplants': {'precision': 0.18435414545865852, 'recall': 0.4396601139697205, 'f1-score': 0.25977984758679085, 'support': 29657}}
---------fold: 7
Picked hold out dates: 
['20190416', '20190409']
Oversampling to rebalance dataset


2024/06/25 15:13:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '14cf8fc5ca1f415692d5c9ff3b61ed57', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/25 15:14:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd452758f07604b4e81c819bac85598f6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting model




<IPython.core.display.Javascript object>

Calculating train metrics
Calculating test metrics


  recall = confusion_matrix[i, i] / confusion_matrix[i, :].sum()


{'Ground': {'precision': 0.9960133744855967, 'recall': 0.32928021767781984, 'f1-score': 0.49493561683228426, 'support': 23521}, 'Water': {'precision': 0.8912287801811047, 'recall': 0.9997486887226091, 'f1-score': 0.942374850034005, 'support': 123353}, 'Waterplants': {'precision': 0.0, 'recall': nan, 'f1-score': nan, 'support': 0}}


In [11]:
# Summarise the per-fold results as one precision / recall / f1-score row per label.
calculate_average_metrics(results=results)

Unnamed: 0,precision,recall,f1-score
Ground,0.888229,0.892027,0.878536
Water,0.936239,0.912472,0.907345
Waterplants,0.551228,0.499053,0.414979


In [75]:
# Pickle the cross-validation results; the helper derives the file name from the
# location, constellation and data.
cross_validation_results_filepath = get_cross_validation_results_filepath(
    location=location,
    satellite_constellation=satellite_constellation,
    df=df,
)
print(f"Saving to {cross_validation_results_filepath}")
with open(cross_validation_results_filepath, "wb") as results_file:
    pickle.dump(results, results_file)

Saving to ../saved_models/Superview_Nieuwkoopse_plassen_20190629_to_20221012_cross_validation_results.pkl


## Cross validation randomised

In [13]:
# Initialize KFold
# Rows are shuffled into 4 folds regardless of their date, so pixels from the same
# image can land in both the train and the test split. The scores from this cell are
# therefore expected to be far more optimistic than the date-based results above.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# The features are passed unscaled here (no StandardScaler is applied).
scores = cross_val_score(model, df[selected_features], df["label"], cv=kf)

2024/06/19 16:17:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '84ba19211c724bb09499c97fb4262da7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


<IPython.core.display.Javascript object>

2024/06/19 16:24:52 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '75db5e3b677c4ac284950c28aab87e03', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


<IPython.core.display.Javascript object>

2024/06/19 16:32:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '60b74d828b764dab85a3e5bd0f6fa6c3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


<IPython.core.display.Javascript object>

2024/06/19 16:39:14 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6a7b8a2bcf8e44e7a6f1b6fb8bdfc166', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


<IPython.core.display.Javascript object>

In [14]:
# One score per fold of the randomised cross-validation (the estimator's default
# scorer, which for a scikit-learn classifier is accuracy).
scores

array([0.9962408 , 0.996042  , 0.99603984, 0.99614319])

### Try some unbalanced models

In [None]:
# Same hyper-parameters as the main model, but trained further below directly on the
# data as-is, i.e. without going through train_imbalanced_model.
unbalanced_model = RandomForestClassifier(**optimal_parameters)
unblanced_final_scaler = StandardScaler()

In [None]:
# Fit the scaler and the classifier on all annotated pixels in one go.
unbalanced_model.fit(unblanced_final_scaler.fit_transform(df[selected_features]), df['label'])

In [None]:
# Per-class F1 of the unbalanced model on the data it was trained on (in-sample).
# f1_score expects (y_true, y_pred): ground truth first, predictions second, which is
# also the order used for the final model further down in this notebook.
f1_score(
    df["label"],
    unbalanced_model.predict(unblanced_final_scaler.transform(df[selected_features])),
    average=None,
)

In [None]:
# Bundle the unbalanced model with its scaler.
# NOTE: `final_artefact` is assigned again in the "Export Definitive model" section,
# so this value only survives if that later cell is not run.
final_artefact = {
    "model": unbalanced_model,
    "scaler": unblanced_final_scaler
}

### Export Definitive model.

Trains a Random Forest Classifier model on all data and writes it to a pickle file for later use. This is the definitive model output by this notebook.

In [10]:
# Definitive model: a fresh classifier and scaler trained on every annotated pixel.
final_scaler = StandardScaler()
final_model = RandomForestClassifier(**optimal_parameters)

# The return value is not used; the helper is expected to fit `final_model` and
# `final_scaler` in place.
train_imbalanced_model(
    model=final_model,
    scaler=final_scaler,
    X_train=df[selected_features],
    y_train=df["label"],
    random_state=1337,
    sampling_type_boundary=sampling_type_boundary,
)

# Metrics computed on the training data itself, so they are in-sample figures.
final_metrics = get_metrics(
    y=df["label"],
    X=df[selected_features],
    model=final_model,
    scaler=final_scaler,
)
pprint.pprint(final_metrics)

Oversampling to rebalance dataset


2024/06/20 11:54:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4830b3c07fdc4943849083e8c88deea3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/06/20 11:55:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '40e2653d82e14b6f807c16140b4eb62e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting model




<IPython.core.display.Javascript object>

{'Ground': {'f1-score': 0.9964152263780806,
            'precision': 0.9964399204115688,
            'recall': 0.9963905335685098,
            'support': 2542204},
 'Water': {'f1-score': 0.9980307194135087,
           'precision': 0.9984138968450557,
           'recall': 0.9976478359855142,
           'support': 3814785},
 'Waterplants': {'f1-score': 0.9926857398282446,
                 'precision': 0.9863365540127207,
                 'recall': 0.9991171964195524,
                 'support': 235613}}


In [28]:
# The model is stored together with its scaler, because the scaler has to be applied
# to the features again at prediction time.
final_artefact = dict(model=final_model, scaler=final_scaler)

In [27]:
# Pickle the definitive artefact; the helper derives the file name from the
# location, constellation and data.
final_model_filepath = get_model_filepath(
    location=location,
    satellite_constellation=satellite_constellation,
    df=df,
)
print(f"Saving to {final_model_filepath}")
with open(final_model_filepath, "wb") as model_file:
    pickle.dump(final_artefact, model_file)

Saving to ../saved_models/Superview_Nieuwkoopse_plassen_20190302_113613_to_20221012_104900_random_forest_classifier.sav


# Model tests

In [15]:
# Load the artefact back from the path it was just saved to, so the checks below
# exercise the file on disk. This replaces a hard-coded absolute path that only
# existed on one machine; the context manager makes sure the file handle is closed.
# NOTE: pickle.load can execute arbitrary code, so only load model files you trust.
with open(final_model_filepath, "rb") as model_file:
    loaded_final_artefact = pickle.load(model_file)

In [25]:
# Difficult waterpixels to predict
# Each (rd_x, rd_y) pair selects annotated pixels by exact coordinate match; the first
# prediction for every pair must be "Water".
_loaded_model = loaded_final_artefact["model"]
_loaded_scaler = loaded_final_artefact['scaler']
for _pixel_x, _pixel_y in [(117582.75, 462877.75), (114123.25, 462074.25)]:
    _pixel_features = df[(df['rd_x'] == _pixel_x) & (df['rd_y'] == _pixel_y)][selected_features]
    assert _loaded_model.predict(_loaded_scaler.transform(_pixel_features))[0] == "Water"

In [32]:
# Count how many in-sample predictions of the final model match the annotated label.
(final_artefact["model"].predict(final_artefact['scaler'].transform(df[selected_features]))  == df['label']).value_counts()

label
True     5357374
False     216144
Name: count, dtype: int64

In [59]:
# Store the final model's prediction for every annotated pixel next to its label.
df['predict_label'] = final_artefact["model"].predict(final_artefact['scaler'].transform(df[selected_features])) 

In [61]:
# Per-class F1 (average=None returns one value per label) of the stored predictions.
f1_score(df['label'], df['predict_label'], average=None)

array([0.99700754, 0.99786617, 0.99201459])

In [34]:
# Glob pattern for the test .tif files used below.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
test_tif_files_dir = "E:/output/test/Nieuwkoopse_plassen/*SV*.tif"

In [35]:
# Run the final model over every test .tif; each prediction is written to a .parquet
# file named after the .tif it came from.
for a_tif_file in glob.glob(test_tif_files_dir):
    a_tif_file = a_tif_file.replace("\\", "/")
    print(a_tif_file)

    # Everything the iterator prints to stdout is captured and discarded.
    with contextlib.redirect_stdout(io.StringIO()):
        parquet_file_name = a_tif_file.split("/")[-1].replace(".tif", ".parquet")
        output_file_name_generator = OutputFileNameGenerator(
            output_path="E:/output/test/Nieuwkoopse_plassen/",
            output_file_name="E:/output/test/Nieuwkoopse_plassen/" + parquet_file_name,
        )

        nso_tif_kernel_iterator_generator = tif_kernel_iterator.TifKernelIteratorGenerator(
            path_to_tif_file=a_tif_file,
            model=final_artefact["model"],
            output_file_name_generator=output_file_name_generator,
            parts=1,
            normalize_scaler=final_artefact["scaler"],
            column_names=selected_features,
            dissolve_parts=False,
            square_output=False,
            skip_done_part=False,
        )

        nso_tif_kernel_iterator_generator.predict_all_output()

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 19.00it/s]
100%|██████████| 1/1 [00:00<00:00, 14.45it/s]

E:/output/test/Nieuwkoopse_plassen/20190302_113613_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif
E:/output/test/Nieuwkoopse_plassen/20190302_113613_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20190409_111030_SV1-04_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif



100%|██████████| 1/1 [00:00<00:00, 33.33it/s]
100%|██████████| 1/1 [00:00<00:00, 11.62it/s]
100%|██████████| 1/1 [00:00<00:00, 36.37it/s]


E:/output/test/Nieuwkoopse_plassen/20190409_111030_SV1-04_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20190416_113410_SV1-02_50cm_RD_11bit_RGBI_Nieuwveen_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif
E:/output/test/Nieuwkoopse_plassen/20190416_113410_SV1-02_50cm_RD_11bit_RGBI_Nieuwveen_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif


100%|██████████| 1/1 [00:00<00:00, 20.40it/s]
100%|██████████| 1/1 [00:00<00:00, 30.66it/s]
100%|██████████| 1/1 [00:00<00:00, 38.46it/s]
100%|██████████| 1/1 [00:00<00:00, 35.59it/s]


E:/output/test/Nieuwkoopse_plassen/20190629_113712_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Ground_test.tif
E:/output/test/Nieuwkoopse_plassen/20190629_113712_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Waterplants_test.tif
E:/output/test/Nieuwkoopse_plassen/20190629_113712_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20190629_113712_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif


100%|██████████| 1/1 [00:00<00:00, 28.82it/s]
100%|██████████| 1/1 [00:00<00:00, 28.09it/s]
100%|██████████| 1/1 [00:00<00:00, 13.93it/s]


E:/output/test/Nieuwkoopse_plassen/20190629_113712_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Waterplants_test.tif
E:/output/test/Nieuwkoopse_plassen/20190629_113712_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20191003_113732_SV1-02_50cm_RD_11bit_RGBI_Nieuwveen_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Ground_test.tif


100%|██████████| 1/1 [00:00<00:00, 28.47it/s]
100%|██████████| 1/1 [00:00<00:00, 32.18it/s]
100%|██████████| 1/1 [00:00<00:00, 26.96it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

E:/output/test/Nieuwkoopse_plassen/20191003_113732_SV1-02_50cm_RD_11bit_RGBI_Nieuwveen_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Waterplants_test.tif
E:/output/test/Nieuwkoopse_plassen/20191003_113732_SV1-02_50cm_RD_11bit_RGBI_Nieuwveen_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20210423_111111_SV1-04_SV_RD_11bit_RGBI_50cm_Nieuwkoop_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Ground_test.tif


100%|██████████| 1/1 [00:00<00:00, 16.65it/s]
100%|██████████| 1/1 [00:00<00:00, 12.15it/s]
100%|██████████| 1/1 [00:00<00:00, 27.36it/s]

E:/output/test/Nieuwkoopse_plassen/20210423_111111_SV1-04_SV_RD_11bit_RGBI_50cm_Nieuwkoop_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20210617_111532_SV1-04_SV_RD_11bit_RGBI_50cm_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif



100%|██████████| 1/1 [00:00<00:00, 28.92it/s]
100%|██████████| 1/1 [00:00<00:00, 12.08it/s]


E:/output/test/Nieuwkoopse_plassen/20210617_111532_SV1-04_SV_RD_11bit_RGBI_50cm_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Waterplants_test.tif
E:/output/test/Nieuwkoopse_plassen/20210617_111532_SV1-04_SV_RD_11bit_RGBI_50cm_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20220318_114608_SV1-02_SV_RD_11bit_RGBI_50cm_Bodegraven_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Ground_test.tif


100%|██████████| 1/1 [00:00<00:00, 23.26it/s]


E:/output/test/Nieuwkoopse_plassen/20220318_114608_SV1-02_SV_RD_11bit_RGBI_50cm_Bodegraven_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Water_test.tif


100%|██████████| 1/1 [00:00<00:00,  2.53it/s]
100%|██████████| 1/1 [00:00<00:00, 32.24it/s]
100%|██████████| 1/1 [00:00<00:00, 34.65it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

E:/output/test/Nieuwkoopse_plassen/20220705_110002_SV2-01_SV_RD_11bit_RGBI_50cm_Woerden_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif
E:/output/test/Nieuwkoopse_plassen/20220705_110002_SV2-01_SV_RD_11bit_RGBI_50cm_Woerden_Schippersgat_cropped_ndwi_re_ndvi_Waterplants_test.tif
E:/output/test/Nieuwkoopse_plassen/20220705_110002_SV2-01_SV_RD_11bit_RGBI_50cm_Woerden_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif


100%|██████████| 1/1 [00:00<00:00, 11.34it/s]
100%|██████████| 1/1 [00:00<00:00, 28.58it/s]
100%|██████████| 1/1 [00:00<00:00, 33.44it/s]
100%|██████████| 1/1 [00:00<00:00, 24.41it/s]


E:/output/test/Nieuwkoopse_plassen/20220803_105908_SV2-01_SV_RD_11bit_RGBI_50cm_Boskoop_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Ground_test.tif
E:/output/test/Nieuwkoopse_plassen/20220803_105908_SV2-01_SV_RD_11bit_RGBI_50cm_Boskoop_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Waterplants_test.tif
E:/output/test/Nieuwkoopse_plassen/20220803_105908_SV2-01_SV_RD_11bit_RGBI_50cm_Boskoop_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20220811_110734_SV1-04_SV_RD_11bit_RGBI_50cm_Mijdrecht_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Ground_test.tif


100%|██████████| 1/1 [00:00<00:00, 25.65it/s]
100%|██████████| 1/1 [00:00<00:00, 37.57it/s]
100%|██████████| 1/1 [00:00<00:00, 23.22it/s]
100%|██████████| 1/1 [00:00<00:00, 30.13it/s]

E:/output/test/Nieuwkoopse_plassen/20220811_110734_SV1-04_SV_RD_11bit_RGBI_50cm_Mijdrecht_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Waterplants_test.tif
E:/output/test/Nieuwkoopse_plassen/20220811_110734_SV1-04_SV_RD_11bit_RGBI_50cm_Mijdrecht_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Water_test.tif
E:/output/test/Nieuwkoopse_plassen/20221012_104900_SV2-01_SV_RD_11bit_RGBI_50cm_Nieuwveen_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Ground_test.tif
E:/output/test/Nieuwkoopse_plassen/20221012_104900_SV2-01_SV_RD_11bit_RGBI_50cm_Nieuwveen_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Waterplants_test.tif



100%|██████████| 1/1 [00:00<00:00, 26.84it/s]
100%|██████████| 1/1 [00:00<00:00, 25.08it/s]

E:/output/test/Nieuwkoopse_plassen/20221012_104900_SV2-01_SV_RD_11bit_RGBI_50cm_Nieuwveen_Nieuwkoopse_Plassen_De_Haeck_cropped_ndwi_ndvi_Water_test.tif





In [24]:
# Compare the most frequent predicted label of every test parquet with the expected
# label, which is the last "_"-separated token before "_test" in the file name.
prediction_files = [
    path.replace("\\", "/")
    for path in glob.glob("E:/output/test/Nieuwkoopse_plassen/*SV*.parquet")
]

falses = 0
for afile in prediction_files:
    print(afile)

    # Read each file once and reuse the counts for both the print and the check.
    label_counts = pd.read_parquet(afile)['label'].value_counts()
    print(label_counts)

    expected_label = afile.split("_test")[0].split("_")[-1]
    print(expected_label)

    # value_counts sorts descending, so index[0] is the dominant prediction.
    # A file without any predictions counts as a mismatch.
    dominant_label = label_counts.index[0] if len(label_counts) else None
    if dominant_label != expected_label:
        print("Wrong!!!!!!!")
        falses = falses + 1

if prediction_files:
    print("False rating off: " + str(falses / len(prediction_files)))
else:
    # Avoids the ZeroDivisionError that an empty glob result used to cause.
    print("No prediction files found, nothing to evaluate")

E:/output/test/Nieuwkoopse_plassen/20190302_113613_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.parquet
label
Ground    6303
Name: count, dtype: int64
Ground
E:/output/test/Nieuwkoopse_plassen/20190302_113613_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Water_test.parquet
label
Water    29559
Name: count, dtype: int64
Water
E:/output/test/Nieuwkoopse_plassen/20190409_111030_SV1-04_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.parquet
label
Ground    4260
Name: count, dtype: int64
Ground
E:/output/test/Nieuwkoopse_plassen/20190409_111030_SV1-04_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Water_test.parquet
label
Water    44177
Name: count, dtype: int64
Water
E:/output/test/Nieuwkoopse_plassen/20190416_113410_SV1-02_50cm_RD_11bit_RGBI_Nieuwveen_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.parquet
label
Ground    1714
Name: count, dtype: int64
Ground
E:/output/test/Nieuwkoopse_pl

In [25]:
import rasterio

In [26]:
def raster_to_dataframe(a_tif_file, band_names=None):
    """
    Flatten a raster file into a DataFrame with one row per pixel.

    All bands are read, every pixel gets the coordinates computed by
    `rasterio.transform.xy` from the dataset transform (columns "rd_x" and
    "rd_y"), and pixels for which "r", "g" and "b" are all zero are dropped.

    Band columns are named POSITIONALLY: the i-th band of the file receives the
    i-th name in `band_names`. Nothing checks that the file's band order
    actually corresponds to these names.
    NOTE(review): the ndvi/ndwi values shown further down in this notebook are
    swapped relative to the annotated data, which would be consistent with the
    tif band order differing from `selected_features` - confirm and pass the
    real band order via `band_names` if so.

    @param a_tif_file: Path to the raster file to read.
    @param band_names: Optional column names for the bands, in file band order.
                       Defaults to the notebook-level `selected_features`.
                       Must contain "r", "g" and "b".
    @return: DataFrame with the band columns plus "rd_x" and "rd_y".
    """
    if band_names is None:
        band_names = selected_features

    # Use a context manager so the dataset handle is released even on errors;
    # the original left the file open.
    with rasterio.open(a_tif_file) as src:
        data = src.read()
        transform = src.transform

    z_shape, x_shape, y_shape = data.shape

    # Row/column index grids for every pixel (axis 1 of `data` is the row
    # index, axis 2 the column index), built vectorised instead of as nested
    # Python lists.
    row_index, col_index = np.indices((x_shape, y_shape))
    rd_x, rd_y = rasterio.transform.xy(transform, row_index, col_index)

    # Stack bands and coordinates as rows of length n_pixels, then transpose so
    # that every pixel becomes one row.
    pixel_table = np.concatenate(
        [
            data.reshape(z_shape, -1),
            np.asarray(rd_x).reshape(1, -1),
            np.asarray(rd_y).reshape(1, -1),
        ]
    ).transpose()

    df = pd.DataFrame(
        pixel_table,
        columns=list(band_names) + ["rd_x", "rd_y"],
    )

    # Keep only pixels where at least one of r, g, b is non-zero.
    return df[(df[["r", "g", "b"]] != 0).any(axis="columns")]

In [34]:
# Test directly from tif files: classify every pixel of each test raster with
# the final model and compare the most frequent prediction with the label
# encoded in the file name ("..._<Label>_test.tif").
tif_test_files = [
    path.replace("\\", "/")
    for path in glob.glob("E:/output/test/Nieuwkoopse_plassen/*SV*.tif")
]

falses = 0
for afile in tif_test_files:
    print(afile)

    # The expected label is the last "_"-separated token before "_test".
    expected_label = afile.split("_test")[0].split("_")[-1]

    # `assign` returns a new frame, so no columns are set on the filtered
    # frame returned by raster_to_dataframe.
    df = raster_to_dataframe(afile).assign(
        filename=afile,
        date=afile.split("/")[-1][0:15],
        label=expected_label,
    )

    # Scale and predict once per file; the original ran the full
    # transform + predict twice (once to print, once to compare).
    predictions = final_artefact['model'].predict(
        final_artefact['scaler'].transform(df[selected_features])
    )
    majority_label = pd.Series(predictions).value_counts().index[0]

    print(expected_label)
    print(majority_label)
    if majority_label != expected_label:
        print("Wrong!!!!!!!")
        falses = falses + 1

# Guard the ratio: without matching files the division would raise ZeroDivisionError.
if tif_test_files:
    print("False rating off: "+str(falses/len(tif_test_files)))
else:
    print("No tif files found, nothing to check.")

E:/output/test/Nieuwkoopse_plassen/20190302_113613_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif
Ground
Ground
E:/output/test/Nieuwkoopse_plassen/20190302_113613_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif
Water
Water
E:/output/test/Nieuwkoopse_plassen/20190409_111030_SV1-04_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif
Ground
Ground
E:/output/test/Nieuwkoopse_plassen/20190409_111030_SV1-04_50cm_RD_11bit_RGBI_Mijdrecht_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif
Water
Water
E:/output/test/Nieuwkoopse_plassen/20190416_113410_SV1-02_50cm_RD_11bit_RGBI_Nieuwveen_Schippersgat_cropped_ndwi_re_ndvi_Ground_test.tif
Ground
Ground
E:/output/test/Nieuwkoopse_plassen/20190416_113410_SV1-02_50cm_RD_11bit_RGBI_Nieuwveen_Schippersgat_cropped_ndwi_re_ndvi_Water_test.tif
Water
Water
E:/output/test/Nieuwkoopse_plassen/20190629_113712_SV1-02_50cm_RD_11bit_RGBI_Mijdrecht_Nieuwkoopse_Plas

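# NDWI and NDVI do not match: for the same pixel, the ndvi and ndwi values of the dataframe read from the tif file are swapped compared to the annotated data (see the two lookups below).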

In [86]:
# Feature values of the pixel at rd_x=117219.25, rd_y=463906.25 in the dataframe read from the tif file.
df.loc[
    (df["rd_x"] == 117219.25) & (df["rd_y"] == 463906.25),
    selected_features,
]

Unnamed: 0,r,g,b,i,ndvi,ndwi
231,646.0,458.0,358.0,273.0,125.0,59.0


In [87]:
# Feature values of the pixel at rd_x=117219.25, rd_y=463906.25 in the annotated export data.
df_annotated_export_data.loc[
    (df_annotated_export_data["rd_x"] == 117219.25)
    & (df_annotated_export_data["rd_y"] == 463906.25),
    selected_features,
]

Unnamed: 0,r,g,b,i,ndvi,ndwi
9982,646.0,458.0,358.0,273.0,59.0,125.0


In [80]:
# Predicted label for the annotated pixel at rd_x=117219.25, rd_y=463906.25.
final_model.predict(
    final_scaler.transform(
        df_annotated_export_data.loc[
            (df_annotated_export_data["rd_x"] == 117219.25)
            & (df_annotated_export_data["rd_y"] == 463906.25),
            selected_features,
        ]
    )
)

array(['Water'], dtype=object)