In [1]:
# Reload imported modules automatically before each cell is executed, so edits to
# the project code imported further down are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [9]:
import sys
import os
import pandas as pd
from enum import Enum
import pickle
from pprint import pprint

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
sys.path.append("..")

from src.cloud_recognition.model_training import train_test_split_filenames
from src.cloud_recognition.data_loaders import FlattenedRGBImageLoader
from src.cloud_recognition.data_preparation import DataPreprocessor
from src.cloud_recognition.features import selected_features, all_features

In [4]:
# Set variables
# CSV listing the images; it is read with pandas and must contain a "filename" column.
filenames_filepath = "satellite-images-clouds.csv"
# Folder holding the image files. The default is machine-specific, so it can be
# overridden through the SATELLITE_IMAGES_FOLDER environment variable instead of
# editing the notebook.
folder_path = os.environ.get("SATELLITE_IMAGES_FOLDER", "E:/Data/remote_sensing/satellite-images")
# NOTE(review): the train/test split cell passes column_to_split_on=None rather than
# this value, so it appears to be unused at the moment.
column_to_train_test_split_on = "location"
# Location of the cached feature table produced by the (slow) data preparation step.
filepath_cached_data = "cache/filenames_df.csv"

In [5]:
# Feature generation takes a long time, so its result is cached as a CSV and reused
# whenever that file exists.
# NOTE(review): the cache is keyed on the file path only; delete the file by hand
# after changing the feature definitions or the input CSV.
if os.path.exists(filepath_cached_data):
    filenames_df = pd.read_csv(filepath_cached_data)
else:
    filenames_df = pd.read_csv(filenames_filepath)

    # One preprocessor is shared by all rows; for every listed file the image is
    # loaded and handed to the preprocessor's transform.
    data_preprocessor = DataPreprocessor(features_to_generate=all_features)
    features_df = filenames_df.apply(
        lambda row: data_preprocessor.transform(
            FlattenedRGBImageLoader(filename=row["filename"], folder_path=folder_path).get_rgb_df()
        ),
        axis=1,
    )
    filenames_df = filenames_df.join(features_df)

    # Create the cache directory first: to_csv does not create missing directories,
    # and failing here would throw away the expensive computation above.
    cache_directory = os.path.dirname(filepath_cached_data)
    if cache_directory:
        os.makedirs(cache_directory, exist_ok=True)
    filenames_df.to_csv(filepath_cached_data, index=False)

In [6]:
# A separate model per location is wanted, so the data is deliberately not split on
# the location column (column_to_split_on=None).
train_df, test_df = train_test_split_filenames(
    filenames_df,
    column_to_split_on=None,
    train_size=0.7,
    random_state=1000,
)

In [10]:
# This list is the result of manually removing highly correlated (>0.95) features and
# checking the effect on accuracy for the Linear Models (below): it holds the
# uncorrelated & important features.
uncorrelated_features = list(map(str, selected_features))
pprint(uncorrelated_features)

# Correlation matrix over all generated features on the training split (cell output).
train_df[list(map(str, all_features))].corr()

['fraction_bright_500',
 'fraction_bright_700',
 'fraction_relative_bright_0.6',
 'fraction_relative_bright_0.8',
 'fraction_relative_bright_0.9',
 'fraction_relative_bright_0.95',
 'fraction_relative_bright_0.99',
 'fraction_green_bright_500',
 'red_quantile_0.99',
 'number_bright_pixels_500',
 'number_bright_pixels_700']


Unnamed: 0,fraction_bright_500,fraction_bright_600,fraction_bright_700,fraction_bright_800,fraction_bright_900,fraction_bright_1000,fraction_bright_1100,fraction_bright_1200,fraction_relative_bright_0.5,fraction_relative_bright_0.6,...,number_bright_pixels_1000,number_bright_pixels_1100,number_bright_pixels_1200,number_of_pixels,fraction_bright_from_max_0.5,fraction_bright_from_max_0.6,fraction_bright_from_max_0.7,fraction_bright_from_max_0.8,fraction_bright_from_max_0.9,fraction_bright_from_max_0.99
fraction_bright_500,1.000000,0.922186,0.851619,0.818962,0.809043,0.807429,0.806714,0.806711,0.354054,0.287404,...,0.807376,0.806709,0.806709,-0.236830,0.820145,0.807335,0.806216,0.806688,0.806706,0.007449
fraction_bright_600,0.922186,1.000000,0.984354,0.965210,0.959509,0.958795,0.958445,0.958443,0.232713,0.168769,...,0.958779,0.958443,0.958442,-0.180800,0.954567,0.958373,0.958226,0.958433,0.958439,-0.068493
fraction_bright_700,0.851619,0.984354,1.000000,0.994176,0.991132,0.990696,0.990462,0.990461,0.162177,0.096207,...,0.990697,0.990462,0.990460,-0.143183,0.981636,0.990130,0.990351,0.990457,0.990459,-0.090223
fraction_bright_800,0.818962,0.965210,0.994176,1.000000,0.999499,0.999279,0.999140,0.999140,0.126433,0.052809,...,0.999280,0.999140,0.999140,-0.113385,0.989319,0.998561,0.999063,0.999139,0.999139,-0.106879
fraction_bright_900,0.809043,0.959509,0.991132,0.999499,1.000000,0.999964,0.999917,0.999917,0.116803,0.041973,...,0.999963,0.999917,0.999917,-0.106649,0.989920,0.999202,0.999847,0.999916,0.999917,-0.110642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fraction_bright_from_max_0.6,0.807335,0.958373,0.990130,0.998561,0.999202,0.999244,0.999249,0.999250,0.113777,0.041110,...,0.999234,0.999249,0.999249,-0.102518,0.993022,1.000000,0.999506,0.999260,0.999251,-0.101163
fraction_bright_from_max_0.7,0.806216,0.958226,0.990351,0.999063,0.999847,0.999922,0.999934,0.999935,0.112338,0.037816,...,0.999916,0.999934,0.999934,-0.105826,0.990441,0.999506,1.000000,0.999939,0.999935,-0.105339
fraction_bright_from_max_0.8,0.806688,0.958433,0.990457,0.999139,0.999916,0.999989,1.000000,1.000000,0.113702,0.038866,...,0.999984,1.000000,1.000000,-0.105110,0.989944,0.999260,0.999939,1.000000,1.000000,-0.111673
fraction_bright_from_max_0.9,0.806706,0.958439,0.990459,0.999139,0.999917,0.999990,1.000000,1.000000,0.113751,0.038905,...,0.999984,1.000000,1.000000,-0.105089,0.989926,0.999251,0.999935,1.000000,1.000000,-0.111892


In [11]:
class Location(Enum):
    """Areas for which a separate model is kept.

    Each value is the string compared against the ``location`` column of the data.
    """

    COEPELDUYNEN = "coepelduynen"
    DUINENGOEREEKWADEHOEK = "duinengoereekwadehoek"
    VOORNESDUIN = "voornesduin"


class ModelType(Enum):
    """Prediction strategies supported by ``Natura2000CloudDetectionModel``."""

    BASELINE = "baseline"
    LINEAR_MODEL = "linear_model"


class Natura2000CloudDetectionModel:
    """Cloud classifier that keeps one scaler -> PCA -> estimator chain per location.

    Rows are routed on their ``location`` column. With ``ModelType.BASELINE`` every
    row is predicted as ``False`` and nothing is fitted; with
    ``ModelType.LINEAR_MODEL`` each location gets its own ``StandardScaler``, ``PCA``
    and the estimator supplied for it in ``linear_models``.
    """

    def __init__(self, model_type: ModelType, locations: list, linear_models: dict = None, pca_n_components: dict = None):
        """Store the configuration and create the per-location transformers.

        Args:
            model_type: Strategy to use for predictions.
            locations: ``Location`` members this model handles.
            linear_models: Mapping ``Location -> unfitted estimator``; required for
                ``ModelType.LINEAR_MODEL``.
            pca_n_components: Mapping ``Location -> number of PCA components``;
                required for ``ModelType.LINEAR_MODEL``.

        Raises:
            TypeError: If ``model_type`` is ``LINEAR_MODEL`` and either mapping is
                missing.
        """
        self.locations = locations
        self.model_type = model_type
        if model_type == ModelType.LINEAR_MODEL:
            # Fail here with a clear message instead of a "NoneType is not
            # subscriptable" error during construction or the later fit.
            if linear_models is None or pca_n_components is None:
                raise TypeError(
                    "linear_models and pca_n_components are required for ModelType.LINEAR_MODEL"
                )
            self.scalers = {loc: StandardScaler() for loc in self.locations}
            self.pcas = {loc: PCA(n_components=pca_n_components[loc]) for loc in self.locations}
            self.models = linear_models

    def predict(self, df: pd.DataFrame) -> pd.Series:
        """Predict for every row of ``df``, returned in the row order of ``df``.

        Args:
            df: Feature columns plus a ``location`` column.

        Returns:
            Series of predictions indexed like ``df``.

        Raises:
            KeyError: If ``df`` has rows whose location is not handled by this model
                (they receive no prediction, so the final lookup fails).
            ValueError: If ``model_type`` is not a supported ``ModelType``.
        """
        outputs = []
        for loc in self.locations:
            location_mask = df["location"] == loc.value
            # A batch does not have to contain every location; skip empty subsets
            # because the fitted scaler/PCA reject input without samples.
            if not location_mask.any():
                continue
            location_rows = df[location_mask]
            if self.model_type == ModelType.BASELINE:
                location_output = self.predict_baseline(location_rows)
            elif self.model_type == ModelType.LINEAR_MODEL:
                location_output = self.predict_linear_model(location_rows, location=loc)
            else:
                raise ValueError(f"Unsupported model type: {self.model_type!r}")
            outputs.append(location_output)

        if outputs:
            output = pd.concat(outputs)
        else:
            # pd.concat refuses an empty list; an empty result keeps the lookup
            # below valid for an empty df.
            output = pd.Series([], index=df.index[:0], dtype=object)
        # Per-location chunks are concatenated location by location; restore the
        # caller's row order.
        return output.loc[df.index]

    def predict_baseline(self, df: pd.DataFrame) -> pd.Series:
        """Return ``False`` for every row (picks the majority class)."""
        return pd.Series([False] * len(df), index=df.index)

    def predict_linear_model(self, df: pd.DataFrame, location: Location) -> pd.Series:
        """Apply the fitted scaler, PCA and estimator of ``location`` to ``df``.

        Args:
            df: Rows of a single location, including the ``location`` column.
            location: Selects which fitted chain is used.

        Returns:
            Series of predictions indexed like ``df``.
        """
        location_df = df.drop("location", axis=1)
        location_df = self.scalers[location].transform(location_df)
        location_df = self.pcas[location].transform(location_df)
        return pd.Series(self.models[location].predict(location_df), index=df.index)

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Fit the scaler, PCA and estimator of every configured location.

        The baseline has nothing to learn, so fitting it is a no-op.

        Args:
            X_train: Feature columns plus a ``location`` column.
            y_train: Labels aligned with ``X_train``.

        Returns:
            The model itself.

        Raises:
            ValueError: If a configured location has no rows in ``X_train``.
        """
        if self.model_type == ModelType.BASELINE:
            return self

        for loc in self.locations:
            location_mask = X_train["location"] == loc.value
            if not location_mask.any():
                raise ValueError(f"No training rows found for location {loc.value!r}")
            location_df = X_train[location_mask].drop("location", axis=1)
            location_y = y_train[location_mask]

            location_df = self.scalers[loc].fit_transform(location_df)
            location_df = self.pcas[loc].fit_transform(location_df)
            self.models[loc].fit(location_df, location_y)
        return self


In [14]:

print("Baseline model results:")

# Selected features plus the "location" column, which the model uses to route rows.
feature_columns_to_fit = uncorrelated_features + ["location"]
X_train, X_test = train_df[feature_columns_to_fit], test_df[feature_columns_to_fit]
y_train, y_test = train_df["clouds"].astype(int), test_df["clouds"].astype(int)

# The baseline is not fitted; it is only used to predict.
model = Natura2000CloudDetectionModel(model_type=ModelType.BASELINE, locations=list(Location))

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Same report for both splits: heading, confusion matrix, accuracy.
for heading, y_true, y_pred in (
    ("train confusion matrix", y_train, y_pred_train),
    ("test confusion matrix", y_test, y_pred_test),
):
    print(heading)
    print(confusion_matrix(y_true=y_true, y_pred=y_pred))
    print(accuracy_score(y_true, y_pred))

Baseline model results:
train confusion matrix
[[32  0]
 [21  0]]
0.6037735849056604
test confusion matrix
[[14  0]
 [ 9  0]]
0.6086956521739131


In [15]:
# Search space for the per-location search below: estimator class, number of PCA
# components (1 through 9) and location.
grid_search_parameters = dict(
    model=[LogisticRegression, DecisionTreeClassifier],
    pca_n_components=range(1, 10),
    location=list(Location),
)

In [16]:
# The fitted columns are the same for every location, so the list is built once.
feature_columns_to_fit = uncorrelated_features + ["location"]

# Fit and score every (location, estimator, n_components) combination.
# NOTE(review): the configuration appears to be picked by looking at the test accuracy
# printed here, which would make the later test score optimistic - confirm.
for location in grid_search_parameters["location"]:
    # Restrict both splits to the rows of the current location.
    loc_train_mask = train_df["location"] == location.value
    loc_test_mask = test_df["location"] == location.value

    X_train = train_df.loc[loc_train_mask, feature_columns_to_fit]
    y_train = train_df.loc[loc_train_mask, "clouds"].astype(int)
    X_test = test_df.loc[loc_test_mask, feature_columns_to_fit]
    y_test = test_df.loc[loc_test_mask, "clouds"].astype(int)

    for mod in grid_search_parameters["model"]:
        for pca_n_components in grid_search_parameters["pca_n_components"]:
            # A model handling only this location, with a fresh estimator instance.
            locations = [location]
            linear_models = {location: mod()}
            pca_n_components_dict = {location: pca_n_components}

            model = Natura2000CloudDetectionModel(
                model_type=ModelType.LINEAR_MODEL,
                locations=locations,
                linear_models=linear_models,
                pca_n_components=pca_n_components_dict,
            )
            model.fit(X_train, y_train)

            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)

            print(f"Parameters: location: {location}, linear_model = {mod}, pca_n_components = {pca_n_components}")
            print(f"Train accuracy: {accuracy_score(y_train, y_pred_train)}")
            print(f"Test accuracy: {accuracy_score(y_test, y_pred_test)}")


Parameters: location: Location.COEPELDUYNEN, linear_model = <class 'sklearn.linear_model._logistic.LogisticRegression'>, pca_n_components = 1
Train accuracy: 0.7368421052631579
Test accuracy: 1.0
Parameters: location: Location.COEPELDUYNEN, linear_model = <class 'sklearn.linear_model._logistic.LogisticRegression'>, pca_n_components = 2
Train accuracy: 0.8421052631578947
Test accuracy: 1.0
Parameters: location: Location.COEPELDUYNEN, linear_model = <class 'sklearn.linear_model._logistic.LogisticRegression'>, pca_n_components = 3
Train accuracy: 0.8947368421052632
Test accuracy: 1.0
Parameters: location: Location.COEPELDUYNEN, linear_model = <class 'sklearn.linear_model._logistic.LogisticRegression'>, pca_n_components = 4
Train accuracy: 0.9473684210526315
Test accuracy: 0.9
Parameters: location: Location.COEPELDUYNEN, linear_model = <class 'sklearn.linear_model._logistic.LogisticRegression'>, pca_n_components = 5
Train accuracy: 0.9473684210526315
Test accuracy: 1.0
Parameters: location

In [17]:
# Best performing outcomes per Location:
# Coepelduynen -> LogisticRegression, pca_n_components = 5
# Duinen Goeree Kwade Hoek -> LogisticRegression, pca_n_components = 6
# Voornes Duin -> LogisticRegression, pca_n_components = 5

In [18]:
# Evaluate the configuration chosen per location on the full train/test split.
feature_columns_to_fit = uncorrelated_features + ["location"]
X_train, X_test = train_df[feature_columns_to_fit], test_df[feature_columns_to_fit]
y_train, y_test = train_df["clouds"].astype(int), test_df["clouds"].astype(int)

locations = list(Location)
# One independent LogisticRegression instance per location.
linear_models = {loc: LogisticRegression() for loc in Location}
pca_n_components = {
    Location.COEPELDUYNEN: 5,
    Location.DUINENGOEREEKWADEHOEK: 6,
    Location.VOORNESDUIN: 5,
}

model = Natura2000CloudDetectionModel(
    model_type=ModelType.LINEAR_MODEL,
    locations=locations,
    linear_models=linear_models,
    pca_n_components=pca_n_components,
)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Same report for both splits: heading, confusion matrix, accuracy.
for heading, y_true, y_pred in (
    ("train confusion matrix", y_train, y_pred_train),
    ("test confusion matrix", y_test, y_pred_test),
):
    print(heading)
    print(confusion_matrix(y_true=y_true, y_pred=y_pred))
    print(accuracy_score(y_true, y_pred))

train confusion matrix
[[32  0]
 [ 2 19]]
0.9622641509433962
test confusion matrix
[[13  1]
 [ 0  9]]
0.9565217391304348


In [19]:
# Conclusion:
# The Logistic Regression model works quite well. It increases accuracy from 60% (baseline model) to 95%.
# The difference between train and test accuracy is negligible, suggesting that no overfitting is happening.
# This seems like a good additional functionality for the extractor to warn about images with too many clouds.

In [20]:
# The definitive model is trained on all available data and saved. Afterwards it is uploaded by hand to the
# `pzh-blob-satelliet` blob storage, container 'satellite-images-nso', folder 'cloud_detection_models'.
feature_columns_to_fit = uncorrelated_features + ["location"]
X_train = filenames_df[feature_columns_to_fit]
y_train = filenames_df["clouds"].astype(int)

# Same per-location configuration as the evaluated model above.
linear_models = {
    Location.COEPELDUYNEN: LogisticRegression(),
    Location.DUINENGOEREEKWADEHOEK: LogisticRegression(),
    Location.VOORNESDUIN: LogisticRegression()
}
pca_n_components = {
    Location.COEPELDUYNEN: 5,
    Location.DUINENGOEREEKWADEHOEK: 6,
    Location.VOORNESDUIN: 5
}
locations = [loc for loc in Location]

model = Natura2000CloudDetectionModel(model_type=ModelType.LINEAR_MODEL, locations=locations, linear_models=linear_models, pca_n_components=pca_n_components)
model.fit(X_train, y_train)

# No data is held out here, so only the training fit can be reported.
y_pred_train = model.predict(X_train)

print("train confusion matrix")
print(confusion_matrix(y_true=y_train, y_pred=y_pred_train))
print(accuracy_score(y_train, y_pred_train))

filename = '../saved_models/cloud_detection_logistic_regression_v1.0.sav'
# Use a context manager so the file is flushed and closed before the manual upload.
# NOTE(review): the pickled class is defined in this notebook, so code that loads the
# file presumably needs the same class importable under the same module path - confirm.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

train confusion matrix
[[46  0]
 [ 3 27]]
0.9605263157894737
