In [None]:
%load_ext autoreload

In [4]:
%autoreload

In [5]:
import os
import numpy as np
import pandas as pd
from skimage.io import concatenate_images
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestClassifier

In [6]:
import sys
sys.path.append("../")
from config import DATASETS_PATH, TRAIN_PATH, VALIDATION_PATH
from helpers.utils import extract_label_values, load_image_collection

# Load data

# Feature extraction

## Spectral and NDVI features

In [10]:
# Combine the individual extractors into one transformer; each entry is a
# (name, transformer) pair.
# NOTE(review): SpectralFeatureExtractor and NDVIFeatureExtractor are not
# imported in the cells visible here -- confirm where they come from.
feature_extractor = FeatureUnion([
    ("spectral", SpectralFeatureExtractor()),
    ("ndvi", NDVIFeatureExtractor()),
])

# Model

In [None]:
def complete_pipeline(feature_extractor, n_estimators=500, random_state=None):
    """
    Train a random forest on extracted image features and score it on the
    validation set.

    1. load the training and validation images and their labels
    2. extract features of the training and validation set, using the feature_extractor argument
    3. train model on training data
    4. make predictions on validation data
    5. evaluate performance

    Parameters
    ----------
    feature_extractor: sklearn transformer
                       transformers that extract features from images
    n_estimators: int, default 500
                  number of trees in the random forest
    random_state: int, RandomState instance or None, default None
                  seed forwarded to the random forest; pass an int to make
                  repeated runs reproducible

    Returns
    -------
    mean_f2, per_class_f2
        the two values returned by evaluate_performance for the validation
        labels and the model's validation predictions
    """
    # load data
    train_imgs = load_image_collection(TRAIN_PATH)
    validation_imgs = load_image_collection(VALIDATION_PATH)

    train_labels = pd.read_csv(os.path.join(DATASETS_PATH, 'train_labels.csv'))
    train_labels = extract_label_values(train_labels)

    validation_labels = pd.read_csv(os.path.join(DATASETS_PATH, 'validation_labels.csv'))
    validation_labels = extract_label_values(validation_labels)

    # extract features
    # NOTE(review): extract_features is not defined or imported in the cells
    # visible here -- confirm it is available before this function is called.
    train_features = extract_features(train_imgs, feature_extractor)
    validation_features = extract_features(validation_imgs, feature_extractor)

    # train model
    classifier = RandomForestClassifier(n_estimators=n_estimators,
                                        random_state=random_state)
    classifier.fit(train_features, train_labels)

    # make predictions
    validation_predictions = classifier.predict(validation_features)

    # evaluate performance
    mean_f2, per_class_f2 = evaluate_performance(validation_labels, validation_predictions)

    return mean_f2, per_class_f2

# Evaluation

In [61]:
from sklearn.metrics import fbeta_score

In [92]:
# Ground-truth labels for the validation split: keep the raw frame (it is
# copied below to build the baseline) and the extracted label values.
validation_labels_df = pd.read_csv(
    filepath_or_buffer=os.path.join(DATASETS_PATH, 'validation_labels.csv')
)
validation_labels = extract_label_values(validation_labels_df)

In [87]:
def evaluate_performance(labels, predictions, beta=2):
    """
    Score predictions against ground-truth labels with the F-beta measure.

    Parameters
    ----------
    labels: array-like
            ground-truth labels, forwarded to fbeta_score as y_true
            (presumably a multi-label indicator matrix, since
            average='samples' is used -- TODO confirm)
    predictions: array-like
                 predicted labels, forwarded to fbeta_score as y_pred
    beta: float, default 2
          weight of recall relative to precision in the F-beta score

    Returns
    -------
    mean_f2: F-beta score computed with average='samples'
    per_class_f2: F-beta scores computed with average=None (one per label)
    """
    # `beta` is keyword-only in current scikit-learn releases; passing it by
    # name works on both old and new versions.
    mean_f2 = fbeta_score(labels, predictions, beta=beta, average='samples')
    per_class_f2 = fbeta_score(labels, predictions, beta=beta, average=None)
    return mean_f2, per_class_f2

In [104]:
# Baseline: start from an independent copy of the validation label frame so
# that overwriting its label columns leaves the ground truth untouched.
baseline_predictions_df = validation_labels_df.copy(deep=True)

In [105]:
# set all label columns to 0 (non-numeric columns are left untouched)
for col in baseline_predictions_df.columns:
    # Check the column's dtype, not the Series object itself.
    if np.issubdtype(baseline_predictions_df[col].dtype, np.number):
        # Assign through the DataFrame: writing into `.values` depends on the
        # array being a writable view, which pandas does not guarantee.
        baseline_predictions_df[col] = 0
# label all instances with most common label
baseline_predictions_df['clear'] = 1 # each instance must have one atmospheric condition label
baseline_predictions_df['primary'] = 1 # each non-cloudy image must have at least one land use label

In [108]:
# Convert the baseline frame with the same helper that was applied to the
# ground-truth validation labels.
baseline_predictions = extract_label_values(baseline_predictions_df)

In [114]:
# Score the constant baseline against the validation ground truth.
mean_f2_baseline, per_class_f2_baseline = evaluate_performance(
    labels=validation_labels,
    predictions=baseline_predictions,
)

In [116]:
# Score the model by running the whole pipeline. `validation_predictions` is a
# local variable of complete_pipeline and does not exist at notebook level, so
# it cannot be evaluated directly from this cell on a fresh kernel.
mean_f2, per_class_f2 = complete_pipeline(feature_extractor)

  'precision', 'predicted', average, warn_for)
