In [None]:
%load_ext autoreload

In [4]:
%autoreload

In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.base import TransformerMixin
from skimage.io import concatenate_images

In [6]:
import sys
sys.path.append("../")
from config import DATASETS_PATH, TRAIN_PATH, VALIDATION_PATH
from helpers.utils import extract_label_values, load_image_collection

# Load data

In [7]:
train_imgs = load_image_collection(TRAIN_PATH)
validation_imgs = load_image_collection(VALIDATION_PATH)

# Feature extraction

## spectral features

In [9]:
# TODO: move classes extracting features to separate python file
class BaseFeatureExtractor(TransformerMixin):
    def __init__(self):
        self.pixels_axis = (1, 2)
    
    def fit(self, imgs, y=None):
        raise NotImplementedError
    
    def transform(self, imgs, y=None):
        raise NotImplementedError

class SpectralFeatureExtractor(BaseFeatureExtractor):
    """
    extracts mean and standard deviation of every color channel in the image (RGB)
    and the brightness, where brightness is defined as the mean of all color channels

    Parameters
    ----------
    imgs : numpy.ndarray (np.float)
           set of images, each with 4 channels (B, G, R, NIR)
    """
    def __init__(self):
        super().__init__()
    
    def fit(self, imgs, y=None):
        return self
    
    def transform(self, imgs, y=None):
        imgs = imgs[:, :, :, :3] # extract color channels
        rgb_means = np.mean(imgs, axis=self.pixels_axis)
        brightness = np.mean(rgb_means, axis=1)
        brightness = np.reshape(brightness, (-1, 1))
        rgb_sds = np.std(imgs, axis=self.pixels_axis)

        return np.concatenate((rgb_means, brightness, rgb_sds), axis=1)

class NDVIFeatureExtractor(BaseFeatureExtractor):
    """
    extracts normalized difference vegatation index from multispectral image

    Parameters
    ----------
    imgs : numpy.ndarray (np.float)
           set of images, each with 4 channels (B, G, R, NIR)
    """
    def __init__(self):
        super().__init__()
    
    def fit(self, imgs, y=None):
        return self
    
    def transform(self, imgs, y=None):
        red = imgs[:, :, :, 2]
        nir = imgs[:, :, :, 3]

        ndvi = np.divide(nir-red, nir+red)
        
        ndvi_means = np.mean(ndvi, axis=self.pixels_axis)
        ndvi_sds = np.std(ndvi, axis=self.pixels_axis)
        ndvi_means = np.reshape(ndvi_means, (-1, 1))
        ndvi_sds = np.reshape(ndvi_sds, (-1, 1))
        return np.concatenate((ndvi_means, ndvi_sds), axis=1)
        

In [10]:
feature_extractor = FeatureUnion(transformer_list=[
    ("spectral", SpectralFeatureExtractor()),
    ("ndvi", NDVIFeatureExtractor())
])

In [54]:
def extract_features(imgs_collection, feature_extractor, batch_size=1000):
    """
    Extracts from imgs_collection the set of features specified in feature_extractor.
    Extracts the features in batches because it is unviable to load all images at once into memory
    
    Parameters
    ----------
    imgs_collection : skimage.ImageCollection
                      collection of images from which we want to extract the features
    feature_extractor: sklearn transformer
                       transformers that extract features from images
    batch_size: number of images to extract features from at each iteration
                a deafult value of 1000 loads approx. 0.5 GB into memory for this dataset
    
    Returns
    -------
    features: numpy.ndarray (np.float64)
              set of features extracted from the image, 
              with one row for each image and one column for each feature
    """
    
    n_images = len(imgs_collection)
    # get number of total features
    features_im0 = feature_extractor.fit_transform(concatenate_images(imgs_collection[:1]))
    n_features = np.shape(features_im0)[1]
    # create array for features
    features = np.zeros([n_images, n_features])
    features[0, :] = features_im0
    for i in range(1, n_images, batch_size):
        imgs_batch = concatenate_images(imgs_collection[i:i+batch_size])
        # casting is required for feature extraction! 
        # else the default uint16 produces errors in the computations
        imgs_batch = imgs_batch.astype('float64', casting='safe')
        features_batch = feature_extractor.fit_transform(imgs_batch)
        features[i:i+batch_size, :] = features_batch
    return features

In [None]:
train_features = extract_features(train_imgs, feature_extractor)

In [55]:
validation_features = extract_features(validation_imgs, feature_extractor)

# model

In [57]:
classifier = RandomForestClassifier(n_estimators=500)

In [58]:
train_labels = pd.read_csv(os.path.join(DATASETS_PATH, 'train_labels.csv'))
train_labels = extract_label_values(train_labels)

In [59]:
classifier.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [89]:
validation_predictions = classifier.predict(validation_features)

In [75]:
np.sum(predictions, axis=0)

array([7.644e+03, 5.993e+03, 2.271e+03, 1.219e+03, 9.230e+02, 1.125e+03,
       2.300e+02, 2.000e+02, 3.370e+02, 3.310e+02, 6.000e+00, 0.000e+00,
       2.800e+01, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00])

# evaluation

In [61]:
from sklearn.metrics import fbeta_score

In [92]:
validation_labels_df = pd.read_csv(os.path.join(DATASETS_PATH, 'validation_labels.csv'))
validation_labels = extract_label_values(validation_labels_df)

In [87]:
def evaluate_performance(labels, predictions, beta=2):
    mean_f2 = fbeta_score(labels, predictions, beta, average='samples')
    per_class_f2 = fbeta_score(labels, predictions, beta, average=None)
    return mean_f2, per_class_f2

In [95]:
validation_labels_df.shape

(8096, 19)

In [96]:
validation_labels_df.iloc[0, :]

image_name                train_0
tags                 primary haze
primary                         1
clear                           0
agriculture                     0
road                            0
water                           0
partly_cloudy                   0
cultivation                     0
habitation                      0
haze                            1
cloudy                          0
bare_ground                     0
selective_logging               0
artisinal_mine                  0
blooming                        0
slash_burn                      0
conventional_mine               0
blow_down                       0
Name: 0, dtype: object

In [99]:
# baseline
baseline_predictions_df = validation_labels_df.copy(deep=False)

In [101]:
baseline_predictions_df.iloc[0, :]

image_name                train_0
tags                 primary haze
primary                         1
clear                           0
agriculture                     0
road                            0
water                           0
partly_cloudy                   0
cultivation                     0
habitation                      0
haze                            1
cloudy                          0
bare_ground                     0
selective_logging               0
artisinal_mine                  0
blooming                        0
slash_burn                      0
conventional_mine               0
blow_down                       0
Name: 0, dtype: object

In [88]:
mean_f2, per_class_f2 = evaluate_performance(validation_labels, validation_predictions)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
