# Retrieve the data files for reading in

In [1]:
# Import matplot for plotting later on
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Specify the path to our data
dataPath = "../../data/"

In [3]:
# Walk the data folder, returning a list of all the files
import os
rasterBin = []
for root, dirs, files in os.walk(dataPath, topdown=False):
   for file in files:
        if ".hdr" not in file:
            rasterBin.append(os.path.join(dataPath, file))

print("files retrieved")

files retrieved


# Read in the original image and truth labeled images for classification

In [4]:
# Library for working with raster images
import rasterio
import numpy as np
from rasterio import plot

In [5]:
import pandas as pd
data_frame = pd.DataFrame(columns=(
        'S2_coastal_aerosol',
        'S2_blue',
        'S2_green',
        'S2_red',
        'S2_vre1',
        'S2_vre2',
        'S2_vre3',
        'S2_nir',
        'S2_narrow_nir',
        'S2_water_vapour',
        'S2_swir_cirrus',
        'S2_swir2',
        'L8_coastal_aerosol',
        'L8_blue',
        'L8_green',
        'L8_red',
        'L8_near_infrared',
        'L8_shortwave_infrared1',
        'L8_shortwave_infrared2',
        'L8_panchromatic',
        'L8_cirrus',
        'L8_longwave_infrared1',
        'L8_longwave_infrared2',
        'water_val',
        'water_bool',
        'river_val',
        'river_bool',
        'broadleaf_val',
        'broadleaf_bool',
        'shrub_val',
        'shrub_bool',
        'mixed_val',
        'mixed_bool',
        'conifer_val',
        'conifer_bool',
        'herb_val',
        'herb_bool',
        'clearcut_val',
        'clearcut_bool',
        'exposed_val',
        'exposed_bool'))

In [6]:
    for raster in rasterBin:

        if "S2A.bin_4x.bin_sub.bin" in raster:

            dataset = rasterio.open(raster)
            for idx in dataset.indexes:
                """
                reads in the current band which is a mat of 401, 410 and ravels it
                storing the result in the current column. ie(X values)
                """
                data_frame.iloc[:, idx-1] = dataset.read(idx).ravel()
        elif "L8.bin_4x.bin_sub.bin" in raster:
            dataset = rasterio.open(raster)
            for idx in dataset.indexes:
                data_frame.iloc[:, idx+11] = dataset.read(idx).ravel()

        elif "WATERSP.tif_project_4x.bin_sub.bin" in raster:
            water = rasterio.open(raster).read(1)
            data_frame['water_val'] = water.ravel()
            data_frame['water_bool'] = data_frame['water_val'] != 128

        elif "RiversSP.tif_project_4x.bin_sub.bin" in raster:
            river = rasterio.open(raster).read(1)
            data_frame['river_val'] = river.ravel()
            data_frame['river_bool'] = data_frame['river_val'] == 1.0

        elif "BROADLEAF_SP.tif_project_4x.bin_sub.bin" in raster:
            broadleaf = rasterio.open(raster).read(1)
            data_frame['broadleaf_val'] = broadleaf.ravel()
            data_frame['broadleaf_bool'] = data_frame['broadleaf_val'] == 1.0

        elif "SHRUB_SP.tif_project_4x.bin_sub.bin" in raster:
            shrub = rasterio.open(raster).read(1)
            data_frame['shrub_val'] = shrub.ravel()
            data_frame['shrub_bool'] = data_frame['shrub_val'] != 0.0

        elif "MIXED_SP.tif_project_4x.bin_sub.bin" in raster:
            mixed = rasterio.open(raster).read(1)
            data_frame['mixed_val'] = mixed.ravel()
            data_frame['mixed_bool'] = data_frame['mixed_val'] != 0.0

        elif "CONIFER_SP.tif_project_4x.bin_sub.bin" in raster:
            conifer = rasterio.open(raster).read(1)
            data_frame['conifer_val'] = conifer.ravel()
            data_frame['conifer_bool'] = data_frame['conifer_val'] != 0.0

        elif "HERB_GRAS_SP.tif_project_4x.bin_sub.bin" in raster:
            herb = rasterio.open(raster).read(1)
            data_frame['herb_val'] = herb.ravel()
            data_frame['herb_bool'] = data_frame['herb_val'] != 0.0

        elif "CCUTBL_SP.tif_project_4x.bin_sub.bin" in raster:
            clearcut = rasterio.open(raster).read(1)
            data_frame['clearcut_val'] = clearcut.ravel()
            data_frame['clearcut_bool'] = data_frame['clearcut_val'] != 0.0

        elif "EXPOSED_SP.tif_project_4x.bin_sub.bin" in raster:
            exposed = rasterio.open(raster).read(1)
            data_frame['exposed_val'] = exposed.ravel()
            data_frame['exposed_bool'] = data_frame['exposed_val'] != 0.0

In [9]:
with pd.option_context('display.max_rows', 10, 'display.max_columns', None):
    print(data_frame)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
X_true = data_frame[data_frame['label_water_bool'] == True]
X_false = data_frame[data_frame['label_water_bool'] == False].sample(len(X_true))

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
undersample_water_frames = [X_true, X_false]
X_full = pd.concat(undersample_water_frames) # Concat the undersampled true and false pixels
X = X_full.loc[:, : 'swir2'] # only considers the columns up to swir2

In [12]:
X_norm = scaler.fit_transform(X)

In [13]:
y = X_full['label_water_bool']

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, random_state=0, test_size=0.2)

In [15]:
from sklearn.linear_model import SGDClassifier
sgd_classifier = SGDClassifier(random_state=42, verbose=0, warm_start=True)
y_pred = sgd_classifier.fit(X_train, y_train).predict(X_test)

In [16]:
print("Test score: {:.2f}".format(sgd_classifier.score(X_test, y_test)))

Test score: 0.93


In [17]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [18]:
print("True Positive:", tp )
print("True Negative:", tn )
print("False Positive:", fp )
print("False Negative:", fn )

True Positive: 1150
True Negative: 1318
False Positive: 55
False Negative: 127
