## Notebook to develop supervised classification algorithm for identifying snow in PlanetScope 4-band imagery
Rainey Aberle

Adapted from the [SciKit Learn Classifier comparison tutorial](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html)

### Initial setup

In [None]:
# -----Import packages
import os
import glob
import numpy as np
import rasterio as rio
import geopandas as gpd
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn import metrics 
import matplotlib.pyplot as plt
import matplotlib
import time
import sys
import pickle

In [None]:
# -----Output switch
# Set to True to write the output figures and the best classifier to file
save_outputs = False

# -----Directory layout
# base_path: root of the planet-snow/ repository
base_path = '/Users/raineyaberle/Research/PhD/planet-snow/'
# out_path: folder that receives the best classifier
out_path = f'{base_path}inputs-outputs/'

# -----Make the project's functions/ folder importable, then load the helpers from it
sys.path.insert(1, f'{base_path}functions/')
from classification_utils import crop_images_to_AOI, classify_image, calculate_SCA

### Define supervised classification algorithms to test 

In [None]:
# -----Candidate classifiers
# Each entry pairs a display name with its estimator so that the two cannot
# drift out of step; `names` and `classifiers` are derived from this list.
classifier_specs = [
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),  # keeps crashing kernel when classifying images!
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("Logistic Regression", LogisticRegression(random_state=0)),
]

# -----Classifier names (same order as `classifiers`)
names = [label for label, _ in classifier_specs]

# -----Classifiers (same order as `names`)
classifiers = [estimator for _, estimator in classifier_specs]

### Test site-specific classifiers

Use the classified points at each site to determine the best site-specific classifier and calculate its accuracy.

Classified points split: 70% training, 30% testing

In [None]:
# -----Define site IDs
# Wolverine (WO), SitKusa (SK), Mendenhall (ME), Easton (EA), Blue (BL), Emmons (EM)
site_IDs = ['WO', 'SK', 'ME', 'EA', 'BL', 'EM']
site_names = ['Wolverine', 'SitKusa', 'Mendenhall', 'Easton', 'Blue', 'Emmons']

# -----Define images
# NOTE: a parenthesized single file name such as ('x.tif') is a plain string, not a tuple.
# The site loop below accepts both forms and treats a string as a one-image site.
im_fns = [('20190626_21_adj.tif', '20190819_21_adj.tif'), # Wolverine
         ('20190805_20_adj.tif'), # Sit Kusa
         ('20210823_20_adj.tif'), # Mendenhall
         ('20181020_17_adj.tif', '20200829_19_adj.tif'), # Easton
         ('20210703_18_adj.tif', '20210924_18_adj.tif'), #Blue
         ('20210604_18_adj.tif', '20210828_19_adj.tif')] # Emmons

# -----Shared settings
# Band order used for BOTH the full-image arrays and the values sampled at the points
# (image band 1 = blue, 2 = green, 3 = red, 4 = NIR), so the RGB preview and the image
# NDSI histogram are built from the same bands as the training features.
band_names = ['blue', 'green', 'red', 'NIR']
band_colors = ['blue', 'green', 'red', 'brown'] # histogram line color for each band
feature_cols = ['blue', 'green', 'red', 'NIR', 'NDSI'] # classifier input columns
im_scalar = 10000 # divisor applied when band values look like scaled integers (max > 1000)


def _load_image_and_points(site_name, site_ID, im_fn):
    """Read one image and its classified points, and sample the bands at the points.

    Parameters
    ----------
    site_name : str
        Site folder name under base_path + '../study-sites/'.
    site_ID : str
        Two-letter prefix of the classified-points shapefile names.
    im_fn : str
        Image file name; its first 8 characters are read as YYYYMMDD and its
        last 8 characters are stripped to build the shapefile names.

    Returns
    -------
    dict with keys
        'date'         : capture date formatted 'YYYY-MM-DD'
        'bands'        : {band name: 2D float array}, divided by im_scalar if applicable
        'ndsi'         : 2D array, (red - NIR) / (red + NIR)
        'bounds'       : image bounds (left, bottom, right, top)
        'snow_pts'     : snow points reprojected to the image CRS
        'non_snow_pts' : non-snow points reprojected to the image CRS
        'pts'          : table with one row per point: original attributes minus
                         'id'/'geometry', plus 'date', 'snow', 'coords', the four
                         band values and 'NDSI'
    """
    site_path = base_path + '../study-sites/' + site_name + '/'
    im_path = site_path + 'imagery/Planet/adjusted-filtered/'
    pts_prefix = site_path + 'classified-points/' + site_ID
    im_date = im_fn[0:4] + '-' + im_fn[4:6] + '-' + im_fn[6:8]

    # the context manager closes the image once everything has been read from it
    with rio.open(im_path + im_fn) as im:
        epsg = str(im.crs)[5:] # grab EPSG code (assumes the CRS prints as 'EPSG:<code>')
        bounds = im.bounds
        bands = {band: im.read(k).astype(float)
                 for k, band in enumerate(band_names, start=1)}

        # load snow (1) and non-snow (0) points, reprojected to the image CRS
        labelled_pts = []
        for label_tag, snow_flag in (('_snow_points_', 1), ('_non_snow_points_', 0)):
            gdf = gpd.read_file(pts_prefix + label_tag + im_fn[:-8] + '.shp')
            gdf = gdf.to_crs(epsg)
            gdf['date'] = im_date
            gdf['snow'] = snow_flag
            labelled_pts.append(gdf)
        snow_pts, non_snow_pts = labelled_pts

        # merge the points and sample every band in a single pass over the image
        pts = pd.concat(labelled_pts, ignore_index=True)
        pts['coords'] = [(pt.bounds[0], pt.bounds[1]) for pt in pts['geometry']]
        pts = pts.drop(columns=['id', 'geometry'])
        # float conversion avoids unsigned-integer wrap-around in red - NIR
        samples = np.array(list(im.sample(pts['coords'])), dtype=float).reshape(-1, im.count)

    # divide by image scalar if max band values > 1000 (decided per image)
    if np.nanmax(bands['blue']) > 1e3:
        bands = {band: values / im_scalar for band, values in bands.items()}
        samples = samples / im_scalar

    # whole-column assignment: no chained indexing, so the values are always stored
    for k, band in enumerate(band_names):
        pts[band] = samples[:, k]
    pts['NDSI'] = (pts['red'] - pts['NIR']) / (pts['red'] + pts['NIR'])

    return {'date': im_date,
            'bands': bands,
            'ndsi': (bands['red'] - bands['NIR']) / (bands['red'] + bands['NIR']),
            'bounds': bounds,
            'snow_pts': snow_pts,
            'non_snow_pts': non_snow_pts,
            'pts': pts}


def _plot_training_data(images):
    """Plot RGB image with point locations, band histograms and NDSI histogram.

    One column is drawn per entry of `images` (dicts returned by
    _load_image_and_points). Y-axis labels and the band legend are only drawn
    on the first column.
    """
    n_im = len(images)
    fig, axes = plt.subplots(3, n_im, figsize=(16, 16) if n_im > 1 else (10, 16),
                             gridspec_kw={'height_ratios': [4, 1, 1]}, squeeze=False)
    for k, image in enumerate(images):
        ax_rgb, ax_bands, ax_ndsi = axes[:, k]
        bands, bounds = image['bands'], image['bounds']
        # RGB image and data point locations
        ax_rgb.imshow(np.dstack([bands['red'], bands['green'], bands['blue']]),
                      extent=(bounds.left, bounds.right, bounds.bottom, bounds.top))
        image['snow_pts'].plot(ax=ax_rgb, markersize=15, color='cyan', label='snow')
        image['non_snow_pts'].plot(ax=ax_rgb, markersize=15, color='orange', label='non-snow')
        ax_rgb.legend(loc='lower right')
        ax_rgb.set_xlabel('Easting [m]')
        ax_rgb.set_title(image['date'])
        # band histograms (positive values only)
        for band, color in zip(band_names, band_colors):
            values = bands[band]
            ax_bands.hist(values[values > 0], color=color, histtype='step',
                          linewidth=2, bins=100, label=band if band == 'NIR' else band)
        ax_bands.set_xlabel('Surface reflectance')
        ax_bands.grid()
        # NDSI histogram (pixels where NDSI is undefined are left out)
        ndsi = image['ndsi']
        ax_ndsi.hist(ndsi[np.isfinite(ndsi)], bins=100)
        ax_ndsi.set_xlabel('NDSI')
        ax_ndsi.grid()
        if k == 0:
            ax_rgb.set_ylabel('Northing [m]')
            ax_bands.set_ylabel('Pixel counts')
            ax_bands.legend(loc='right')
            ax_ndsi.set_ylabel('Pixel counts')
    plt.show()


# -----Initialize data_pts for single classifier (next step)
data_pts_full = gpd.GeoDataFrame()

# -----Figure fonts (set once so that every figure, including the first, uses them)
plt.rcParams.update({'font.size': 14, 'font.sans-serif': 'Arial'})

# -----Loop through sites
for site_ID, site_name, site_im_fns in zip(site_IDs, site_names, im_fns):

    print('----------')
    print(site_name)

    # a bare string is a site with a single image
    if isinstance(site_im_fns, str):
        site_im_fns = (site_im_fns,)

    # -----Load images and classified points
    images = [_load_image_and_points(site_name, site_ID, im_fn) for im_fn in site_im_fns]

    # -----Plot RGB images, data point locations, and band histograms
    print('Training data:')
    _plot_training_data(images)

    # -----Set up training data
    # merge the points of all images at this site and remove rows containing NaN
    data_pts = pd.concat([image['pts'] for image in images], ignore_index=True)
    data_pts = data_pts.dropna()

    # -----Add data points to full data points DF
    data_pts_full = pd.concat([data_pts_full, data_pts], ignore_index=True)

    # -----Test supervised classification algorithms
    # Split data points into features (band values) and target variable (snow)
    X = data_pts[feature_cols] # features
    y = data_pts['snow'] # target variable

    # Split data points into training (70%) and testing (30%)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    y_true = y_test.to_numpy()

    # Iterate over classifiers
    accuracy = [] # classifier overall accuracy
    K = [] # classifier kappa score
    TP, TN, FP, FN = [], [], [], [] # confusion matrix components
    for _name, clf in zip(names, classifiers):

        # train classifier (the shared estimator objects are re-fit for every site)
        clf.fit(X_train, y_train)

        # Calculate statistics
        y_pred = clf.predict(X_test)
        accuracy.append(metrics.accuracy_score(y_test, y_pred)) # overall accuracy
        K.append(metrics.cohen_kappa_score(y_test, y_pred)) # Kappa score
        # confusion matrix
        TP.append(int(np.count_nonzero((y_true == 1) & (y_pred == 1))))
        TN.append(int(np.count_nonzero((y_true == 0) & (y_pred == 0))))
        FP.append(int(np.count_nonzero((y_true == 0) & (y_pred == 1))))
        FN.append(int(np.count_nonzero((y_true == 1) & (y_pred == 0))))

    # Determine best classifier based on accuracy (first one wins on ties)
    results = pd.DataFrame({'Classifier': names, 'Accuracy': accuracy, 'Kappa_score': K,
                            'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN})
    best_idx = int(np.argmax(accuracy))
    clf_best_name = names[best_idx]
    clf_best = classifiers[best_idx]
    print(results)
    print('')
    print('Best accuracy classifier: ' + clf_best_name)

    # -----Save most accurate classifier
    # pickled inside the loop, while clf_best still holds this site's fit
    if save_outputs:
        clf_fn = out_path + site_ID + '_classifier.sav'
        with open(clf_fn, 'wb') as f:
            pickle.dump(clf_best, f)
        print('Most accurate classifier saved to file: ',clf_fn)
        feature_cols_fn = out_path + site_ID + '_classifier_feature_cols.pkl'
        with open(feature_cols_fn, 'wb') as f:
            pickle.dump(feature_cols, f)
        print('Feature columns saved to file: ',feature_cols_fn)

### Test one classifier for all sites

In [None]:
# -----Test supervised classification algorithms on the points from all sites combined
# Expects data_pts_full (one row per classified point, with the feature columns and
# a 'snow' column), names and classifiers to be defined by earlier cells.
# Split data points into features (band values) and target variable (snow)
feature_cols = ['blue', 'green', 'red', 'NIR', 'NDSI']
X = data_pts_full[feature_cols] # features
y = data_pts_full['snow'] # target variable

# Split data points into training (70%) and testing (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
y_true = y_test.to_numpy()

# Iterate over classifiers
accuracy = [] # classifier overall accuracy
K = [] # classifier kappa score
TP, TN, FP, FN = [], [], [], [] # confusion matrix components
for _name, clf in zip(names, classifiers):

    # train classifier (re-fits the shared estimator objects in place)
    clf.fit(X_train, y_train)

    # Calculate statistics
    y_pred = clf.predict(X_test)
    accuracy.append(metrics.accuracy_score(y_test, y_pred)) # overall accuracy
    K.append(metrics.cohen_kappa_score(y_test, y_pred)) # Kappa score
    # confusion matrix
    TP.append(int(np.count_nonzero((y_true == 1) & (y_pred == 1))))
    TN.append(int(np.count_nonzero((y_true == 0) & (y_pred == 0))))
    FP.append(int(np.count_nonzero((y_true == 0) & (y_pred == 1))))
    FN.append(int(np.count_nonzero((y_true == 1) & (y_pred == 0))))

# Determine best classifier based on accuracy (first one wins on ties)
results = pd.DataFrame({'Classifier': names, 'Accuracy': accuracy, 'Kappa_score': K,
                        'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN})
best_idx = int(np.argmax(accuracy))
clf_best_name = names[best_idx]
clf_best = classifiers[best_idx]
print(results)
print('')
print('Best accuracy classifier: ' + clf_best_name)

# -----Save most accurate classifier
# `with` closes each output file even if pickling fails
if save_outputs:
    clf_fn = out_path+'all_sites_classifier.sav'
    with open(clf_fn, 'wb') as f:
        pickle.dump(clf_best, f)
    print('Most accurate classifier saved to file: ',clf_fn)
    feature_cols_fn = out_path+'all_sites_classifier_feature_cols.pkl'
    with open(feature_cols_fn, 'wb') as f:
        pickle.dump(feature_cols, f)
    print('Feature columns saved to file: ',feature_cols_fn)

### Test unsupervised classification algorithms

In [None]:
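# -----KMeans
# NOTE: this experiment is disabled (every line below is commented out). Before re-enabling it,
# make sure the band arrays (b1, g1, r1, nir1, b2, g2, r2, nir2) and the coordinate vectors
# (im1_x, im1_y, im2_x, im2_y) it reads are defined in the notebook namespace.
# image 1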
# I1_real = ~np.isnan(b1)
# im1_x_mesh, im1_y_mesh = np.meshgrid(im1_x, im1_y)
# im1_x_real = im1_x_mesh[I1_real]
# im1_y_real = im1_y_mesh[I1_real]
# b1_real = b1[I1_real].flatten()
# g1_real = g1[I1_real].flatten()
# r1_real = r1[I1_real].flatten()
# nir1_real = nir1[I1_real].flatten()
# ndsi1_real = ((r1_real - nir1_real)/(r1_real + nir1_real))
# X1 = np.column_stack((b1_real, g1_real, r1_real, nir1_real, ndsi1_real))

# # image 2
# I2_real = ~np.isnan(b2)
# im2_x_mesh, im2_y_mesh = np.meshgrid(im2_x, im2_y)
# im2_x_real = im2_x_mesh[I2_real]
# im2_y_real = im2_y_mesh[I2_real]
# b2_real = b2[I2_real].flatten()
# g2_real = g2[I2_real].flatten()
# r2_real = r2[I2_real].flatten()
# nir2_real = nir2[I2_real].flatten()
# ndsi2_real = ((r2_real - nir2_real)/(r2_real + nir2_real))
# X2 = np.column_stack((b2_real, g2_real, r2_real, nir2_real, ndsi2_real))

# # generate classifier and classify images
# n = 3 # number of clusters
# Y1 = KMeans(n_clusters=n).fit(X1)
# labels1 = Y1.labels_
# Y2 = KMeans(n_clusters=n).fit(X2)
# labels2 = Y2.labels_

# # reshape from flat array to original shape
# clusters1 = np.zeros((np.shape(b1)[0], np.shape(b1)[1]))
# clusters1[:] = np.nan
# clusters1[I1_real] = labels1
# clusters2 = np.zeros((np.shape(b2)[0], np.shape(b2)[1]))
# clusters2[:] = np.nan
# clusters2[I2_real] = labels2

# fig, ax = plt.subplots(2, 2, figsize=(12,12))
# ax[0,0].imshow(np.dstack([r1, g1, b1]))
# ax[0,1].imshow(clusters1)
# ax[1,0].imshow(np.dstack([r2, g2, b2]))
# ax[1,1].imshow(clusters2)
# plt.show()