## Notebook to develop a supervised classification algorithm for identifying snow in PlanetScope 4-band imagery
Rainey Aberle

Adapted from the [scikit-learn classifier comparison tutorial](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html)

### Initial setup

In [None]:
# -----Import packages
import os
import glob
import numpy as np
import rasterio as rio
import geopandas as gpd
import pandas as pd
import ee
import richdem as rd
import scipy
from shapely.geometry import Polygon
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn import metrics 
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import matplotlib
import sys
import pickle

In [None]:
# -----Output switch
# When True, the classifier-selection cells pickle the best classifier and
# the list of feature columns into out_path.
save_outputs = True

# -----Directory layout
# root of the planet-snow/ repository
base_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/planet-snow/'
# where the best classifier is written
out_path = f'{base_path}inputs-outputs/'
# where the classified point shapefiles (training / testing data) are stored
data_pts_path = f'{base_path}../classified-points/'

# -----Terrain switch
# When True, elevation, slope, and aspect are added to the classification features.
terrain_parameters = False

# -----Make the project helper functions importable
sys.path.insert(1, f'{base_path}functions/')
import ps_pipeline_utils as f

### Authenticate and initialize Google Earth Engine (GEE)

__Note:__ The first time you run the following cell, you will be asked to authenticate your GEE account for use in this notebook. This will send you to an external web page, where you will walk through the GEE authentication workflow and copy an authentication code back into this notebook when prompted. 

In [None]:
# Earth Engine is only initialized when terrain parameters are requested.
if terrain_parameters:
    try:
        ee.Initialize()
    except Exception:
        # Initialization failed (presumably no stored credentials): run the
        # interactive authentication flow, then initialize again.
        # `Exception` rather than a bare `except` so that Ctrl-C / kernel
        # interrupts are not swallowed and turned into an authentication prompt.
        ee.Authenticate()
        ee.Initialize()
    print('GEE authenticated and initialized.')
else:
    print('Terrain parameters set to false. No need to authenticate GEE.')

### Define supervised classification algorithms to test 

In [None]:
# -----Candidate classifiers, each paired with its display name
candidates = [
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))), # keeps crashing kernel when classifying images!
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("Logistic Regression", LogisticRegression(random_state = 0)),
]

# parallel lists used by the cells below
names = [label for label, _ in candidates]
classifiers = [estimator for _, estimator in candidates]

# -----Classified points
# switch to the points folder and list its shapefiles in sorted order
os.chdir(data_pts_path)
data_pts_fns = sorted(glob.glob('*.shp'))
data_pts_fns

### Test site-specific classifiers

Use classified points at each site to determine the best site-specific classifier. Calculate classifier accuracy using K-fold cross-validation with 10 folds.  

In [None]:
import warnings
# silence all warnings for the remainder of the session
warnings.filterwarnings('ignore')

# -----Sites with classified points
site_names = ['Gulkana', 'SCascade', 'Sperry', 'Wolverine']

# -----Number of Monte Carlo simulations to assess accuracy
num_mc_sims = 100

# -----Accumulator for the points of every site (used by the all-sites classifier step)
data_pts_full = gpd.GeoDataFrame()

# -----Feature columns: the spectral features, plus terrain features and
# month-of-year when terrain parameters are enabled
feature_cols = ['blue', 'green', 'red', 'NIR', 'NDSI']
if terrain_parameters==True:
    feature_cols = feature_cols + ['elevation', 'slope', 'aspect', 'moy']

In [None]:
# -----Loop through sites
# For each site: gather the classified points for every image date, sample the
# image bands (and, optionally, terrain parameters) at those points, compare the
# candidate classifiers with K-fold cross-validation, and save the most accurate one.

# shapefile name suffix of each class -> integer class ID
class_id_by_name = {'snow': 1, 'snow-shadowed': 2, 'ice': 3, 'rock': 4, 'water': 5}

for site_name in site_names:
    
    print('----------')
    print(site_name)
    print('----------')

    # -----Set up training data
    # the snow shapefiles determine which image dates have classified points at this site
    snow_fns = [s for s in data_pts_fns if (site_name in s) and ('snow.shp' in s)]
    im_dates = [s[len(site_name)+1:len(site_name)+9] for s in snow_fns]
    if not im_dates:
        # nothing to train on; without this guard the previous site's points would be reused
        print('No classified points found for ' + site_name + '. Skipping site.')
        continue

    # points from every image at this site (the original trained on the last image only)
    data_pts_site_list = []

    # loop through each image
    for im_date in im_dates:

        # -----Load classified points of every class available for this site and date
        class_pts_list = []
        for class_name, class_id in class_id_by_name.items():
            class_fns = [s for s in data_pts_fns
                         if (site_name in s) and (class_name+'.shp' in s) and (im_date in s)]
            if not class_fns:
                continue # class not digitized for this site and date
            class_pts = gpd.read_file(data_pts_path + class_fns[0]) # read file
            class_pts['class'] = class_id # assign class ID
            class_pts_list.append(class_pts)
            print(class_fns[0])
            if class_name == 'snow':
                data_pts_snow_fn = class_fns[0] # the image file name is derived from this below
        # a snow file always exists here because im_dates was derived from the snow files
        data_pts = pd.concat(class_pts_list, ignore_index=True)

        # -----Load image and sample band values at the points
        im_path = base_path+'../study-sites/'+site_name+'/imagery/PlanetScope/adjusted-filtered/'
        Idate = data_pts_snow_fn.index('_')+1
        im_fn = data_pts_snow_fn[Idate:Idate+11]+'_adj.tif' # image file name
        # context manager so the raster is closed once sampling is done
        with rio.open(im_path+im_fn) as im:
            im_crs, im_bounds = im.crs, im.bounds
            epsg = im_crs.to_epsg() # EPSG code of the image CRS
            # band values are divided by the image scalar if the max of band 1 is > 1000
            apply_scalar = np.nanmax(im.read(1)) > 1e3
            im_scalar = 10000

            # -----Reformat data points coordinates
            # reproject to image epsg
            data_pts = data_pts.to_crs(epsg)
            # remove "id" column
            data_pts = data_pts.drop(columns=['id'])
            # remove rows containing NaN
            data_pts = data_pts.dropna()
            data_pts = data_pts.reset_index()
            # add coords column
            data_pts['coords'] = [(pt.bounds[0], pt.bounds[1]) for pt in data_pts['geometry']]
            # add site_name column
            data_pts['site_name'] = site_name
            # sample all bands at the points in a single pass over the image
            samples = list(im.sample(data_pts['coords']))

        # columns 0-3 of each sample are stored as blue, green, red, NIR;
        # cast to float so the NDSI arithmetic cannot wrap around on unsigned integer data
        for band_ix, band_name in enumerate(['blue', 'green', 'red', 'NIR']):
            data_pts[band_name] = [float(s[band_ix]) for s in samples]
            # divide values by im_scalar if applicable
            if apply_scalar:
                data_pts[band_name] = data_pts[band_name].div(im_scalar)
        # add NDSI column
        data_pts['NDSI'] = (data_pts['red'] - data_pts['NIR']) / (data_pts['red'] + data_pts['NIR'])

        if terrain_parameters:
            # -----Load DEM
            # bounding box of the image, passed to the DEM query
            bb_x = im_bounds.left, im_bounds.right, im_bounds.right, im_bounds.left, im_bounds.left
            bb_y = im_bounds.bottom, im_bounds.bottom, im_bounds.top, im_bounds.top, im_bounds.bottom
            bb_gdf = gpd.GeoDataFrame({'geometry': [Polygon(list(zip(bb_x, bb_y)))]}, crs=im_crs)
            DEM, DEM_x, DEM_y, AOI_UTM = f.query_GEE_for_DEM(bb_gdf, im_path, im_fn)
            # flatten DEM to 2D
            DEM = DEM.reshape((DEM.shape[0], DEM.shape[1]))
            DEM_rd = rd.rdarray(DEM, no_data=-9999) # rich DEM array of DEM
            # calculate slope and aspect using DEM
            slope = rd.TerrainAttribute(DEM_rd, attrib='slope_degrees')
            aspect = rd.TerrainAttribute(DEM_rd, attrib='aspect')
            # convert from rdarray to numpy array
            slope, aspect = np.array(slope).astype(int), np.array(aspect).astype(int)
            # NOTE(review): scipy.interpolate.interp2d is deprecated in recent SciPy
            # releases -- confirm it is available in the installed version.
            # interpolate elevation at coords
            f_DEM = scipy.interpolate.interp2d(DEM_x, DEM_y, DEM)
            data_pts['elevation'] = [f_DEM(pt[0], pt[1])[0] for pt in data_pts['coords']]
            # interpolate slope at coords
            f_slope = scipy.interpolate.interp2d(DEM_x, DEM_y, slope)
            data_pts['slope'] = [f_slope(pt[0], pt[1])[0] for pt in data_pts['coords']]
            # interpolate aspect at coords
            f_aspect = scipy.interpolate.interp2d(DEM_x, DEM_y, aspect)
            data_pts['aspect'] = [f_aspect(pt[0], pt[1])[0] for pt in data_pts['coords']]
            # add month-of-year (moy) column
            data_pts['moy'] = float(im_fn[4:6])

        # keep the points for this site and for the all-sites DataFrame
        data_pts_site_list.append(data_pts)
        data_pts_full = pd.concat([data_pts_full, data_pts], ignore_index=True)

    # -----Test supervised classification algorithms
    # combine the points from all images at this site
    data_pts_site = pd.concat(data_pts_site_list, ignore_index=True)
    # Split data points into features (band values / terrain parameters) and target variable (class)
    X = data_pts_site[feature_cols] # features
    y = data_pts_site['class'] # target variable

    # Iterate over classifiers
    num_folds = 10
    accuracy = np.zeros(len(classifiers)) # mean accuracy
    K = np.zeros(len(classifiers)) # mean Kappa score
    for j, clf in enumerate(classifiers):

        # Conduct K-Fold cross-validation
        kfold = KFold(n_splits=num_folds, shuffle=True, random_state=1)
        accuracy_folds = np.zeros(num_folds) # accuracy for each fold
        K_folds = np.zeros(num_folds) # kappa score for each fold
        for k, (train_ix, test_ix) in enumerate(kfold.split(X)):

            # select rows (the fold indices are positional, hence iloc)
            X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
            y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

            # Train classifier
            clf.fit(X_train, y_train)

            # Predict class values using trained classifier
            y_pred = clf.predict(X_test)

            # Calculate overall accuracy and Kappa score
            accuracy_folds[k] = metrics.accuracy_score(y_test, y_pred)
            K_folds[k] = metrics.cohen_kappa_score(y_test, y_pred)

        # Calculate mean accuracy and Kappa score
        accuracy[j] = np.nanmean(accuracy_folds)
        K[j] = np.nanmean(K_folds)

    # Determine best classifier based on accuracy (first one in case of ties)
    results = pd.DataFrame({'Classifier': names, 'Accuracy': accuracy, 'Kappa_score': K})
    best_ix = int(np.argmax(accuracy))
    clf_best_name = names[best_ix]
    clf_best = classifiers[best_ix]
    print(results)
    print('')
    print('Best accuracy classifier: ' + clf_best_name)

    # -----Save most accurate classifier
    if save_outputs:
        # after cross-validation the estimator is only fitted on the last fold's
        # training subset; refit on all points at the site before saving
        clf_best.fit(X, y)
        clf_fn = out_path + 'PS_classifier_'+site_name+'.sav'
        with open(clf_fn, 'wb') as clf_file:
            pickle.dump(clf_best, clf_file)
        print('Most accurate classifier saved to file: ',clf_fn)
        feature_cols_fn = out_path + 'PS_classifier_'+site_name+'_feature_cols.pkl'
        with open(feature_cols_fn, 'wb') as feature_cols_file:
            pickle.dump(feature_cols, feature_cols_file)
        print('Feature columns saved to file: ', feature_cols_fn)

### Test one classifier for all sites

In [None]:
from sklearn.model_selection import KFold

# -----Test supervised classification algorithms on the points from all sites
X = data_pts_full[feature_cols] # features
y = data_pts_full['class'] # target variable

# Iterate over classifiers
num_folds = 10
accuracy = np.zeros(len(classifiers)) # mean accuracy
K = np.zeros(len(classifiers)) # mean Kappa score
for j, (name, clf) in enumerate(zip(names, classifiers)):

    print(name)

    # Conduct K-Fold cross-validation
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=1)
    accuracy_folds = np.zeros(num_folds) # accuracy for each fold
    K_folds = np.zeros(num_folds) # kappa score for each fold
    for k, (train_ix, test_ix) in enumerate(kfold.split(X)):

        # select rows (the fold indices are positional, hence iloc)
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

        # Train classifier
        clf.fit(X_train, y_train)

        # Predict class values using trained classifier
        y_pred = clf.predict(X_test)

        # Calculate overall accuracy and Kappa score
        accuracy_folds[k] = metrics.accuracy_score(y_test, y_pred)
        K_folds[k] = metrics.cohen_kappa_score(y_test, y_pred)

    # Calculate mean accuracy and Kappa score
    accuracy[j] = np.nanmean(accuracy_folds)
    K[j] = np.nanmean(K_folds)

# Determine best classifier based on accuracy (first one in case of ties)
results = pd.DataFrame({'Classifier': names, 'Accuracy': accuracy, 'Kappa_score': K})
best_ix = int(np.argmax(accuracy))
clf_best_name = names[best_ix]
clf_best = classifiers[best_ix]
print(results)
print('')
print('Best accuracy classifier: ' + clf_best_name)

# -----Save most accurate classifier
if save_outputs:
    # after cross-validation the estimator is only fitted on the last fold's
    # training subset; refit on all points before saving
    clf_best.fit(X, y)
    clf_fn = out_path+'PS_classifier_all_sites.sav'
    with open(clf_fn, 'wb') as clf_file:
        pickle.dump(clf_best, clf_file)
    print('Most accurate classifier saved to file: ',clf_fn)
    feature_cols_fn = out_path+'PS_classifier_all_sites_feature_cols.pkl'
    with open(feature_cols_fn, 'wb') as feature_cols_file:
        pickle.dump(feature_cols, feature_cols_file)
    print('Feature columns saved to file: ',feature_cols_fn)

### Plot spectral signatures of each class

In [None]:
# Box plots of each feature, grouped by class, using the points from all sites.
xticks = np.array([1, 2, 3, 4, 5], dtype=float)
xtick_labels = ['Snow', 'Shadowed snow', 'Ice', 'Rock', 'Water']
marker_size = 5
class_values = [1, 2, 3, 4, 5] # class IDs, in the same order as xtick_labels

# -----Spectral bands: one panel per band
fig1, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
plt.rcParams.update({'font.size': 14, 'font.sans-serif': 'Arial'})
band_panels = [(ax1, 'blue', 'Blue'),
               (ax2, 'green', 'Green'),
               (ax3, 'red', 'Red'),
               (ax4, 'NIR', 'NIR')]
for panel, column, title in band_panels:
    # one series of values per class
    values_by_class = [data_pts_full.loc[data_pts_full['class']==c][column] for c in class_values]
    panel.boxplot(values_by_class)
    panel.set_xticks(xticks)
    panel.set_xticklabels(xtick_labels)
    panel.set_title(title)

# -----NDSI: separate figure with a fixed y-range
fig2, ax = plt.subplots(1, 1, figsize=(8, 6))
ndsi_by_class = [data_pts_full.loc[data_pts_full['class']==c]['NDSI'] for c in class_values]
ax.boxplot(ndsi_by_class)
ax.set_xticks(xticks)
ax.set_xticklabels(xtick_labels)
ax.set_ylim(-1, 1)
ax.set_title('NDSI')
plt.show()

# -----Plot terrain parameters of each class
if terrain_parameters==True:
    fig3, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 8))
    plt.rcParams.update({'font.size': 14, 'font.sans-serif': 'Arial'})
    terrain_panels = [(ax1, 'elevation', 'Elevation'),
                      (ax2, 'slope', 'slope'),
                      (ax3, 'aspect', 'aspect')]
    for panel, column, title in terrain_panels:
        values_by_class = [data_pts_full.loc[data_pts_full['class']==c][column] for c in class_values]
        panel.boxplot(values_by_class)
        panel.set_xticks(xticks)
        panel.set_xticklabels(xtick_labels)
        panel.set_title(title)

    plt.show()

### Test unsupervised classification algorithms

In [None]:
# -----KMeans
# image 1
# I1_real = ~np.isnan(b1)
# im1_x_mesh, im1_y_mesh = np.meshgrid(im1_x, im1_y)
# im1_x_real = im1_x_mesh[I1_real]
# im1_y_real = im1_y_mesh[I1_real]
# b1_real = b1[I1_real].flatten()
# g1_real = g1[I1_real].flatten()
# r1_real = r1[I1_real].flatten()
# nir1_real = nir1[I1_real].flatten()
# ndsi1_real = ((r1_real - nir1_real)/(r1_real + nir1_real))
# X1 = np.column_stack((b1_real, g1_real, r1_real, nir1_real, ndsi1_real))

# # image 2
# I2_real = ~np.isnan(b2)
# im2_x_mesh, im2_y_mesh = np.meshgrid(im2_x, im2_y)
# im2_x_real = im2_x_mesh[I2_real]
# im2_y_real = im2_y_mesh[I2_real]
# b2_real = b2[I2_real].flatten()
# g2_real = g2[I2_real].flatten()
# r2_real = r2[I2_real].flatten()
# nir2_real = nir2[I2_real].flatten()
# ndsi2_real = ((r2_real - nir2_real)/(r2_real + nir2_real))
# X2 = np.column_stack((b2_real, g2_real, r2_real, nir2_real, ndsi2_real))

# # generate classifier and classify images
# n = 3 # number of clusters
# Y1 = KMeans(n_clusters=n).fit(X1)
# labels1 = Y1.labels_
# Y2 = KMeans(n_clusters=n).fit(X2)
# labels2 = Y2.labels_

# # reshape from flat array to original shape
# clusters1 = np.zeros((np.shape(b1)[0], np.shape(b1)[1]))
# clusters1[:] = np.nan
# clusters1[I1_real] = labels1
# clusters2 = np.zeros((np.shape(b2)[0], np.shape(b2)[1]))
# clusters2[:] = np.nan
# clusters2[I2_real] = labels2

# fig, ax = plt.subplots(2, 2, figsize=(12,12))
# ax[0,0].imshow(np.dstack([r1, g1, b1]))
# ax[0,1].imshow(clusters1)
# ax[1,0].imshow(np.dstack([r2, g2, b2]))
# ax[1,1].imshow(clusters2)
# plt.show()