## Notebook to develop a supervised classification algorithm for identifying snow in PlanetScope 4-band imagery
Rainey Aberle

Adapted from the [scikit-learn classifier comparison tutorial](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html)

### Initial setup

In [None]:
# -----Import packages
import os
import glob
import numpy as np
import rasterio as rio
import geopandas as gpd
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn import metrics 
import matplotlib.pyplot as plt
import matplotlib
import time
import sys
import pickle

In [None]:
# -----Output toggle
# When True, output figures and the best classifier are written to file
save_outputs = True

# -----Directory layout
# base directory (path to planet-snow/); the trailing slash is relied on below
base_path = '/Users/raineyaberle/Research/PhD/Planet_snow_cover/planet-snow/'
# folder that receives the best classifier
out_path = f'{base_path}inputs-outputs/'
# folder holding the classified points used to train and test classifiers
data_pts_path = f'{base_path}../classified-points/'

# -----Make the project's helper functions importable
functions_path = f'{base_path}functions/'
sys.path.insert(1, functions_path)
from classification_utils import crop_images_to_AOI, classify_image, calculate_SCA

### Define supervised classification algorithms to test 

In [None]:
# -----Classifier names and estimators
# Each entry pairs a display name with the estimator it describes, so the two
# lists built below cannot drift out of step.
named_classifiers = (
#     ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))), # keeps crashing kernel when classifying images!
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("Logistic Regression", LogisticRegression(random_state = 0)),
)

# Parallel lists used by the cells below
names = [label for label, _ in named_classifiers]
classifiers = [estimator for _, estimator in named_classifiers]

# -----Classified points
# Switch into the points folder and list every shapefile found there
os.chdir(data_pts_path)
data_pts_fns = glob.glob('*.shp')
data_pts_fns

### Test site-specific classifiers

Use the classified points at each site to determine the best site-specific classifier and calculate its accuracy.

Classified points split: 70% training, 30% testing

In [None]:
# -----Define site IDs
# Wolverine (WO), SitKusa (SK), Mendenhall (ME), Easton (EA), Blue (BL), Emmons (EM)
site_IDs = ['WO', 'SK', 'ME', 'EA']#, 'BL', 'EM']
site_names = ['Wolverine', 'SitKusa', 'Mendenhall', 'Easton']#, 'Blue', 'Emmons']

# -----Initialize data_pts for single classifier (next step)
data_pts_full = gpd.GeoDataFrame()


def _load_class_points(site_ID, fn_suffix, class_value):
    """Load one class of classified points for a site.

    Picks the first file in `data_pts_fns` whose name contains both `site_ID`
    and `fn_suffix`, reads it from `data_pts_path`, and adds a 'class' column
    set to `class_value`.

    Returns the tuple (file name, GeoDataFrame).
    Raises FileNotFoundError when no file name matches.
    """
    matches = [x for x in data_pts_fns if (site_ID in x) and (fn_suffix in x)]
    if not matches:
        raise FileNotFoundError('No classified points file containing "' + fn_suffix
                                + '" found for site ' + site_ID + ' in ' + data_pts_path)
    pts = gpd.read_file(data_pts_path + matches[0])
    pts['class'] = class_value
    return matches[0], pts


# -----Loop through sites
for i, (site_ID, site_name) in enumerate(zip(site_IDs, site_names)):

    print('----------')
    print(site_name)

    # -----Set up training data
    # load classified points (class values: 1 = snow, 2 = ice, 3 = rock, 4 = shadowed snow)
    data_pts_snow_fn, data_pts_snow = _load_class_points(site_ID, 'snow.shp', 1)
    data_pts_ice_fn, data_pts_ice = _load_class_points(site_ID, 'ice.shp', 2)
    data_pts_rock_fn, data_pts_rock = _load_class_points(site_ID, 'rock.shp', 3)
    class_pts = [data_pts_snow, data_pts_ice, data_pts_rock]
    # shadowed snow points are loaded for every site except Easton
    if site_name != 'Easton':
        data_pts_snow_sh_fn, data_pts_snow_sh = _load_class_points(site_ID, 'snow-shadowed.shp', 4)
        class_pts.append(data_pts_snow_sh)
    # merge data points
    data_pts = pd.concat(class_pts, ignore_index=True)

    # -----Load image
    im_path = base_path+'../study-sites/'+site_name+'/imagery/Planet/adjusted-filtered/'
    # image file name is built from characters 3-13 of the snow points file name
    im_fn = data_pts_snow_fn[3:14]+'_adj.tif'
    im_date = im_fn[0:4]+'-'+im_fn[4:6]+'-'+im_fn[6:8] # image capture date
    # open the image in a context manager so the file handle is always released
    with rio.open(im_path+im_fn) as im:
        epsg = str(im.crs)[5:] # grab EPSG code (drops the leading "EPSG:")
        # read bands; order matches the point sampling below
        # (band 1 = blue, 2 = green, 3 = red, 4 = NIR)
        b = im.read(1).astype(float)
        g = im.read(2).astype(float)
        r = im.read(3).astype(float)
        nir = im.read(4).astype(float)
        # define coordinates grid
        im_x = np.linspace(im.bounds.left, im.bounds.right, num=np.shape(b)[1])
        im_y = np.linspace(im.bounds.top, im.bounds.bottom, num=np.shape(b)[0])

        # -----Reformat data points coordinates
        # reproject to image epsg
        data_pts = data_pts.to_crs(epsg)
        # remove "id" column (tolerate shapefiles that do not have one)
        data_pts = data_pts.drop(columns=['id'], errors='ignore')
        # remove rows containing NaN
        data_pts = data_pts.dropna()
        # add coords column
        data_pts['coords'] = [(pt.bounds[0], pt.bounds[1]) for pt in data_pts['geometry']]
        # sample all bands at the points in a single pass; cast to float so the
        # NDSI arithmetic below cannot wrap around on unsigned integer rasters
        samples = np.array(list(im.sample(data_pts['coords'])), dtype=float).reshape(-1, im.count)

    # divide by image scalar if max band values > 1000
    im_scalar = 10000
    apply_scalar = bool(np.nanmax(b) > 1e3)
    if apply_scalar:
        b = b / im_scalar
        g = g / im_scalar
        r = r / im_scalar
        nir = nir / im_scalar
        samples = samples / im_scalar
    # calculate NDSI (pixels where red + NIR == 0 become NaN/inf without a warning)
    with np.errstate(divide='ignore', invalid='ignore'):
        ndsi = (r - nir) / (r + nir)

    # store sampled band values at points
    for band_idx, band_name in enumerate(['blue', 'green', 'red', 'NIR']):
        data_pts[band_name] = samples[:, band_idx]
    # add NDSI column
    data_pts['NDSI'] = (data_pts['red'] - data_pts['NIR']) / (data_pts['red'] + data_pts['NIR'])
    # add data points to full data points DF
    data_pts_full = pd.concat([data_pts_full, data_pts], ignore_index=True)

    # -----Plot RGB images, data point locations, and band histograms
#     print('Training data:')
#     fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8,16), gridspec_kw={'height_ratios': [4, 1, 1]})
#     plt.rcParams.update({'font.size': 14, 'font.sans-serif': 'Arial'})
#     # Image 1
#     ax1.imshow(np.dstack([r, g, b]), 
#                extent=(np.min(im_x), np.max(im_x), np.min(im_y), np.max(im_y)))
#     data_pts.loc[data_pts['class']==1].plot(ax=ax1, markersize=15, color='cyan', label='snow')
#     data_pts.loc[data_pts['class']==2].plot(ax=ax1, markersize=15, color='blue', label='ice')
#     data_pts.loc[data_pts['class']==3].plot(ax=ax1, markersize=15, color='orange', label='rock')
#     data_pts.loc[data_pts['class']==4].plot(ax=ax1, markersize=15, color='grey', label='shadowed snow')
#     ax1.legend(loc='lower right')
#     ax1.set_xlabel('Easting [m]')
#     ax1.set_ylabel('Northing [m]')
#     ax1.set_title(im_date)
#     ax2.hist(b[b>0].flatten(), color='blue', histtype='step', linewidth=2, bins=100, label='blue')
#     ax2.hist(g[g>0].flatten(), color='green', histtype='step', linewidth=2, bins=100, label='green')
#     ax2.hist(r[r>0].flatten(), color='red', histtype='step', linewidth=2, bins=100, label='red')
#     ax2.hist(nir[nir>0].flatten(), color='brown', histtype='step', linewidth=2, bins=100, label='NIR')
#     ax2.set_xlabel('Surface reflectance')
#     ax2.set_ylabel('Pixel counts')
#     ax2.grid()
#     ax2.legend(loc='right')
#     ax3.hist(ndsi.flatten(), bins=100)
#     ax3.set_xlabel('NDSI')
#     ax3.set_ylabel('Pixel counts')
#     ax3.grid()
#     plt.show()

    # -----Test supervised classification algorithms
    # Split data points into features (band values + NDSI) and target variable (class label)
    feature_cols = ['blue', 'green', 'red', 'NIR', 'NDSI']
    X = data_pts[feature_cols] # features
    y = data_pts['class'] # target variable

    # Split data points into training (70%) and testing (30%)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Iterate over classifiers
    accuracy = [] # classifier overall accuracy
    K = [] # classifier kappa score
    TP, TN, FP, FN = [], [], [], [] # confusion matrix components (snow vs. everything else)
    snow_true = (np.asarray(y_test) == 1)
    for name, clf in zip(names, classifiers):

        # train classifier (the shared estimator object is re-fit in place for every site)
        clf.fit(X_train, y_train)

        # Calculate statistics
        y_pred = clf.predict(X_test)
        # overall accuracy
        accuracy.append(metrics.accuracy_score(y_test, y_pred))
        # Kappa score
        K.append(metrics.cohen_kappa_score(y_test, y_pred))
        # confusion matrix, treating class 1 (snow) as the positive class
        snow_pred = (np.asarray(y_pred) == 1)
        TP.append(int(np.count_nonzero(snow_true & snow_pred)))
        TN.append(int(np.count_nonzero(~snow_true & ~snow_pred)))
        FP.append(int(np.count_nonzero(~snow_true & snow_pred)))
        FN.append(int(np.count_nonzero(snow_true & ~snow_pred)))

    # Determine best classifier based on accuracy (first one listed wins ties)
    results = pd.DataFrame({'Classifier': names, 'Accuracy': accuracy, 'Kappa_score': K,
                            'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN})
    best_idx = int(np.argmax(accuracy))
    clf_best_name = names[best_idx]
    clf_best = classifiers[best_idx]
    print(results)
    print('')
    print('Best accuracy classifier: ' + clf_best_name)

    # -----Save most accurate classifier
    # Saved inside the loop, while clf_best is still fit to this site's points
    if save_outputs:
        clf_fn = out_path+site_ID+'_classifier.sav'
        with open(clf_fn, 'wb') as clf_file:
            pickle.dump(clf_best, clf_file)
        print('Most accurate classifier saved to file: ',clf_fn)
        feature_cols_fn = out_path+site_ID+'_classifier_feature_cols.pkl'
        with open(feature_cols_fn, 'wb') as feature_cols_file:
            pickle.dump(feature_cols, feature_cols_file)
        print('Feature columns saved to file: ',feature_cols_fn)

# -----Plot spectral information of classes
# (class value, marker colour, legend label) for each surface class
class_styles = [
    (1, 'cyan', 'snow'),
    (2, 'blue', 'ice'),
    (3, 'orange', 'rock'),
    (4, 'grey', 'shadowed snow'),
]


def _scatter_classes(axis, column, title):
    """Scatter the `column` values of every class against its class value on `axis`."""
    for class_value, colour, label in class_styles:
        values = data_pts_full.loc[data_pts_full['class']==class_value][column]
        axis.scatter(class_value*np.ones(len(values)), values,
                     s=10, color=colour, label=label)
    axis.grid()
    axis.set_title(title)


# one panel per band
fig1, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
plt.rcParams.update({'font.size': 14, 'font.sans-serif': 'Arial'})
band_panels = [(ax1, 'blue', 'Blue'), (ax2, 'green', 'Green'), (ax3, 'red', 'Red'), (ax4, 'NIR', 'NIR')]
for band_axis, band_column, band_title in band_panels:
    _scatter_classes(band_axis, band_column, band_title)

# separate figure for NDSI
fig2, ax = plt.subplots(1, 1, figsize=(8, 6))
_scatter_classes(ax, 'NDSI', 'NDSI')
plt.show()

### Test one classifier for all sites

In [None]:
# -----Test supervised classification algorithms
# Split data points into features (band values + NDSI) and target variable (class label)
feature_cols = ['blue', 'green', 'red', 'NIR', 'NDSI']
X = data_pts_full[feature_cols] # features
y = data_pts_full['class'] # target variable

# Split data points into training (70%) and testing (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Iterate over classifiers
accuracy = [] # classifier overall accuracy
K = [] # classifier kappa score
TP, TN, FP, FN = [], [], [], [] # confusion matrix components (snow vs. everything else)
snow_true = (np.asarray(y_test) == 1)
for name, clf in zip(names, classifiers):

    # train classifier (re-fits the shared estimator object in place)
    clf.fit(X_train, y_train)

    # Calculate statistics
    y_pred = clf.predict(X_test)
    # overall accuracy
    accuracy.append(metrics.accuracy_score(y_test, y_pred))
    # Kappa score
    K.append(metrics.cohen_kappa_score(y_test, y_pred))
    # confusion matrix, treating class 1 (snow) as the positive class and every
    # other class as negative; the class values set by the per-site loop start
    # at 1, so comparing against 0 would always count zero points
    snow_pred = (np.asarray(y_pred) == 1)
    TP.append(int(np.count_nonzero(snow_true & snow_pred)))
    TN.append(int(np.count_nonzero(~snow_true & ~snow_pred)))
    FP.append(int(np.count_nonzero(~snow_true & snow_pred)))
    FN.append(int(np.count_nonzero(snow_true & ~snow_pred)))

# Determine best classifier based on accuracy (first one listed wins ties)
results = pd.DataFrame({'Classifier': names, 'Accuracy': accuracy, 'Kappa_score': K,
                        'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN})
best_idx = int(np.argmax(accuracy))
clf_best_name = names[best_idx]
clf_best = classifiers[best_idx]
print(results)
print('')
print('Best accuracy classifier: ' + clf_best_name)

# -----Save most accurate classifier
if save_outputs:
    clf_fn = out_path+'all_sites_classifier.sav'
    # context managers make sure the pickle files are flushed and closed
    with open(clf_fn, 'wb') as clf_file:
        pickle.dump(clf_best, clf_file)
    print('Most accurate classifier saved to file: ',clf_fn)
    feature_cols_fn = out_path+'all_sites_classifier_feature_cols.pkl'
    with open(feature_cols_fn, 'wb') as feature_cols_file:
        pickle.dump(feature_cols, feature_cols_file)
    print('Feature columns saved to file: ',feature_cols_fn)

### Test unsupervised classification algorithms

In [None]:
# -----KMeans
# NOTE: everything in this cell is commented out, so running it does nothing.
# The disabled code clusters the non-NaN pixels of two images into n = 3 KMeans
# clusters using blue, green, red, NIR and NDSI, then plots each image next to
# its cluster map. It expects per-image variables (b1, g1, r1, nir1, im1_x,
# im1_y and the matching *2 names) that are not defined in the visible part of
# this notebook -- they would have to be created before re-enabling it.
# image 1
# I1_real = ~np.isnan(b1)
# im1_x_mesh, im1_y_mesh = np.meshgrid(im1_x, im1_y)
# im1_x_real = im1_x_mesh[I1_real]
# im1_y_real = im1_y_mesh[I1_real]
# b1_real = b1[I1_real].flatten()
# g1_real = g1[I1_real].flatten()
# r1_real = r1[I1_real].flatten()
# nir1_real = nir1[I1_real].flatten()
# ndsi1_real = ((r1_real - nir1_real)/(r1_real + nir1_real))
# X1 = np.column_stack((b1_real, g1_real, r1_real, nir1_real, ndsi1_real))

# # image 2
# I2_real = ~np.isnan(b2)
# im2_x_mesh, im2_y_mesh = np.meshgrid(im2_x, im2_y)
# im2_x_real = im2_x_mesh[I2_real]
# im2_y_real = im2_y_mesh[I2_real]
# b2_real = b2[I2_real].flatten()
# g2_real = g2[I2_real].flatten()
# r2_real = r2[I2_real].flatten()
# nir2_real = nir2[I2_real].flatten()
# ndsi2_real = ((r2_real - nir2_real)/(r2_real + nir2_real))
# X2 = np.column_stack((b2_real, g2_real, r2_real, nir2_real, ndsi2_real))

# # generate classifier and classify images
# n = 3 # number of clusters
# Y1 = KMeans(n_clusters=n).fit(X1)
# labels1 = Y1.labels_
# Y2 = KMeans(n_clusters=n).fit(X2)
# labels2 = Y2.labels_

# # reshape from flat array to original shape
# clusters1 = np.zeros((np.shape(b1)[0], np.shape(b1)[1]))
# clusters1[:] = np.nan
# clusters1[I1_real] = labels1
# clusters2 = np.zeros((np.shape(b2)[0], np.shape(b2)[1]))
# clusters2[:] = np.nan
# clusters2[I2_real] = labels2

# fig, ax = plt.subplots(2, 2, figsize=(12,12))
# ax[0,0].imshow(np.dstack([r1, g1, b1]))
# ax[0,1].imshow(clusters1)
# ax[1,0].imshow(np.dstack([r2, g2, b2]))
# ax[1,1].imshow(clusters2)
# plt.show()