## Notebook to develop a supervised classifier for identifying snow in PlanetScope 4-band imagery
Rainey Aberle

Adapted from the [scikit-learn classifier comparison tutorial](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html)

### Import packages

In [None]:
import os
import glob
import numpy as np
import rasterio as rio
import geopandas as gpd
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree 
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn import metrics 
import matplotlib.pyplot as plt
import matplotlib
import time
import pickle

### Define paths to directories

In [None]:
# Root folder for the Wolverine study site; the other paths are built from it.
base_path = '/Users/raineyaberle/Research/PhD/study-sites/Wolverine/'
# Folder holding the radiometrically adjusted PlanetScope images.
im_path = f"{base_path}{'imagery/Planet/2021-04-20_2021-08-25/adjusted-radiometry/'}"
# Folder where the best-performing classifier is written.
out_path = f"{base_path}{'../../planet-snow/inputs-outputs/'}"

### Load image and snow/non-snow classified points

In [None]:
# -----Define EPSG code
# All vector data are reprojected to this CRS before being compared to the image.
epsg = 32606

# -----Load image
im_fn = im_path+'20210815_202055_60_2459_3B_AnalyticMS_SR_clip_PSB.SD_adj.tif'
# The dataset is intentionally left open: a later cell samples it at the
# training point locations via im.sample().
im = rio.open(im_fn)
# read bands
# Band order is blue, green, red, NIR -- the same order used when sampling
# the training points and when reading the other images further down.
# (Bands 2 and 3 were previously assigned to red and green respectively,
# which made the image features disagree with the training features.)
b = im.read(1).astype(float)
g = im.read(2).astype(float)
r = im.read(3).astype(float)
nir = im.read(4).astype(float)
# define coordinates grid
im_x = np.linspace(im.bounds.left, im.bounds.right, num=np.shape(b)[1])
im_y = np.linspace(im.bounds.top, im.bounds.bottom, num=np.shape(b)[0])
print('Image CRS:',im.crs)

# -----Load snow training points
data_snow_pts_fn = base_path+'classified-points/snow_points.shp'
data_snow_pts = gpd.read_file(data_snow_pts_fn)
# reproject to defined CRS
data_snow_pts = data_snow_pts.to_crs(epsg)
print('Snow points CRS:', data_snow_pts.crs)

# -----Load non-snow points
data_non_snow_pts_fn = base_path+'classified-points/non_snow_points.shp'
data_non_snow_pts = gpd.read_file(data_non_snow_pts_fn)
# Reproject to defined CRS
data_non_snow_pts = data_non_snow_pts.to_crs(epsg)
print('Non-snow points CRS:', data_non_snow_pts.crs)

# -----Plot RGB image, data point locations, and band histograms
# Set the font options before creating the figure so they apply to it.
plt.rcParams.update({'font.size': 14, 'font.sans-serif': 'Arial'})
fig, (ax1,ax2) = plt.subplots(2, 1, figsize=(8,16), gridspec_kw={'height_ratios': [4,1]})
# image and point locations
ax1.imshow(np.dstack([r, g, b]), 
           extent=(np.min(im_x), np.max(im_x), np.min(im_y), np.max(im_y)))
data_snow_pts.plot(ax=ax1, markersize=15, color='cyan', label='snow')
data_non_snow_pts.plot(ax=ax1, markersize=15, color='orange', label='non-snow')
ax1.legend(loc='lower right')
ax1.set_xlabel('Easting [m]')
ax1.set_ylabel('Northing [m]')
# histograms
h_b = ax2.hist(b.flatten(), color='blue', histtype='step', linewidth=2, bins=100, label='blue')
h_g = ax2.hist(g.flatten(), color='green', histtype='step', linewidth=2, bins=100, label='green')
h_r = ax2.hist(r.flatten(), color='red', histtype='step', linewidth=2, bins=100, label='red')
h_nir = ax2.hist(nir.flatten(), color='brown', histtype='step', linewidth=2, bins=100, label='NIR')
ax2.set_xlabel('Surface reflectance')
ax2.set_ylabel('Pixel counts')
# The first histogram bin is skipped when choosing the y-limit so it does
# not dominate the scale.
ax2.set_ylim(0,np.max([h_nir[0][1:], h_g[0][1:], h_r[0][1:], h_b[0][1:]])+5000)
ax2.grid()
ax2.legend(loc='right')
plt.show()

### Set up training data
#### Add 'snow' classification column, merge snow and non-snow points, sample band values at points, and add NDSI column

In [None]:
# -----Add snow classification column to data points
# 1 = snow, 0 = non-snow; this becomes the classifier's target variable.
data_snow_pts['snow'] = 1
data_non_snow_pts['snow'] = 0

# -----Merge snow and non-snow points
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two tables.
data_pts = pd.concat([data_snow_pts, data_non_snow_pts], ignore_index=True)
# Add coords column
data_pts['coords'] = [(pt.bounds[0], pt.bounds[1]) for pt in data_pts['geometry']]
# remove "id" and "geometry" columns
data_pts = data_pts.drop(columns=['id', 'geometry'])

# -----Sample band values at points
# Sample the raster once (one row per point, one column per band) instead of
# re-reading it for every band. Casting to float keeps the normalized
# difference below from wrapping around if the raster stores unsigned ints.
band_values = np.array(list(im.sample(data_pts['coords'])), dtype=float)
data_pts['blue'] = band_values[:, 0]
data_pts['green'] = band_values[:, 1]
data_pts['red'] = band_values[:, 2]
data_pts['NIR'] = band_values[:, 3]

# -----Add NDSI column
# Computed here as the normalized difference of the red and NIR bands.
data_pts['NDSI'] = (data_pts['red'] - data_pts['NIR']) / (data_pts['red'] + data_pts['NIR'])

print(data_pts)

### Test classifiers

In [None]:
# Train every candidate classifier on the labelled points, score it on the
# held-out points, map its prediction over the whole image, and save the
# classifier with the highest accuracy.
t1 = time.time() # start timer
test_train = True # = True to split training points into testing and training points
# NOTE(review): test_train is not consulted below; the split is always done.

# -----Split data points into features (band values) and target variable (snow)
feature_cols = ['red', 'NIR', 'NDSI'] #['blue', 'green', 'red', 'NIR', 'NDSI']
X = data_pts[feature_cols] # features
y = data_pts['snow'] # target variable

# -----Split data points into testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----Save image band values in GeoDataFrame
im_x_mesh, im_y_mesh = np.meshgrid(im_x, im_y)
# Use ONE mask for every column so each row describes a single pixel.
# Masking each band by its own NaNs can give columns of different lengths
# or silently misalign pixels. Pixels where red + NIR == 0 are dropped as
# well because their normalized difference is undefined.
valid = ~(np.isnan(b) | np.isnan(g) | np.isnan(r) | np.isnan(nir)) & ((r + nir) != 0)
im_pts = gpd.GeoDataFrame()
im_pts['x'] = im_x_mesh[valid]
im_pts['y'] = im_y_mesh[valid]
im_pts['blue'] = b[valid]
im_pts['green'] = g[valid]
im_pts['red'] = r[valid]
im_pts['NIR'] = nir[valid]
im_pts['NDSI'] = (im_pts['red'] - im_pts['NIR']) / (im_pts['red'] + im_pts['NIR'])

# -----Classifier names
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

# -----Classifiers
# Must stay in the same order as `names`.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# -----Set up figures
plt.rcParams.update({'font.size': 14, 'font.sans-serif': 'Arial'})
# plot RGB image
fig1 = plt.figure(figsize=(8,8))
plt.imshow(np.dstack([r, g, b]), 
            extent=(np.min(im_x)/1000, np.max(im_x)/1000, np.min(im_y)/1000, np.max(im_y)/1000))
plt.title('RGB Image')
plt.ylabel('Northing [km]')
plt.xlabel('Easting [km]')
# classifiers figure: one panel per classifier, sized from the list length
# so adding or removing a classifier does not require editing the layout
n_clf = len(classifiers)
fig, ax = plt.subplots(n_clf, 1, figsize=(6, 5*n_clf), squeeze=False)
ax = ax[:, 0] # 1-D array of axes, even when there is a single classifier

# -----Iterate over classifiers
accuracy = [] # classifier accuracy
# Flat image coordinates and features. Named so they do not overwrite the
# target variable `y` defined above.
im_x_flat, im_y_flat = im_pts['x'].to_numpy(), im_pts['y'].to_numpy()
im_features = im_pts[feature_cols]
for i, (name, clf) in enumerate(zip(names, classifiers)):

    # train classifier
    clf.fit(X_train, y_train)

    # calculate accuracy
    y_pred = clf.predict(X_test)
    accuracy.append(metrics.accuracy_score(y_test, y_pred))

    # predict snow classification for the full image 
    snow_pred = clf.predict(im_features)

    # plot results
    ax[i].scatter(im_x_flat[snow_pred==1]/1000, im_y_flat[snow_pred==1]/1000, s=0.1, color='cyan', label='snow')
    ax[i].scatter(im_x_flat[snow_pred==0]/1000, im_y_flat[snow_pred==0]/1000, s=0.1, color='brown', label='non-snow')
    ax[i].set_title(name + ' | Accuracy: ' + str(np.round(accuracy[i]*100,2)))
    ax[i].set_ylabel('Northing [km]')
    if i == n_clf - 1:
        # only the bottom panel gets an x-axis label
        ax[i].set_xlabel('Easting [km]')

plt.tight_layout()
plt.show()

# -----Determine best classifier based on score and accuracy
results = pd.DataFrame({'Classifier': names, 'Accuracy': accuracy})
# np.argmax returns the first maximum, so ties go to the earliest classifier
best_idx = int(np.argmax(accuracy))
clf_best_name = names[best_idx]
clf_best = classifiers[best_idx]
print(results)
print('')
print('Best accuracy classifier: ' + clf_best_name)

# -----Save optimal classifier
clf_fn = out_path+'best_classifier.sav'
with open(clf_fn, 'wb') as clf_file:
    pickle.dump(clf_best, clf_file)

# To reload the model later, open clf_fn in 'rb' mode and call pickle.load
# on the file object. Only unpickle files you created yourself: pickle can
# execute arbitrary code while loading.

# -----Display time elapsed
t2 = time.time() # stop timer
print('')
print('Time elapsed: ',str((t2-t1)/60),' minutes')

In [None]:
# -----Save optimal classifier
# Writes the best classifier chosen in the previous cell to disk so it can be
# reused without retraining.
clf_fn = out_path+'best_classifier.sav'
# The with-block closes the file handle even if pickling fails.
with open(clf_fn, 'wb') as clf_file:
    pickle.dump(clf_best, clf_file)
print('classifier saved to file')

### Apply best classifier to other images

In [None]:
# -----Load Planet image file names from directory
os.chdir(im_path) # change directory so the relative file names below resolve
# File names are sliced below as YYYYMMDD..., so sorting orders them by date.
im_names = sorted(glob.glob('*SR_clip.tif')) # load all .tif file names
# NOTE(review): the training image loaded earlier ends in '_adj.tif'; confirm
# that this pattern matches the files that are actually in im_path.

dates = [] # acquisition date of each image, in the same order as im_names

# -----Loop through images
for im_name in im_names:

    # extract date from image name
    date = im_name[0:4] + '-' + im_name[4:6] + '-' + im_name[6:8]
    dates.append(np.datetime64(date))

    # open image (closed again as soon as the bands have been read)
    with rio.open(im_name) as im:
        # define bands 
        b = im.read(1).astype(float) 
        g = im.read(2).astype(float) 
        r = im.read(3).astype(float) 
        nir = im.read(4).astype(float) 
        # define coordinates grid
        x = np.linspace(im.bounds.left, im.bounds.right, num=np.shape(b)[1])
        y = np.linspace(im.bounds.top, im.bounds.bottom, num=np.shape(b)[0])

    # compute MNDSI
    # Normalized difference of red and NIR, computed with numpy directly
    # (no extra package needed). Division warnings are silenced here and
    # the affected pixels are masked out below.
    with np.errstate(divide='ignore', invalid='ignore'):
        mndsi = (r - nir) / (r + nir)

    # build the feature table for THIS image, keeping only pixels where every
    # band and the index are finite (same column names as the training table)
    valid = (np.isfinite(b) & np.isfinite(g) & np.isfinite(r)
             & np.isfinite(nir) & np.isfinite(mndsi))
    im_features = pd.DataFrame({
        'blue': b[valid],
        'green': g[valid],
        'red': r[valid],
        'NIR': nir[valid],
        'NDSI': mndsi[valid],
    })

    # predict snow-covered pixels with the best classifier selected above
    # (one prediction per valid pixel, in row-major order)
    snow_pred = clf_best.predict(im_features[feature_cols])