# Classification accuracy assessment at Lemon Creek Glacier

Rainey Aberle

2022

In [1]:
# -----Import packages
import os
import glob
import numpy as np
import geopandas as gpd
import pandas as pd
import scipy
import xarray as xr
import rioxarray as rxr
import rasterio as rio
from shapely.geometry import Polygon
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics 
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import sys
import pickle

In [2]:
# -----Define paths in directory
# This cell defines the site name and folder layout, makes the project's helper
# module importable, loads the dataset characteristics dictionary, and lists the
# classified-point shapefiles for the site.
site_name = 'LemonCreek'
# base directory (path to snow-cover-mapping/)
base_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping/'
# output folder for best classifier
out_path = base_path + 'inputs-outputs/'
# path to classified points used to train and test classifiers
data_pts_path = base_path + '../classified-points/'
# path to AOI shapefiles
AOI_path = base_path + '../study-sites/'

# -----Determine settings
terrain_parameters = False # whether to use terrain parameters (elevation, slope, aspect) in classification
save_figures = True # whether to save output figures

# -----Add path to functions
# Guarded so that re-running the cell does not keep inserting the same entry.
functions_path = base_path + 'functions/'
if functions_path not in sys.path:
    sys.path.insert(1, functions_path)
import pipeline_utils as f

# -----Load dataset characteristics dictionary
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open(base_path + 'inputs-outputs/datasets_characteristics.pkl', 'rb') as fn:
    dataset_dict = pickle.load(fn)

# -----Load classified points
# The working directory is changed so the glob below returns bare file names;
# later cells prepend `data_pts_path` when reading them.
os.chdir(data_pts_path)
# Build the pattern from `site_name` so the listing follows the configured site
# instead of a second hard-coded copy of the name.
data_pts_fns = sorted(glob.glob('*' + site_name + '*.shp'))
data_pts_fns

['LemonCreek_20210729_20_no-snow.shp',
 'LemonCreek_20210729_20_snow.shp',
 'LemonCreek_20210822_19_no-snow.shp',
 'LemonCreek_20210822_19_snow.shp']

## PlanetScope

In [3]:
# -----PlanetScope accuracy assessment
# Loads the trained PlanetScope classifier, samples each image at the manually
# classified points for this site, predicts classes at those points, and reports
# overall accuracy, Cohen's kappa, and the confusion matrix for snow vs. no-snow.

# -----Load trained classifier and feature columns
# Context managers make sure the file handles are closed after loading.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
clf_fn = base_path+'inputs-outputs/PS_classifier_all_sites.sav'
with open(clf_fn, 'rb') as clf_file:
    clf = pickle.load(clf_file)
feature_cols_fn = base_path+'inputs-outputs/PS_feature_cols.pkl'
with open(feature_cols_fn, 'rb') as feature_cols_file:
    feature_cols = pickle.load(feature_cols_file)

# -----Subset dataset dictionary
dataset = 'PlanetScope'
ds_dict = dataset_dict[dataset]

# -----Set up testing data
# path to images
im_path = base_path + '../snowline-package/' + site_name + '/images/'
# dates (YYYYMMDD) of the images used for classified points, taken from the
# characters that follow "<site_name>_" in each snow shapefile name
im_dates = [s[len(site_name)+1:len(site_name)+9] for s in data_pts_fns
            if (site_name in s) and ('_snow.shp' in s)]
# number of images used for classified points
num_images = len(im_dates)
if num_images == 0:
    raise FileNotFoundError('No "_snow.shp" classified points found for ' + site_name
                            + ' in ' + data_pts_path)
# (file name suffix, class ID) for each class of classified points
class_suffixes = [('_no-snow.shp', 0), ('_snow.shp', 1)]
# sampled points for each image, concatenated once after the loop
data_pts_list = []
# loop through each image
for j in range(0, num_images):
    # determine image date
    im_date = im_dates[j]
    # load classified points for every class that exists for this site and date
    class_pts = []
    for suffix, class_id in class_suffixes:
        matches = [s for s in data_pts_fns if (site_name in s) and (suffix in s) and (im_date in s)]
        if len(matches) > 0:
            data_pts_snow_fn = matches[0] # last file read; used below to build the image file name
            data_pts_class = gpd.read_file(data_pts_path + data_pts_snow_fn) # read file
            data_pts_class['class'] = class_id # determine class ID
            class_pts.append(data_pts_class)
    # concatenate the classes directly (no empty placeholder frame needed)
    data_pts = pd.concat(class_pts, ignore_index=True)

    # Load image
    # the 11 characters after the first underscore of the shapefile name
    # (e.g., "20210729_20") identify the image file
    Idate = data_pts_snow_fn.index('_')+1
    im_fn = data_pts_snow_fn[Idate:Idate+11]+'_adj.tif' # image file name
    im_date = im_fn[0:4]+'-'+im_fn[4:6]+'-'+im_fn[6:8] # image capture date (not used below)
    im = xr.open_dataset(im_path + im_fn)
    # remove no data values and account for image scalar
    im = im.where(im!=-9999)
    im = im / 1e4
    # define bands
    im['blue'] = im['band_data'][0]
    im['green'] = im['band_data'][1]
    im['red'] = im['band_data'][2]
    im['NIR'] = im['band_data'][3]
    im['NDSI'] = (im['green'] - im['NIR']) / (im['green'] + im['NIR'])

    # reproject data points to image CRS
    data_pts = data_pts.to_crs(im.rio.crs)
    data_pts = data_pts.drop(columns=['id'])
    data_pts = data_pts.dropna()
    # grab x and y sample points (first part of each multi-part geometry)
    x_pts = [pt.geoms[0].coords.xy[0][0] for pt in data_pts.geometry]
    y_pts = [pt.geoms[0].coords.xy[1][0] for pt in data_pts.geometry]
    # sample image values at data points (nearest pixel to each point)
    for band in feature_cols:
        data_pts[band] = [im[band].sel(x=x_pt, y=y_pt, method='nearest').data
                          for x_pt, y_pt in zip(x_pts, y_pts)]
    data_pts_list.append(data_pts)
# Combine points from all images with a fresh 0..n-1 index
data_pts_full = pd.concat(data_pts_list, ignore_index=True)
# Reduce memory usage in data pts
data_pts_full = f.reduce_memory_usage(data_pts_full)

# -----Test the trained classifier
# features
X = data_pts_full[feature_cols]
# target variable
y = data_pts_full['class']
# Predict class values using trained classifier
y_pred = clf.predict(X)
# Adjust outputs to only test snow and no-snow:
# predicted classes 1, 2 -> snow (1); predicted classes 3, 4, 5 -> no-snow (0)
y_pred = np.where(y_pred <= 2, 1, 0)
# Calculate overall accuracy
accuracy = metrics.accuracy_score(y, y_pred)
# Calculate Kappa score
K = metrics.cohen_kappa_score(y, y_pred)
# Calculate confusion matrix
CM = metrics.confusion_matrix(y, y_pred)
# Print results
print('PlanetScope')
print('----------')
print('n='+str(len(y_pred)))
print('Overall accuracy: ' + str(accuracy))
print('Kappa score: ' + str(K))
print('Confusion matrix:')
CM


Mem. usage decreased to 0.09 Mb (12.5% reduction)
PlanetScope
----------
Overall accuracy: 0.9104104104104104
Kappa score: 0.8210019709492689
Confusion matrix:


array([[864,  34],
       [145, 955]])

In [13]:
# Dates (YYYYMMDD) parsed from the "_snow.shp" classified-point file names in the PlanetScope cell
im_dates

['20210729', '20210822']

## Landsat

In [12]:
# -----Landsat accuracy assessment
# Loads the trained Landsat classifier, samples one masked Landsat image at the
# classified points digitized for the 20210822 date, predicts classes at those
# points, and reports overall accuracy, Cohen's kappa, and the confusion matrix
# for snow vs. no-snow.

# -----Load trained classifier and feature columns
# Context managers make sure the file handles are closed after loading.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
clf_fn = base_path+'inputs-outputs/L_classifier_all_sites.sav'
with open(clf_fn, 'rb') as clf_file:
    clf = pickle.load(clf_file)
feature_cols_fn = base_path+'inputs-outputs/L_feature_cols.pkl'
with open(feature_cols_fn, 'rb') as feature_cols_file:
    feature_cols = pickle.load(feature_cols_file)

# -----Subset dataset dictionary
dataset = 'Landsat'
ds_dict = dataset_dict[dataset]

# -----Set up testing data
# load image
im_path = base_path + '../study-sites/' + site_name + '/imagery/Landsat/masked/'
# sorted so the selected file does not depend on the order glob happens to return
ims_fn = sorted(glob.glob(im_path + '*masked.nc'))
L_im_date = '20210823' # date string identifying the Landsat image file
matching_ims = [fn for fn in ims_fn if L_im_date in fn]
if len(matching_ims) == 0:
    raise FileNotFoundError('No masked Landsat image containing ' + L_im_date + ' found in ' + im_path)
im_fn = matching_ims[0]
im = xr.open_dataset(im_fn)
# load classified points
PS_im_date = '20210822' # date in the classified-point file names used for this image
# (file name suffix, class ID) for each class of classified points
class_suffixes = [('_no-snow.shp', 0), ('_snow.shp', 1)]
class_pts = []
for suffix, class_id in class_suffixes:
    matches = [s for s in data_pts_fns if (site_name in s) and (suffix in s) and (PS_im_date in s)]
    if len(matches) == 0:
        raise FileNotFoundError('No "' + suffix + '" classified points found for ' + site_name
                                + ' on ' + PS_im_date)
    data_pts_class = gpd.read_file(data_pts_path + matches[0]) # read file
    data_pts_class['class'] = class_id # determine class ID
    class_pts.append(data_pts_class)
# concatenate the classes directly (no empty placeholder frame needed)
data_pts = pd.concat(class_pts, ignore_index=True)
# reproject data points to image CRS
data_pts = data_pts.to_crs('EPSG:32608')
data_pts = data_pts.drop(columns=['id'])
data_pts = data_pts.dropna()
# grab x and y sample points (first part of each multi-part geometry)
x_pts = [pt.geoms[0].coords.xy[0][0] for pt in data_pts.geometry]
y_pts = [pt.geoms[0].coords.xy[1][0] for pt in data_pts.geometry]
# sample image values at data points (nearest pixel to each point)
for band in feature_cols:
    data_pts[band] = [im[band].sel(x=x_pt, y=y_pt, method='nearest').data
                      for x_pt, y_pt in zip(x_pts, y_pts)]
# remove rows with no data: keep only points whose sampled SR_B2 value is not NaN
keep_rows = [~np.isnan(val) for val in data_pts['SR_B2']]
data_pts = data_pts.iloc[keep_rows].reset_index(drop=True)

# reduce memory usage in data pts
data_pts = f.reduce_memory_usage(data_pts)

# -----Test the trained classifier
# features
X = data_pts[feature_cols]
# target variable
y = data_pts['class']
# Predict class values using trained classifier
y_pred = clf.predict(X)
# Adjust outputs to only test snow and no-snow:
# predicted classes 1, 2 -> snow (1); predicted classes 3, 4, 5 -> no-snow (0)
y_pred = np.where(y_pred <= 2, 1, 0)
# Calculate overall accuracy
accuracy = metrics.accuracy_score(y, y_pred)
# Calculate Kappa score
K = metrics.cohen_kappa_score(y, y_pred)
# Calculate confusion matrix
CM = metrics.confusion_matrix(y, y_pred)
# Print results
print('Landsat')
print('----------')
print('n='+str(len(y_pred))) # sample size, reported the same way as for the other sensors
print('Overall accuracy: ' + str(accuracy))
print('Kappa score: ' + str(K))
print('Confusion matrix:')
CM

['2021-08-23T20:00:38.000000000']
Mem. usage decreased to 0.05 Mb (9.7% reduction)
Landsat
----------
Overall accuracy: 0.9038929440389294
Kappa score: 0.8074507641125086
Confusion matrix:


array([[399,  18],
       [ 61, 344]])

## Sentinel-2 SR

In [16]:
# -----Sentinel-2 accuracy assessment
# Loads two masked Sentinel-2 images and the trained Sentinel-2 classifier,
# samples each image at the classified points, predicts classes at those points,
# and reports overall accuracy, Cohen's kappa, and the confusion matrix for
# snow vs. no-snow.

# -----Load images
im_path = base_path + '../study-sites/' + site_name + '/imagery/Sentinel-2/masked/'
im1_fn = im_path + 'LemonCreek_Sentinel2_20210730T202846_masked.nc'
im1 = xr.open_dataset(im1_fn)
im2_fn = im_path + 'LemonCreek_Sentinel2_20210831T201849_masked.nc'
im2 = xr.open_dataset(im2_fn)
im_list = [im1, im2]

# -----Load trained classifier and feature columns
# Context managers make sure the file handles are closed after loading.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
clf_fn = base_path+'inputs-outputs/S2_classifier_all_sites.sav'
with open(clf_fn, 'rb') as clf_file:
    clf = pickle.load(clf_file)
feature_cols_fn = base_path+'inputs-outputs/S2_feature_cols.pkl'
with open(feature_cols_fn, 'rb') as feature_cols_file:
    feature_cols = pickle.load(feature_cols_file)

# -----Subset dataset dictionary
dataset = 'Sentinel2'
ds_dict = dataset_dict[dataset]

# -----Set up testing data
# Dates (YYYYMMDD) of the classified points, taken from the characters that follow
# "<site_name>_" in each snow shapefile name. Computed here so this cell does not
# depend on a variable left behind by another sensor's cell.
pts_dates = [s[len(site_name)+1:len(site_name)+9] for s in data_pts_fns
             if (site_name in s) and ('_snow.shp' in s)]
# Images are paired with classified-point dates by position (image i <-> i-th date);
# the image dates in the file names above differ from the point dates.
if len(pts_dates) < len(im_list):
    raise ValueError('Found ' + str(len(pts_dates)) + ' classified-point dates for '
                     + str(len(im_list)) + ' images')
# (file name suffix, class ID) for each class of classified points
class_suffixes = [('_no-snow.shp', 0), ('_snow.shp', 1)]
# sampled points for each image, concatenated once after the loop
data_pts_list = []
for i, im in enumerate(im_list):
    pts_date = pts_dates[i]
    # load classified points for both classes
    class_pts = []
    for suffix, class_id in class_suffixes:
        matches = [s for s in data_pts_fns if (site_name in s) and (suffix in s) and (pts_date in s)]
        if len(matches) == 0:
            raise FileNotFoundError('No "' + suffix + '" classified points found for ' + site_name
                                    + ' on ' + pts_date)
        data_pts_class = gpd.read_file(data_pts_path + matches[0]) # read file
        data_pts_class['class'] = class_id # determine class ID
        class_pts.append(data_pts_class)
    # concatenate the classes directly (no empty placeholder frame needed)
    data_pts = pd.concat(class_pts, ignore_index=True)
    # reproject data points to image CRS
    data_pts = data_pts.to_crs('EPSG:32608')
    data_pts = data_pts.drop(columns=['id'])
    data_pts = data_pts.dropna()
    # grab x and y sample points (first part of each multi-part geometry)
    x_pts = [pt.geoms[0].coords.xy[0][0] for pt in data_pts.geometry]
    y_pts = [pt.geoms[0].coords.xy[1][0] for pt in data_pts.geometry]
    # sample image values at data points (nearest pixel to each point)
    for band in feature_cols:
        data_pts[band] = [im[band].sel(x=x_pt, y=y_pt, method='nearest').data
                          for x_pt, y_pt in zip(x_pts, y_pts)]
    # remove rows with no data: keep only points whose sampled B2 value is not NaN
    keep_rows = [~np.isnan(val) for val in data_pts['B2']]
    data_pts = data_pts.iloc[keep_rows].reset_index(drop=True)
    data_pts_list.append(data_pts)
# concatenate to full data points with a fresh 0..n-1 index (no duplicate labels)
data_pts_full = pd.concat(data_pts_list, ignore_index=True)
# reduce memory usage in data pts
data_pts_full = f.reduce_memory_usage(data_pts_full)

# -----Test the trained classifier
# features
X = data_pts_full[feature_cols]
# target variable
y = data_pts_full['class']
# Predict class values using trained classifier
y_pred = clf.predict(X)
# Adjust outputs to only test snow and no-snow:
# predicted classes 1, 2 -> snow (1); predicted classes 3, 4, 5 -> no-snow (0)
y_pred = np.where(y_pred <= 2, 1, 0)
# Calculate overall accuracy
accuracy = metrics.accuracy_score(y, y_pred)
# Calculate Kappa score
K = metrics.cohen_kappa_score(y, y_pred)
# Calculate confusion matrix
CM = metrics.confusion_matrix(y, y_pred)
# Print results
print('Sentinel-2')
print('----------')
print('n='+str(len(y_pred)))
print('Overall accuracy: ' + str(accuracy))
print('Kappa score: ' + str(K))
print('Confusion matrix:')
CM

Mem. usage decreased to 0.15 Mb (7.3% reduction)
Sentinel-2
----------
n=1824
Overall accuracy: 0.9764254385964912
Kappa score: 0.9516203052598854
Confusion matrix:


array([[ 744,    4],
       [  39, 1037]])