# Weed Mapping using SVM - Thistle - 8 bands
### Used after collecting sampling data

In [6]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import rasterio
import rioxarray
import geopandas as gpd
import os
import glob
import pickle
import re
from joblib import Parallel, delayed
from simpledbf import Dbf5


from sklearn import svm, metrics
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.inspection import permutation_importance
from ttictoc import tic,toc

PyTables is not installed. No support for HDF output.


In [2]:
# Band combinations
_8bands = ['1_red','1_green','1_blue','2_blue','2_green','3_red','4_re','2_nir']
_5bands = ['2_blue','2_green','3_red','4_re','2_nir']
_3bands = ['1_red','1_green','1_blue']


pathsMaj = glob.glob('E:\Sync\_Documents\_Letter_invasives\_Data\_samples\_majority_*.dbf')
pathsMin = glob.glob('E:\Sync\_Documents\_Letter_invasives\_Data\_samples\_minority_*.dbf')

### Derive K-means centers for 'majority' class

In [4]:
def peform_k_clustering(samplesDf, bandselection, species):
    
    # k is similar to number of samples in minority class
    k = 255
    
    if species=='T':
        label = 'majorityT'
    else:
        label = 'majorityH'

    df = samplesDf[samplesDf['class']==label].loc[:,bandselection] # Get minority samples
    arr = df.to_numpy()

    # STEP 2: Perform Kmeans
    kmeansBatch = MiniBatchKMeans(n_clusters=k, init='k-means++',
                    n_init=20, max_iter=200,
                    verbose=0, random_state=0,
                    batch_size=1024).fit(arr)

    dfOut = pd.DataFrame(kmeansBatch.cluster_centers_, columns=bandselection)
    dfOut['class'] = 0 # majority label
    
    dfMin = samplesDf[samplesDf['class']==species].loc[:,bandselection]
    dfMin['class'] = 1 # minority/target label
    dfOut = pd.concat([dfOut,dfMin], axis=0)
    
    return(dfOut)

### Step 3: Grid Search

In [21]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

def grid_search(trainDf, testDf, modelPath, bandselection, tuned_parameters, scores, species, seed, outCsv):
    trainDf = peform_k_clustering(trainDf, bandselection, species)
    
    # Manipulate test dataset
    if species=='T':
        testDf = testDf.loc[(testDf['class']=='T')|(testDf['class']=='majorityT')]
        testDf.loc[:,'class']  = np.where(testDf['class']=='T', 1, 0)
        
    else:    
        testDf = testDf.loc[(testDf['class']=='H')|(testDf['class']=='majorityH')]
        testDf.loc[:,'class']  = np.where(testDf['class']=='H', 1, 0)
    
#     for score in scores:
#         print("# Tuning hyper-parameters for %s" % score)
#         print()

#         clf = GridSearchCV(SVC(), tuned_parameters, scoring=score, verbose=3, n_jobs=11, pre_dispatch = '2*n_jobs')
#         clf.fit(trainDf.loc[:, bandselection].to_numpy(), trainDf.loc[:,'class'].to_numpy())

#         print("Best parameters set found on development set:")
#         print()
#         print(clf.best_params_)
#         print()
#         print("Grid scores on development set:")
#         print()
#         means = clf.cv_results_["mean_test_score"]
#         stds = clf.cv_results_["std_test_score"]
#         for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
#             print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
#         print()

#         print("Detailed classification report:")
#         print()
#         print("The model is trained on the full development set.")
#         print("The scores are computed on the full evaluation set.")
#         print()
#         y_true, y_pred = testDf.loc[:,'class'].to_numpy(), clf.predict(testDf.loc[:,bandselection].to_numpy())
#         print(classification_report(y_true, y_pred))
#         print()
        
#         print("Classification matrix:")
#         print()
#         matrix = metrics.confusion_matrix(y_true, y_pred)
#         print(matrix)
#         print()
           

     
    print("# Tuning hyper-parameters for %s" % scores)
    print()
    
    clf = GridSearchCV(SVC(), tuned_parameters, scoring=scores, verbose=3, n_jobs=11, pre_dispatch = '2*n_jobs')
    clf.fit(trainDf.loc[:, bandselection].to_numpy(), trainDf.loc[:,'class'].to_numpy())
    
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_["mean_test_score"]
    stds = clf.cv_results_["std_test_score"]
    for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
    
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = testDf.loc[:,'class'].to_numpy(), clf.predict(testDf.loc[:,bandselection].to_numpy())
    report = classification_report(y_true, y_pred)
    print(report)
    print()

    print("Classification matrix:")
    print()
    matrix = metrics.confusion_matrix(y_true, y_pred)
    print(matrix)
    print()
    
    modelPath = modelPath.format(seed, species, str(len(bandselection)))
    pickle.dump(clf, open(modelPath, 'wb'))
    
    # organize metrics
    recallMaj,  precisionMaj, fMaj, supportMaj = report.split()[5:9]
    recallMin,  precisionMin, fMin, supportMin = report.split()[10:14]
    recallMacro,  precisionMacro, fMacro, supportMacro = report.split()[19:23]
    accuracy = report.split()[15]
    
    # Save to CSV file
    myCsvRow = [seed, sp, str(len(bandselection)), clf.best_params_['C'], recallMaj,  precisionMaj, fMaj, supportMaj, recallMin,  precisionMin, fMin, supportMin, recallMacro,  precisionMacro, fMacro, supportMacro, accuracy]
    myCsvRow = ';'.join(map(str, myCsvRow))
    
    with open(outCsv,'a') as fd:
        fd.write(myCsvRow)
        fd.write("\n")
    

In [22]:
# Set the parameters by cross-validation

tuned_parameters = [
    {"kernel": ["linear"], "C": [1, 10, 100, 1000]}]

#scores = ["precision_macro", "recall_macro", metrics.make_scorer(metrics.f1_score, labels=[1], average='macro')]
#scores = (metrics.make_scorer(metrics.f1_score, labels=[1], average='macro'))
scores = "f1_weighted"

# ---------------------------------------------------------------------------------------------------------------------

# Run all models, for all sample datasets (no. = 10), all band combinations (no. = 3), and species (no.= 2)

for i in range(len(pathsMaj)): # Loop (and read) through sample datasets using 10 different random seeds
   
    # derive seed number from filepath
    seed = re.findall(r'\d+', pathsMaj[i])
    
    # preprare training and test datasets
    minority = Dbf5(pathsMin[i]).to_dataframe()
    minority.rename(columns={'Class':'class'}, inplace=True)
    
    majority = Dbf5(pathsMaj[i]).to_dataframe()
    majority.rename(columns={'band1':'1_red', 'band2':'1_green', 'band3':'1_blue', 'band4':'2_blue', 'band5':'2_green', 'band6':'3_red', 'band7':'4_re', 'band8':'2_nir'}, inplace=True)
    
    df = pd.concat([minority, majority], axis=0).drop(columns=['class_1', 'x', 'y', 'ID', 'index_righ'])

    print(df.head())

    
    trainDf = df.loc[df['split']=='train'] 
    testDf = df.loc[df['split']=='test']
    
    
    # Derive species from filepath
    if 'spH' in pathsMaj[i]:
        sp = 'H'
    else:
        sp = 'T'
        
    
    for b in [_3bands, _5bands, _8bands]:
        kdf = peform_k_clustering(trainDf, b, sp)
            
        print(f'Grid search is performed for species {sp} using {str(len(b))} bands and seed{str(seed)}')
        out = grid_search(trainDf, testDf, "E:\\Sync\\_Documents\\_Letter_invasives\\_Data\_models\\_bestmodel_seed{}_sp{}_b{}.sav", b, tuned_parameters, scores, sp, seed, 'E:\Sync\_Documents\_Letter_invasives\_Data\_models\_modellingOutputs.csv')
        print()
        print('_____________________________________________________')
        print()

  class Quadrat     1_red   1_green    1_blue    2_blue   2_green     3_red  \
0     H     2_6  0.941176  0.937255  0.537255  0.013913  0.054563  0.021669   
1     H     2_6  0.984314  0.972549  0.466667  0.028552  0.064575  0.045270   
2     H     2_6  0.960784  0.972549  0.478431  0.049925  0.084581  0.075656   
3     H     2_6  0.925490  0.909804  0.490196  0.028602  0.068072  0.052862   
4     H     2_6  0.972549  0.956863  0.549020  0.028602  0.068072  0.052862   

       4_re     2_nir  split  
0  0.152925  0.304192  train  
1  0.158581  0.257337  train  
2  0.180739  0.270967  train  
3  0.158105  0.264038  train  
4  0.158105  0.264038  train  
Grid search is performed for species H using 3 bands and seed['101']
# Tuning hyper-parameters for make_scorer(f1_score, labels=[1], average=macro)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters set found on development set:

{'C': 100, 'kernel': 'linear'}

Grid scores on development set:

0.963 (+/-0.032) f

### Summarize output

In [24]:
csv = pd.read_csv("E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_models\\_modellingOutputs _edited.csv")

In [25]:
csv.head()

Unnamed: 0,Seed,Species,Bands,C,Pre_maj,Rec_maj,F1_maj,Support_maj,Pre_min,Rec_min,F1_min,Support_min,Pre_avg,Rec_avg,F1_avg,Support_avg,Accuracy,TP,FP,FN
0,['101'],H,3,100,1,0.98,0.99,179237,0.03,1.0,0.05,75,0.51,0.99,0.52,179312,0.98,75,2888,0
1,['101'],H,5,100,1,0.81,0.89,179237,0.0,0.41,0.0,75,0.5,0.61,0.45,179312,0.8,31,34948,44
2,['101'],H,8,100,1,0.99,1.0,179237,0.07,0.99,0.13,75,0.53,0.99,0.56,179312,0.99,74,987,1
3,['103'],H,3,100,1,0.98,0.99,179237,0.03,1.0,0.05,75,0.51,0.99,0.52,179312,0.98,75,2756,0
4,['103'],H,5,1000,1,0.84,0.92,179237,0.0,0.41,0.0,75,0.5,0.63,0.46,179312,0.84,31,27995,44


In [43]:
groupbyDf = csv.drop(columns=['Seed']).groupby(['Species','Bands']).agg(['max'])

In [44]:
groupbyDf.transpose()

Unnamed: 0_level_0,Species,H,H,H,T,T,T
Unnamed: 0_level_1,Bands,3,5,8,3,5,8
C,min,10.0,100.0,10.0,100.0,1000.0,1000.0
Pre_maj,min,1.0,1.0,1.0,1.0,1.0,1.0
Rec_maj,min,0.98,0.81,0.99,0.98,0.99,0.98
F1_maj,min,0.99,0.89,1.0,0.99,1.0,0.99
Support_maj,min,179237.0,179237.0,179237.0,176142.0,176142.0,176142.0
Pre_min,min,0.02,0.0,0.05,0.03,0.06,0.02
Rec_min,min,0.97,0.31,0.99,0.99,0.99,0.99
F1_min,min,0.05,0.0,0.09,0.05,0.12,0.04
Support_min,min,75.0,75.0,75.0,98.0,98.0,98.0
Pre_avg,min,0.51,0.5,0.52,0.51,0.53,0.51


### Prepare and predict raster

In [224]:
def predict_to_csv(gdf, svm_path, out_path):
    # Setup
    svm_model = pickle.load(open(svm_path, 'rb')) # load svm model using Pickle
    
    if 'b3' in svm_path:
        bandselection = ['band1','band2','band3']

    elif 'b5' in svm_path:
        bandselection = ['band4','band5','band6','band7','band8']

    else:
        bandselection = ['band1','band2','band3','band4','band5','band6','band7','band8']

        
    arr = gdf.loc[:,bandselection].to_numpy()
    
    with sklearn.config_context(working_memory=3000, assume_finite=False):
        prediction = svm_model.predict(arr)
        
    gdf = pd.concat([gdf, pd.Series(prediction, name='detect')],axis=1)
    gdf = gdf.loc[gdf['detect']==1]

    out_path = out_path+os.path.basename(svm_path).split('.')[0]+'.shp'
    gdf.to_file(out_path, index=False)

Overwriting tile_predictions.py


In [233]:
for path in glob.glob("E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_models\\*.sav"): 
    predict_to_csv(gdf, path, 'E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_predictions\\')

yes
yes


### TP, FP and FN by quadrat based on test datasets

In [91]:
modelPredT = ["E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp"] #,"E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b5.shp", "E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b8.shp"] 
modelPredH = ["E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['103']_spH_b3.shp", "E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['103']_spH_b5.shp", "E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['103']_spH_b8.shp"]

majTestT = gpd.read_file("E:\Sync\_Documents\_Letter_invasives\_Data\_samples\_majority_spT_seed54.shp").set_crs('EPSG:26911', allow_override=True)
majTestT = majTestT.loc[majTestT['split']=='test']
majTestH = gpd.read_file("E:\Sync\_Documents\_Letter_invasives\_Data\_samples\_majority_spH_seed103.shp").set_crs('EPSG:26911', allow_override=True)
majTestH = majTestH.loc[majTestH['split']=='test']

minTestT = gpd.read_file("E:\Sync\_Documents\_Letter_invasives\_Data\_samples\_minority_spT_seed54.shp").set_crs('EPSG:26911', allow_override=True)
minTestT = minTestT.loc[minTestT['split']=='test']
minTestH = gpd.read_file("E:\Sync\_Documents\_Letter_invasives\_Data\_samples\_minority_spH_seed103.shp").set_crs('EPSG:26911', allow_override=True)
minTestH = minTestH.loc[minTestH['split']=='test']


In [92]:
transects = gpd.read_file('E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\Transect_grids.shp').set_crs('EPSG:26911', allow_override=True)
transects = transects.loc[:,['Quadrat', 'geometry']]
transectNames = transects['Quadrat'].unique()
print(transectNames)

['3_3' '2_8' '1_2' '3_2' '3_1' '1_3' '1_1' '2_2' '2_7' '2_5' '2_9' '2_4'
 '2_10' '2_1' '2_3' '0_0' '2_6']


In [111]:
def classReportByQuadrat(predGdfPaths, majTestGdf, minTestGdf, transectsGdf):
    minTestGdf2 = minTestGdf.copy()
    minTestGdf2['geometry'] = minTestGdf2.buffer(distance=0.005)
    
    paths, trans, tps, fns, fps = [], [], [], [], []
    for i in range(len(predGdfPaths)):
        pred = gpd.read_file(predGdfPaths[i]).set_crs('EPSG:26911', allow_override=True)
        pred = pred.sjoin(transectsGdf)
        
        for tr in transects['Quadrat'].unique():
            pred2 = pred.loc[pred['Quadrat']==tr,:]
            
            
            minSlct = pred2.overlay(minTestGdf2, how='intersection')
            majSlct = pred2.overlay(majTestGdf, how='intersection')
            
            tp = len(minSlct)
            fp = len(majSlct)

            fn =  len(minTestGdf2.loc[minTestGdf['Quadrat']==tr]) - tp
            
            print(modelPredT[i], 'transect: '+tr, 'tp: '+str(tp), 'fn: '+str(fn), 'fp: '+str(fp))
            
            paths.append(modelPredT[i])
            trans.append(tr)
            tps.append(tp)
            fns.append(fn)
            fps.append(fp)
    
    return(paths, trans, tps, fns, fps)

In [104]:
modelPredT = ["E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp", "E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b5.shp", "E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b8.shp"] 
pathsT, transT, tpsT, fnsT, fpsT = classReportByQuadrat(modelPredT, majTestT, minTestT, transects)

E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 3_3 tp: 0 fn: 0 fp: 18
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 2_8 tp: 0 fn: 0 fp: 207
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 1_2 tp: 0 fn: 0 fp: 163
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 3_2 tp: 0 fn: 0 fp: 60
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 3_1 tp: 0 fn: 0 fp: 119
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 1_3 tp: 0 fn: 0 fp: 8
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 1_1 tp: 0 fn: 0 fp: 145
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 2_2 tp: 30 fn: 0 fp: 314
E:\Sync\_Documents\

In [114]:
modelPredH = ["E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['103']_spH_b3.shp", "E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['103']_spH_b5.shp", "E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['103']_spH_b8.shp"]
pathsH, transH, tpsH, fnsH, fpsH = classReportByQuadrat(modelPredH, majTestH, minTestH, transects)

E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 3_3 tp: 0 fn: 0 fp: 179
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 2_8 tp: 0 fn: 0 fp: 23
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 1_2 tp: 0 fn: 0 fp: 74
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 3_2 tp: 0 fn: 0 fp: 1926
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 3_1 tp: 0 fn: 0 fp: 290
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 1_3 tp: 2 fn: 0 fp: 57
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 1_1 tp: 0 fn: 0 fp: 89
E:\Sync\_Documents\_Letter_invasives\_Data\_predictions\_bestmodel_seed['54']_spT_b3.shp transect: 2_2 tp: 0 fn: 0 fp: 2
E:\Sync\_Documents\_L

In [113]:
dfH = pd.DataFrame({'path': pathsH, 'quadrat': transH, 'TP': tpsH, 'FN': fnsH, 'FP':fpsH})
dfH.to_csv("E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_byquadrat\\hawkweed.csv")