# Split target and absent-class samples

In [1]:
import pandas as pd
import geopandas as gpd
import fiona
from osgeo import ogr, gdal, osr
import os, sys
from osgeo import ogr
from math import ceil
import rasterio

from sklearn.model_selection import train_test_split

### Create fishnet

In [2]:
def fisnettool(outputGridfn,xmin,xmax,ymin,ymax,gridHeight,gridWidth):
    """Write a fishnet (regular grid of rectangular polygons) to an ESRI Shapefile.

    The grid is anchored at the top-left corner ``(xmin, ymax)`` and is filled
    column by column, each column running from top to bottom. Row and column
    counts are rounded up, so the grid may extend past ``xmax`` and below
    ``ymin``. No spatial reference is assigned to the output layer.

    Source / documentation:
    https://varunpant.com/posts/how-to-create-fishnets-or-geospatial-grids/

    Args:
        outputGridfn: Path of the shapefile to create. An existing dataset at
            this path is deleted first.
        xmin, xmax, ymin, ymax: Extent to cover; each is converted with float().
        gridHeight: Cell height (note that it precedes ``gridWidth``).
        gridWidth: Cell width.

    Raises:
        ValueError: If ``gridHeight`` or ``gridWidth`` is not a positive number.
        RuntimeError: If the output data source cannot be created.
    """
    # Accept strings (e.g. from sys.argv) as well as numbers
    xmin = float(xmin)
    xmax = float(xmax)
    ymin = float(ymin)
    ymax = float(ymax)
    gridWidth = float(gridWidth)
    gridHeight = float(gridHeight)

    # Written as "not (> 0)" so that NaN is rejected as well
    if not (gridWidth > 0 and gridHeight > 0):
        raise ValueError('gridWidth and gridHeight must be positive numbers')

    rows = ceil((ymax-ymin)/gridHeight)
    cols = ceil((xmax-xmin)/gridWidth)

    # create output file
    outDriver = ogr.GetDriverByName('ESRI Shapefile')
    if os.path.exists(outputGridfn):
        # Let the driver delete the dataset so the sidecar files
        # (.shx, .dbf, ...) are removed together with the .shp
        outDriver.DeleteDataSource(outputGridfn)
    outDataSource = outDriver.CreateDataSource(outputGridfn)
    if outDataSource is None:
        raise RuntimeError('Could not create data source: {}'.format(outputGridfn))

    try:
        outLayer = outDataSource.CreateLayer(outputGridfn, geom_type=ogr.wkbPolygon)
        featureDefn = outLayer.GetLayerDefn()

        # x-envelope of the current column
        ringXleft = xmin
        ringXright = xmin + gridWidth

        for _ in range(cols):
            # every column restarts at the top edge
            ringYtop = ymax
            ringYbottom = ymax - gridHeight

            for _ in range(rows):
                # closed ring: the first corner is repeated as the last point
                ring = ogr.Geometry(ogr.wkbLinearRing)
                ring.AddPoint(ringXleft, ringYtop)
                ring.AddPoint(ringXright, ringYtop)
                ring.AddPoint(ringXright, ringYbottom)
                ring.AddPoint(ringXleft, ringYbottom)
                ring.AddPoint(ringXleft, ringYtop)
                poly = ogr.Geometry(ogr.wkbPolygon)
                poly.AddGeometry(ring)

                # add new geom to layer
                outFeature = ogr.Feature(featureDefn)
                outFeature.SetGeometry(poly)
                outLayer.CreateFeature(outFeature)
                outFeature.Destroy()  # was referenced without being called

                # move the envelope one cell down
                ringYtop = ringYtop - gridHeight
                ringYbottom = ringYbottom - gridHeight

            # move the envelope one cell to the right
            ringXleft = ringXleft + gridWidth
            ringXright = ringXright + gridWidth
    finally:
        # Close the data source even if writing a feature fails
        outDataSource.Destroy()


In [3]:
# Wrapper that builds a square-celled fishnet over the extent of a set of
# quadrat polygons.

def run_fisnettool(inPoly, outTmpFn, inCrs='EPSG:26910', gridDistance=0.02):
    """Create a fishnet covering the bounding box of ``inPoly``.

    Args:
        inPoly: Object exposing ``total_bounds`` as (minx, miny, maxx, maxy);
            called in this file with a subset of a GeoDataFrame.
        outTmpFn: Output filename handed to ``fisnettool``.
        inCrs: Accepted for symmetry with the callers; not used here.
        gridDistance: Cell edge length, used for both height and width, in the
            units of the ``inPoly`` coordinates.
    """
    # total_bounds is ordered (minx, miny, maxx, maxy)
    west, south, east, north = inPoly.total_bounds

    # fisnettool expects xmin, xmax, ymin, ymax, then cell height and width
    fisnettool(outTmpFn, west, east, south, north, gridDistance, gridDistance)
    
    
  

In [4]:
# Quadrat outline polygons; the 'Quadrat' column is used below to select transects
transectGridsFn = 'E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\Transect_grids.shp'
gdfPoly = gpd.read_file(transectGridsFn)

In [5]:
# Build one fishnet per transect; this cell handles transect 3
quadratsTr3 = gdfPoly.loc[gdfPoly['Quadrat'].isin(['3_1','3_2','3_3'])]
run_fisnettool(
    inPoly=quadratsTr3,
    outTmpFn='E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_fishnetgrid_tr3.shp',
    inCrs='EPSG:26910',
    gridDistance=0.02,
)

In [6]:
# Fishnet for transect 1
quadratsTr1 = gdfPoly.loc[gdfPoly['Quadrat'].isin(['1_1','1_2','1_3'])]
run_fisnettool(
    inPoly=quadratsTr1,
    outTmpFn='E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_fishnetgrid_tr1.shp',
    inCrs='EPSG:26910',
    gridDistance=0.02,
)

In [7]:
# Fishnet for transect 2 (quadrats 2_1 through 2_10)
namesTr2 = ['2_1','2_2','2_3','2_4','2_5','2_6','2_7','2_8','2_9','2_10']
quadratsTr2 = gdfPoly.loc[gdfPoly['Quadrat'].isin(namesTr2)]
run_fisnettool(
    inPoly=quadratsTr2,
    outTmpFn='E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_fishnetgrid_tr2.shp',
    inCrs='EPSG:26910',
    gridDistance=0.02,
)

### Train-Val split of absent-class samples using Fishnets

In [7]:
def sample_with_fishnet(pixelPointsFn, fishnetsList, inCrs, outFn, species,
                        seeds=(4, 7, 9, 19, 54, 83, 101, 103, 115, 120)):
    """Write one train/test-labelled copy of the absent-class points per seed.

    Every point is tagged with the fishnet cell it falls in. For each seed,
    one point per cell is drawn at random and labelled 'test'; all remaining
    points (including those outside every cell) are labelled 'train'.

    Args:
        pixelPointsFn: Point file of absent-class pixels (selected in
            ArcGIS Pro and converted to points).
        fishnetsList: Paths of fishnet polygon files; each must provide an
            'FID' column.
        inCrs: CRS assigned to any input that has none, and passed to
            ``to_file`` when writing.
        outFn: Output path template with two ``{}`` placeholders, filled with
            ``species`` and the seed.
        species: 'T' gives class 'majorityT'; any other value 'majorityH'.
        seeds: Random seeds; one output file is written per seed.

    Returns:
        None. Results are written to disk.
    """
    def _read(fn):
        # read_file has no documented ``crs`` argument (whether an extra one is
        # ignored or rejected depends on the I/O engine), so assign the CRS
        # explicitly, and only when the file does not define one.
        gdf = gpd.read_file(fn)
        if gdf.crs is None:
            gdf = gdf.set_crs(inCrs)
        return gdf

    gdfPoly = _read(pixelPointsFn)

    fishnets = []
    for i, fishnetFn in enumerate(fishnetsList):
        gdfFishnet = _read(fishnetFn)
        # Prefix with the file position so cell IDs stay unique across fishnets
        gdfFishnet['ID'] = str(i)+'_'+gdfFishnet['FID'].astype(str)
        fishnets.append(gdfFishnet)
    gdfFishnet = pd.concat(fishnets, axis=0)

    # Tag every point with the ID of the fishnet cell it intersects
    gdfPoly = gdfPoly.sjoin(gdfFishnet.loc[:,['ID', 'geometry']], how='left')
    gdfPoly = gdfPoly.drop(columns='index_right')

    className = 'majorityT' if species == 'T' else 'majorityH'

    for seed in seeds:
        # One random point per fishnet cell; rows without an ID are never drawn
        picked = gdfPoly.groupby('ID').sample(n=1, random_state=seed)
        picked['split'] = 'test'

        # Carry the 'test' label back to the full point set via a spatial join.
        # NOTE: this join adds an 'index_right' column that is written out too.
        sampleDf = gdfPoly.sjoin(picked.loc[:,['split','geometry']], how='left')

        # Plain assignment: a chained ``fillna(..., inplace=True)`` does not
        # update the frame under pandas Copy-on-Write
        sampleDf['split'] = sampleDf['split'].fillna('train')
        sampleDf['class'] = className

        sampleDf.to_file(outFn.format(species, str(seed)), crs=inCrs)
    

In [None]:
# Creates 10 independent, randomly split point datasets (one per seed) for species 'T'
fishnetFiles = [
    'E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_quadrats\\_fishnetgrid_tr1.shp',
    'E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_quadrats\\_fishnetgrid_tr2.shp',
    'E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_quadrats\\_fishnetgrid_tr3.shp',
]
test = sample_with_fishnet(
    "E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_quadrats\\QuadratsRas2PolyT.shp",
    fishnetFiles,
    inCrs='EPSG:26910',
    outFn="E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_samples\\_majority_sp{}_seed{}.shp",
    species='T',
)

### Derive spectral values for present-class samples

In [57]:
# Specify paths

# Present-class training points and the raster their band values are read from
# NOTE(review): EPSG:26911 is set here while 'EPSG:26910' is passed elsewhere in this file; confirm which is intended
minorityFn = "E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\Training_samples_both_species2.shp"
gdfMinority = gpd.read_file(minorityFn).set_crs('EPSG:26911')
rasPath = "D:\\TEMP\\_output\\_tif\\_Manning_Invasives\\Struc_Spec_1cm_near_fill.tif"

In [58]:
# Function for extracting raster band values at point locations

def deriveMinSamples(minPolys, rasPath, bandNames=None):
    """Sample a raster at each point and append the band values as columns.

    Args:
        minPolys: GeoDataFrame of point geometries (the x/y of each geometry
            is read, so polygons are not supported despite the name).
        rasPath: Path of the raster to sample.
        bandNames: Column names for the sampled bands, one per raster band.
            Defaults to the eight band names used in this project.

    Returns:
        ``minPolys`` and the sampled values concatenated column-wise. Both
        frames go through ``reset_index()``, so each former index is kept as
        a column in the result.
    """
    if bandNames is None:
        bandNames = ['1_red','1_green','1_blue','2_blue','2_green','3_red','4_re','2_nir']

    # Pair up the x/y coordinates of every point
    xIn = minPolys['geometry'].x
    yIn = minPolys['geometry'].y
    coords = list(zip(xIn, yIn))

    # Only the pixels under the points are sampled; the full raster is no
    # longer read into memory first (that array was never used).
    with rasterio.open(rasPath, 'r') as src:
        valDf = pd.DataFrame(list(src.sample(coords)), columns=bandNames)

    samplesDf = pd.concat([minPolys.reset_index(), valDf.reset_index()], axis=1)

    return samplesDf

In [61]:
# Attach the sampled raster values to the present-class points
minorityDf = deriveMinSamples(minPolys=gdfMinority, rasPath=rasPath)

### Train-Val split of present-class samples using Quadrats

In [228]:
# select by quadrat

def split_t_v(df, species, random_state, test_size=0.25):
    """Split the samples of one species into train/test, quadrat by quadrat.

    Args:
        df: Frame with at least the columns 'Class' and 'Quadrat'.
        species: Value of 'Class' to keep; also written to the new 'class'
            column.
        random_state: Seed forwarded to ``train_test_split``.
        test_size: Test share forwarded to ``train_test_split``.

    Returns:
        The rows of the selected species from every quadrat holding at least
        two samples, with added 'split' ('train'/'test') and 'class' columns
        and any 'index' column removed.

    Raises:
        ValueError: If no quadrat has at least two samples of ``species``.
    """
    df = df.loc[df['Class']==species]

    out = []

    for quadrat in df['Quadrat'].unique():
        rows = df.loc[df['Quadrat']==quadrat]

        # A quadrat with fewer than two samples cannot be split; skip it
        # rather than passing an empty frame to train_test_split.
        if len(rows) < 2:
            continue

        # Split DataFrame using sklearn
        X_tr, X_t = train_test_split(rows,
                                     test_size=test_size,
                                     random_state=random_state)

        for part, label in ((X_tr, 'train'), (X_t, 'test')):
            # reset_index/drop return new frames, so the column assignments
            # below never write into a slice of the input
            part = part.reset_index(drop=True).drop(columns='index', errors='ignore')
            part['split'] = label
            part['class'] = species
            out.append(part)

    if not out:
        raise ValueError(
            'No quadrat has at least 2 samples of class {!r}; nothing to split'.format(species))

    return pd.concat(out, axis=0)

In [233]:
# Run split train-test

def run_split_t_v(df, species, outFn, seeds=(4, 7, 9, 19, 54, 83, 101, 103, 115, 120)):
    """Write one train/test split of ``df`` per seed for a single species.

    Args:
        df: Samples handed to ``split_t_v``.
        species: Class value to split; also inserted into the output name.
        outFn: Output path template with two ``{}`` placeholders, filled with
            ``species`` and the seed.
        seeds: Random seeds; one output file is written per seed.
    """
    for seed in seeds:
        # Use the frame that was passed in (previously the global minorityDf
        # was read here and the ``df`` argument was ignored)
        samplesDf = split_t_v(df, species=species, random_state=seed)
        samplesDf = samplesDf.drop(columns=['Join_Count','TARGET_FID','Id'])
        samplesDf.to_file(outFn.format(species, str(seed)))

In [234]:
# Creates 10 independent, randomly split point datasets for each species
minorityOutFn = "E:\\Sync\\_Documents\\_Letter_invasives\\_Data\\_samples\\_minority_sp{}_seed{}.shp"
for speciesCode in ('T', 'H'):
    run_split_t_v(minorityDf, speciesCode, minorityOutFn)