In [1]:
# Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

# Plotting
import matplotlib.pyplot as plt

# Data science
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold

# Geospatial
import contextily as cx
from shapely.geometry import Point, Polygon
import xarray as xr
import rasterio.features
import rasterio as rio

# API
import requests
import json

# Import Planetary Computer
import fsspec
import stackstac
import pystac
import pystac_client
import planetary_computer

# Other
import os
from itertools import cycle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Path to the tab-separated occurrence table read by the loading cells.
dataset_path = "training_data/occurrence.txt"
# Country code of interest.
# NOTE(review): not referenced anywhere in the visible notebook; get_species()
# carries its own chosenCountry="AU" default — confirm whether this is still needed.
country = "AU"

In [4]:
def main():
    """Outline of the prediction flow: load occurrences, build features, predict, plot.

    NOTE(review): get_Dataset, getFinalFeatures, model, plot_prob and plot_graphs
    are not defined in the visible part of this notebook — confirm they exist
    before calling this function.
    """
    # Occurrence records (presumably already restricted to Australia — TODO confirm)
    occurrences = get_Dataset()

    # Ecological features for each occurrence location; these play the role of x_test
    features = getFinalFeatures(occurrences.longitude, occurrences.latitude)

    # Model output for every feature row
    predictions = model.predict(features)

    # Plots:
    # 1. occurrence probabilities
    # 2. species occurrences against geographical features
    plot_prob(predictions, features)
    plot_graphs(predictions, features)


    


In [3]:
# Load the raw tab-separated occurrence table, parsing eventDate as datetimes.
# NOTE: the name `df` is rebound to the one-hot-encoded frame in a later cell.
df = pd.read_csv(dataset_path, sep='\t', parse_dates=['eventDate'])

In [4]:
# Number of records per species in the table currently bound to `df`
df['species'].value_counts()

Crinia signifera             126657
Litoria fallax                47332
Crinia glauerti                9393
Ranoidea australis             4097
Agalychnis callidryas          2212
Dendrobates auratus            1718
Xenopus laevis                 1139
Chiromantis xerampelina         702
Austrochaperina pluvialis       541
Name: species, dtype: int64

In [5]:
def get_species(dataset_path, year_range=(2000,2022), bbox=None, chosenCountry="AU"):
    """Returns the dataframe of frog occurrences for one country, year range and bounding box.

    Arguments:
    dataset_path -- path to the tab-separated occurrence file
    year_range -- inclusive (first_year, last_year) applied to eventDate
    bbox -- optional (min_lon, min_lat, max_lon, max_lat); when given, only
            occurrences inside the box (edges included) are kept
    chosenCountry -- value of countryCode to keep

    Returns a new dataframe holding only the columns eventDate, countryCode,
    decimalLatitude, decimalLongitude and species. The row index of the
    source file is preserved.
    """
    columns = [
        'eventDate','countryCode','decimalLatitude','decimalLongitude','species'
    ]
    # Only parse the columns that are returned; the occurrence file may carry many more.
    species = pd.read_csv(dataset_path, sep='\t', usecols=columns, parse_dates=['eventDate'])

    # Rows with a missing eventDate have a NaN year and therefore fail the range test.
    event_year = species['eventDate'].dt.year
    keep = (
        (species['countryCode'] == chosenCountry)
        & event_year.between(year_range[0], year_range[1])
    )

    # The bbox argument was previously accepted but never applied.
    if bbox is not None:
        min_lon, min_lat, max_lon, max_lat = bbox
        keep &= (
            species['decimalLongitude'].between(min_lon, max_lon)
            & species['decimalLatitude'].between(min_lat, max_lat)
        )

    # .copy() hands the caller an independent frame that is safe to add columns to.
    species = species.loc[keep, columns].copy()
    print("Shape of occurrence dataset: ",species.shape)
    return species

In [15]:
def get_terraclimate(bbox, metrics, time_slice=('2000-01-01','2022-02-01'), assets=None, features=None, interp_dims=(1024,1024), verbose=True):
    """Returns terraclimate metrics for a given area, allowing results to be interpolated onto a larger image.

    Arguments:
    bbox -- Tuple of (min_lon, min_lat, max_lon, max_lat) to define area
    metrics -- Nested dictionary in the form {<metric_name>:{'fn':<metric_function>,'params':<metric_kwargs_dict>}, ... }
    time_slice -- Tuple of datetime strings to select data between, e.g. ('2015-01-01','2019-12-31'); None keeps all times
    assets -- list of terraclimate assets to take; None keeps every variable
    features -- list of asset metrics to take, specified by strings in the form '<asset_name>_<metric_name>';
                None keeps every computed band
    interp_dims -- Tuple of dimensions (n, m) to interpolate results to; None skips interpolation
    verbose -- when True, print the opened dataset, progress messages and the result shape

    Returns an xarray DataArray whose 'band' coordinate holds the '<asset>_<metric>' labels,
    with spatial coordinates named 'y' and 'x'.

    Raises ValueError if metrics is empty.
    """
    # Fail before the remote read rather than after it.
    if not metrics:
        raise ValueError("metrics must contain at least one entry")

    min_lon, min_lat, max_lon, max_lat = bbox

    collection = pystac.read_file("https://planetarycomputer.microsoft.com/api/stac/v1/collections/terraclimate")
    asset = collection.assets["zarr-https"]
    store = fsspec.get_mapper(asset.href)
    data = xr.open_zarr(store, **asset.extra_fields["xarray:open_kwargs"])
    if verbose:
        print(data)

    # Select datapoints that overlap region.
    # The lat slice runs max -> min, matching the original selection.
    region = dict(lon=slice(min_lon, max_lon), lat=slice(max_lat, min_lat))
    if time_slice is not None:
        region['time'] = slice(time_slice[0], time_slice[1])
    data = data.sel(**region)
    if assets is not None:
        data = data[assets]

    if verbose:
        print('Loading data')
    data = data.rename(lat='y', lon='x').to_array().compute()

    # Calculate metrics: each one reduces the time dimension and yields one band per asset
    combined_values = []
    combined_bands = []
    for name, metric in metrics.items():
        if verbose:
            print(f'Calculating {name}')
        sum_data = xr.apply_ufunc(
            metric['fn'], data, input_core_dims=[["time"]], kwargs=metric['params'], dask = 'allowed', vectorize = True
        ).rename(variable='band')
        xcoords = sum_data.x
        ycoords = sum_data.y
        dims = sum_data.dims
        combined_values.append(sum_data.values)
        for band in sum_data.band.values:
            combined_bands.append(band+'_'+name)

    # Combine metrics along the leading (band) axis
    combined_values = np.concatenate(
        combined_values,
        axis=0
    )
    combined_data = xr.DataArray(
        data=combined_values,
        dims=dims,
        coords=dict(
            band=combined_bands,
            y=ycoords,
            x=xcoords
        )
    )

    # Take relevant bands. With features=None every band is kept; previously
    # None was passed straight to .sel().
    if features is not None:
        combined_data = combined_data.sel(band=features)
    if verbose:
        print(combined_data.shape)

    if interp_dims is not None:
        if verbose:
            print('Interpolating image')
        interp_coords = (np.linspace(bbox[0], bbox[2], interp_dims[0]), np.linspace(bbox[1], bbox[3], interp_dims[1]))
        combined_data = combined_data.interp(x=interp_coords[0], y=interp_coords[1], method='nearest', kwargs={"fill_value": "extrapolate"})

    return combined_data

In [6]:
# Occurrences returned by get_species() with its default country and year range
speciesData = get_species(dataset_path)

target_species = "Litoria fallax"

# Sequential row id; join_species() uses it to line sampled features up with rows
speciesData['key'] = range(len(speciesData))

# Bounding box enclosing every occurrence: (min_lon, min_lat, max_lon, max_lat)
min_lon, max_lon = speciesData['decimalLongitude'].min(), speciesData['decimalLongitude'].max()
min_lat, max_lat = speciesData['decimalLatitude'].min(), speciesData['decimalLatitude'].max()
bbox = (min_lon, min_lat, max_lon, max_lat)

speciesData.head()

Shape of occurrence dataset:  (148379, 5)


Unnamed: 0,eventDate,countryCode,decimalLatitude,decimalLongitude,species,key
0,2020-01-23 01:38:00,AU,-32.719457,152.159267,Litoria fallax,0
4,2021-10-29 13:57:00,AU,-26.714302,152.815096,Litoria fallax,1
7,2018-04-18 19:05:00,AU,-33.693144,151.320884,Litoria fallax,2
8,2009-07-28 17:16:00,AU,-27.888019,153.309342,Litoria fallax,3
16,2021-05-05 10:56:00,AU,-35.208964,138.480985,Crinia signifera,4


In [7]:
# Class balance after the country / year filtering
speciesData['species'].value_counts()

Crinia signifera             94848
Litoria fallax               43199
Crinia glauerti               7984
Ranoidea australis            2100
Austrochaperina pluvialis      248
Name: species, dtype: int64

In [9]:
# One indicator column per species, appended to the right of the occurrence columns
onehot = pd.get_dummies(speciesData['species'])
df = pd.concat((speciesData, onehot), axis='columns')
df.head()

Unnamed: 0,eventDate,countryCode,decimalLatitude,decimalLongitude,species,key,Austrochaperina pluvialis,Crinia glauerti,Crinia signifera,Litoria fallax,Ranoidea australis
0,2020-01-23 01:38:00,AU,-32.719457,152.159267,Litoria fallax,0,0,0,0,1,0
4,2021-10-29 13:57:00,AU,-26.714302,152.815096,Litoria fallax,1,0,0,0,1,0
7,2018-04-18 19:05:00,AU,-33.693144,151.320884,Litoria fallax,2,0,0,0,1,0
8,2009-07-28 17:16:00,AU,-27.888019,153.309342,Litoria fallax,3,0,0,0,1,0
16,2021-05-05 10:56:00,AU,-35.208964,138.480985,Crinia signifera,4,0,0,1,0,0


In [10]:
# Load the feature raster from disk.
# Presumably a saved result of get_terraclimate() for this bbox — TODO confirm
# how 'assets/weatherData_Aus.nc' was produced.
weather_data = xr.load_dataarray('assets/weatherData_Aus.nc')

In [52]:
# Reductions applied over the time dimension, keyed by the suffix they add to band names
tc_metrics = {
    label: {'fn': reducer, 'params': {}}
    for label, reducer in (
        ('mean', np.nanmean),
        ('min', np.nanmin),
        ('max', np.nanmax),
    )
}

# TerraClimate variables to pull
assets = ['tmax', 'tmin', 'ppt', 'soil']

# Bands kept for modelling, in the form '<asset>_<metric>': the mean of every asset
features = [f'{asset_name}_mean' for asset_name in assets]

weather_data = get_terraclimate(bbox, tc_metrics, assets=assets, features=features)
display(weather_data['band'].values)

<xarray.Dataset>
Dimensions:                 (time: 744, lat: 4320, lon: 8640, crs: 1)
Coordinates:
  * crs                     (crs) int16 3
  * lat                     (lat) float64 89.98 89.94 89.9 ... -89.94 -89.98
  * lon                     (lon) float64 -180.0 -179.9 -179.9 ... 179.9 180.0
  * time                    (time) datetime64[ns] 1958-01-01 ... 2019-12-01
Data variables: (12/18)
    aet                     (time, lat, lon) float32 dask.array<chunksize=(12, 1440, 1440), meta=np.ndarray>
    def                     (time, lat, lon) float32 dask.array<chunksize=(12, 1440, 1440), meta=np.ndarray>
    pdsi                    (time, lat, lon) float32 dask.array<chunksize=(12, 1440, 1440), meta=np.ndarray>
    pet                     (time, lat, lon) float32 dask.array<chunksize=(12, 1440, 1440), meta=np.ndarray>
    ppt                     (time, lat, lon) float32 dask.array<chunksize=(12, 1440, 1440), meta=np.ndarray>
    ppt_station_influence   (time, lat, lon) float32 dask

KeyboardInterrupt: 

In [11]:
def join_species(speciesData, data):
    """Samples the feature raster at every frog location and joins the values onto the frog data.

    Arguments:
    speciesData -- dataframe containing the response variable along with ["decimalLongitude", "decimalLatitude", "key"]
    data -- xarray dataarray of features, indexed with geocoordinates

    Returns speciesData inner-merged on "key" with one extra column per band.
    """
    # Both lookup vectors share the "key" dimension so that .sel() picks one
    # (x, y) pair per row instead of the full x-by-y grid.
    key_coord = {"key": speciesData.key}
    lon_lookup = xr.DataArray(speciesData.decimalLongitude, dims="key", coords=key_coord)
    lat_lookup = xr.DataArray(speciesData.decimalLatitude, dims="key", coords=key_coord)

    # Nearest grid cell to each occurrence
    sampled = data.rename('data').sel(x=lon_lookup, y=lat_lookup, method="nearest")

    # Long form: keep only the last column of the frame as 'val'
    long_form = sampled.to_dataframe()
    long_form = long_form.assign(val=lambda frame: frame.iloc[:, -1])[['val']]
    long_form = long_form.reset_index().drop_duplicates()

    # Wide form: one row per key, one column per band
    per_key = long_form.pivot(index="key", columns="band", values="val").reset_index()

    return speciesData.merge(per_key, on=['key'], how='inner')
    
# Attach the sampled climate features to every occurrence row
model_data = join_species(df, weather_data)
model_data.head()

Unnamed: 0,eventDate,countryCode,decimalLatitude,decimalLongitude,species,key,Austrochaperina pluvialis,Crinia glauerti,Crinia signifera,Litoria fallax,Ranoidea australis,ppt_mean,soil_mean,tmax_mean,tmin_mean
0,2020-01-23 01:38:00,AU,-32.719457,152.159267,Litoria fallax,0,0,0,0,1,0,102.10833,136.166672,23.354586,13.663752
1,2021-10-29 13:57:00,AU,-26.714302,152.815096,Litoria fallax,1,0,0,0,1,0,103.104164,129.78334,24.437502,13.463335
2,2018-04-18 19:05:00,AU,-33.693144,151.320884,Litoria fallax,2,0,0,0,1,0,97.5625,79.508331,23.269585,13.485835
3,2009-07-28 17:16:00,AU,-27.888019,153.309342,Litoria fallax,3,0,0,0,1,0,98.904167,90.849998,25.787504,15.002085
4,2021-05-05 10:56:00,AU,-35.208964,138.480985,Crinia signifera,4,0,0,1,0,0,37.445835,11.35,21.426668,11.767502


In [13]:
# Persist the joined table. index=False keeps the row index out of the file, so a
# plain read_csv() does not bring it back as an extra "Unnamed: 0" column that
# would end up among the predictors.
model_data.to_csv("assets/finalDataset_AllFrogs.csv", index=False)
# To skip the join on a later run, reload instead:
# model_data = pd.read_csv("assets/finalDataset_AllFrogs.csv", parse_dates=["eventDate"])

In [28]:
# One-hot target columns, one per species
s = ['Austrochaperina pluvialis','Crinia glauerti','Crinia signifera','Litoria fallax','Ranoidea australis']

# Report missing values per column, then drop incomplete rows
print(model_data.isna().sum())
model_data = model_data.dropna()

# Separate the predictor variables from the response.
# drop(columns=...) replaces the positional axis argument (`drop(labels, 1)`),
# which pandas 2.x no longer accepts.
X = model_data.drop(
    columns=['eventDate', 'decimalLatitude', 'decimalLongitude', 'species',
             'countryCode', 'key'] + s
)
y = model_data[s]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

eventDate                    0
countryCode                  0
decimalLatitude              0
decimalLongitude             0
species                      0
key                          0
Austrochaperina pluvialis    0
Crinia glauerti              0
Crinia signifera             0
Litoria fallax               0
Ranoidea australis           0
ppt_mean                     0
soil_mean                    0
tmax_mean                    0
tmin_mean                    0
dtype: int64


In [4]:
# Save the held-out predictors to disk (the row index is written as the first column).
x_test.to_csv("assets/frogTest.csv")

Random Forest classifier

In [18]:
# Fixed seed so the fitted forest, and the score below, are reproducible across runs
rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Arguments follow the (y_true, y_pred) convention. The target is one-hot, so a
# row only counts as correct when every indicator in it matches.
print("Accuracy: ",accuracy_score(y_test, y_pred))
# F1 on a multi-output target needs an explicit averaging mode:
# print("F1 Score: ", f1_score(y_test, y_pred, average="macro"))

Accuracy:  0.8458215201836101


In [23]:
# Show the one-hot targets as a plain array. y_test is deliberately not rebound,
# so this cell can be re-run and later cells still see the original DataFrame.
y_test.to_numpy()

array([[0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0]], dtype=uint8)

In [26]:
# Predicted indicator rows for x_test, shown for a side-by-side look at y_test.
y_pred

array([[0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0]], dtype=uint8)

In [37]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Collapse each one-hot row to a class index before building the matrix.
# np.asarray accepts y_test whether it is a DataFrame or already an ndarray.
# NOTE: a predicted row with no indicator set collapses to index 0 under argmax.
cm = confusion_matrix(
    np.asarray(y_test).argmax(axis=1),
    np.asarray(y_pred).argmax(axis=1),
)
cm

array([[   44,     0,     0,     7,     0],
       [    1,  1614,     3,     0,     0],
       [    1,     0, 16325,  2599,     0],
       [   11,     0,  1946,  6654,     0],
       [    0,     0,     0,     0,   423]], dtype=int64)

In [19]:
# Visualise the results in a confusion matrix
disp = ConfusionMatrixDisplay.from_estimator(rf, x_test, y_test, display_labels=['Absent', 'Present'], cmap='Blues')
disp.figure_.set_size_inches((7, 7))
disp.ax_.set_title('Random Forest Classifier')
plt.show()

ValueError: multilabel-indicator is not supported

In [38]:
import pickle

# The context manager closes the file handle once the model has been written.
with open("assets/randomforest_allfrog.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)