# *WORKING file:* Species abundance model
### Description:
Create a tutorial for a marine species abundance model in python. This notebook was created at the PACE Hackweek 2025. Our use-case will be with [NOAA NEFSC bottom trawl](https://www.fisheries.noaa.gov/new-england-mid-atlantic/science-data/2025-spring-bottom-trawl-survey-completed-northeast) data, specifically longfin squid.


![longfin squid!](https://img.freepik.com/free-vector/hand-drawn-squid-illustration_23-2149560574.jpg?semt=ais_hybrid&w=740&q=80)
### Authors: 
* Haley Synan (NOAA Fisheries/IBSS)
* Artem Dzhulai (URI)
* Sajna Hussain
* Natalie McCourt (UMBC)
### History:
>*8/4/25: notebook initialized*

# STEPS
* [x] Get fisheries data
* [ ] Choose explanatory variables
* [ ] Match data
* [ ] Run model (xgboost) 

In [1]:
import datetime
import os
import re
from pathlib import Path

import earthaccess
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr
from matplotlib.ticker import FuncFormatter
from scipy import odr, stats

In [2]:
# Get home directory
os.getcwd()

'/home/jovyan/artem/proj_2025_sdm/contributors/artem'

In [3]:
# Load Fisheries Data
df = pd.read_csv('/home/jovyan/artem/proj_2025_sdm/data/fisheries_with_pace_rrs_avw2.csv')
df.head(2)

Unnamed: 0,TOWDATETIME_EST,LAT,LON,MEAN_DEPTH,SWEPT_AREA_km,acadian redfish,alewife,alligatorfish,american lobster,american plaice,...,Rrs_707,Rrs_708,Rrs_709,Rrs_711,Rrs_712,Rrs_713,Rrs_714,Rrs_717,Rrs_719,Rrs_brightness
0,2024-03-07 10:58:00,38.659194,-74.828083,24,0.021282,0.0,0.0,0.0,0.0,0.0,...,0.000851,0.000825,0.000797,0.000765,0.000738,0.000717,0.000689,0.000552,0.00054,1.090205
1,2024-03-07 14:32:00,38.498596,-74.477395,40,0.024311,0.0,0.0,0.0,0.0,0.0,...,0.000386,0.000378,0.000364,0.000344,0.000333,0.000325,0.000315,0.000229,0.000261,0.907649


In [4]:
auth = earthaccess.login()

In [5]:
tspan = ("2024-03-07", "2024-05-13")
bbox = (-76.75, 33.00, -63.00, 46.00) # west south east north

In [7]:
# Search for OCI Chl data
results = earthaccess.search_data(
    short_name="PACE_OCI_L3M_CHL",
    granule_name="*.DAY.*0p1deg*",
    #granule_name="*.8.*4km*",
    bounding_box=bbox,
    temporal=tspan
)
len(results)

64

In [8]:
paths = earthaccess.open(results)
paths

QUEUEING TASKS | :   0%|          | 0/64 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/64 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/64 [00:00<?, ?it/s]

[<File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240307.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 <File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240308.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 <File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240309.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 <File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240310.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 <File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240311.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 <File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240312.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 <File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240315.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 <File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240316.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 <File-like object S3FileSystem, ob-cumulus-prod-public/PACE_OCI.20240317.L3m.DAY.CHL.V3_0.chlor_a.0p1deg.nc>,
 

In [11]:
dataset_PACE = xr.open_mfdataset(paths, combine="nested", concat_dim="date")
dataset_PACE

Unnamed: 0,Array,Chunk
Bytes,1.54 GiB,2.00 MiB
Shape,"(64, 1800, 3600)","(1, 512, 1024)"
Dask graph,1024 chunks in 193 graph layers,1024 chunks in 193 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.54 GiB 2.00 MiB Shape (64, 1800, 3600) (1, 512, 1024) Dask graph 1024 chunks in 193 graph layers Data type float32 numpy.ndarray",3600  1800  64,

Unnamed: 0,Array,Chunk
Bytes,1.54 GiB,2.00 MiB
Shape,"(64, 1800, 3600)","(1, 512, 1024)"
Dask graph,1024 chunks in 193 graph layers,1024 chunks in 193 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,48.00 kiB,768 B
Shape,"(64, 3, 256)","(1, 3, 256)"
Dask graph,64 chunks in 193 graph layers,64 chunks in 193 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 48.00 kiB 768 B Shape (64, 3, 256) (1, 3, 256) Dask graph 64 chunks in 193 graph layers Data type uint8 numpy.ndarray",256  3  64,

Unnamed: 0,Array,Chunk
Bytes,48.00 kiB,768 B
Shape,"(64, 3, 256)","(1, 3, 256)"
Dask graph,64 chunks in 193 graph layers,64 chunks in 193 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray


In [12]:
var = 'chlor_a'

def match_nearest(df, ds, var, new_name=var, date=None):
    """
    PURPOSE: 
        Loop through each row of a dataframe and find the nearest neighbor matchup from a xarray dataset
    INPUTS:
        df (pandas dataframe): dataframe (base coordinates to match to)
        ds (xarray dataset): xarray dataset to extract data from
        var (str): variable name that you are matching (ex. chla or avw or Rrs_665)
        new_name (str): Optional. variable name to add to original dataframe (ex chla_PACE). Default is original variable name
        date (): Optional. Dates from the dataframe. NOTE the dtype of dataframe date and dtype of xarray date must be the same! 
    RETURNS:
        df (pandas dataframe): original dataframe updated with a new column (new_name)
    HISTORY:
        2024: originally written (adapted from turtle track code)
        8/5/25: updated for PACE HACKWEEK
    """  
    try:
        df = df.rename(columns={'LAT':'lat','LON':'lon'})
        ds = ds
    except:
        pass
    try:
        d = []
        for i in range(0, len(df)):
            # Crop the dataset to include data that corresponds to track locations
            cropped_ds = ds[var].sel(time=df.date[i],
                                           lat=df.lat[i],
                                           lon=df.lon[i],
                                           method='nearest'
                                           )
            d.append(cropped_ds.values)
        df.insert(0,new_name,d)
        return df
    except:
        d = []
        for i in range(0, len(df)):
            # Crop the dataset to include data that corresponds to track locations
            cropped_ds = ds[var].sel(lat=df.lat[i],
                                           lon=df.lon[i],
                                           method='nearest'
                                           )
            d.append(cropped_ds.values)
        df.insert(0,new_name,d)
        return df

In [13]:
d8=[]
for file in paths: 
    d = xr.open_dataset(file)
    d8.append(d.attrs['time_coverage_start'])

ds = xr.open_mfdataset(paths, combine='nested',concat_dim='datetime').assign_coords({'time':d8})
ds = ds.rename({'datetime':'time'})
ds = ds.where((ds.lat > 34.40918) & (ds.lat < 46.362305) & (-63>ds.lon) & (-77< ds.lon),drop=True)
ds['time']=[pd.to_datetime(d) for d in ds.time.values]
df['time'] = [pd.to_datetime(d.replace(' ','T')) for d in df.TOWDATETIME_EST]
ds['time'] = [pd.to_datetime(time.values) for time in ds.time]

var = 'chlor_a'

match_nearest(df, ds, var, 'chlor_a', df.time)

Unnamed: 0,chlor_a,TOWDATETIME_EST,lat,lon,MEAN_DEPTH,SWEPT_AREA_km,acadian redfish,alewife,alligatorfish,american lobster,...,Rrs_708,Rrs_709,Rrs_711,Rrs_712,Rrs_713,Rrs_714,Rrs_717,Rrs_719,Rrs_brightness,time
0,"[nan, nan, nan, 2.5567453, nan, nan, nan, 3.30...",2024-03-07 10:58:00,38.659194,-74.828083,24,0.021282,0.0,0.0,0.0,0.0,...,0.000825,0.000797,0.000765,0.000738,0.000717,0.000689,0.000552,0.000540,1.090205,2024-03-07 10:58:00
1,"[nan, nan, nan, nan, nan, nan, nan, 1.060982, ...",2024-03-07 14:32:00,38.498596,-74.477395,40,0.024311,0.0,0.0,0.0,0.0,...,0.000378,0.000364,0.000344,0.000333,0.000325,0.000315,0.000229,0.000261,0.907649,2024-03-07 14:32:00
2,"[nan, nan, nan, nan, nan, nan, nan, 1.0083534,...",2024-03-07 17:44:00,38.411974,-74.502885,39,0.023835,0.0,0.0,0.0,0.0,...,0.000364,0.000350,0.000330,0.000318,0.000310,0.000292,0.000210,0.000254,0.935354,2024-03-07 17:44:00
3,"[nan, nan, nan, nan, nan, nan, nan, 2.1293998,...",2024-03-07 20:29:00,38.505238,-74.832046,25,0.021510,0.0,0.0,0.0,1.0,...,0.000653,0.000632,0.000606,0.000588,0.000574,0.000553,0.000430,0.000434,1.050127,2024-03-07 20:29:00
4,"[nan, nan, nan, nan, nan, nan, nan, 2.1293998,...",2024-03-07 22:45:00,38.556784,-74.897495,21,0.019174,0.0,0.0,0.0,27.0,...,0.001268,0.001221,0.001175,0.001134,0.001093,0.001046,0.000848,0.000791,1.327782,2024-03-07 22:45:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",2024-05-12 12:30:00,41.305711,-70.636614,23,0.018435,0.0,0.0,0.0,0.0,...,0.000275,0.000266,0.000253,0.000246,0.000243,0.000237,0.000181,0.000218,0.529729,2024-05-12 12:30:00
364,"[nan, 0.9144166, nan, 1.6463482, 1.4178369, na...",2024-05-12 18:45:00,40.467061,-71.381195,73,0.022568,0.0,2.0,0.0,2.0,...,0.000155,0.000150,0.000142,0.000140,0.000137,0.000138,0.000097,0.000150,0.446814,2024-05-12 18:45:00
365,"[nan, nan, nan, 0.7010292, 1.2878964, nan, nan...",2024-05-13 01:19:00,39.618178,-72.027289,219,0.024705,0.0,0.0,0.0,0.0,...,0.000104,0.000102,0.000094,0.000093,0.000094,0.000095,0.000057,0.000104,0.450264,2024-05-13 01:19:00
366,"[nan, nan, nan, 1.0399542, 1.1839269, nan, nan...",2024-05-13 03:42:00,39.762143,-72.213598,98,0.020737,0.0,1.0,0.0,0.0,...,0.000143,0.000141,0.000134,0.000134,0.000133,0.000130,0.000068,0.000124,0.446797,2024-05-13 03:42:00
