## Project

In [1]:
import pandas as pd
import numpy as np
import requests
import os
import xarray as xr
from netCDF4 import Dataset
import tempfile
from scipy.spatial.distance import cdist
from tqdm import tqdm

### 1. Data Collection

#### 1.1 Biomass data

Spatio-temporal data for walleye pollock in the Gulf of Alaska extracted from the NOAA Fisheries website: https://apps-st.fisheries.noaa.gov/dismap/DisMAP.html#single-species-distributions

In [2]:
# Read the walleye pollock survey export (Gulf of Alaska) from the working directory
wp_gulf_alaska = pd.read_csv(filepath_or_buffer="wp_gulf_alaska.csv")

In [3]:
# Preview the first five survey rows
wp_gulf_alaska.head(n=5)

Unnamed: 0,HaulID,Stratum,LAT,LON,Depth,Year,wtcpue
0,021-198403-001,350,55.98767,-134.59517,346,1984,0.0
1,021-198403-002,251,55.632,-134.229,225,1984,4.7482
2,021-198403-003,151,55.2145,-133.88733,117,1984,1.0413
3,021-198403-006,151,55.133,-133.871,141,1984,1.345
4,021-198403-007,151,55.1255,-134.00167,170,1984,0.203


Columns:
- HaulID / Stratum: characterize the survey expedition and catch throw.
- LAT / LON: latitude and longitude of the survey catch.
- Depth: sea depth, measured in meters below sea level.
- Year: no other timestamp is provided. However, the documentation specifies that all surveys were conducted during the summer months.
- wtcpue: survey catch (weight-based catch per unit effort), measured in kg per hectare (kg/ha).


#### 1.2 Oceanographic data

We download bottom temperature and salinity geospatial data from HYCOM (Hybrid Coordinate Ocean Model) through its NCSS (NetCDF Subset Service) request URL. Because the full data set is too large, we only take a sample from the first day of each summer month (July 1st, August 1st and September 1st). Including more dates should improve our model. Then again, the surveys do not carry a full timestamp, just the year.

We will extract data from this geographic region:
- North limit: 60.32
- South limit: 52.41
- West limit: -170
- East limit: -132.5

Unfortunately, we do not have data prior to 1995. 

In [4]:
# Download HYCOM bottom salinity / bottom temperature snapshots for every
# (year, month) pair below and stack them into one long DataFrame, `df_all`.
# Each snapshot is requested for the 1st of the month at 09:00 UTC, saved as a
# NetCDF file under ./temp, and then flattened with xarray.

# Survey years to fetch and the months sampled within each year
year_list = [1996, 1999, 2003, 2005, 2007, 2009, 2011, 2013, 2015, 2017, 2019]
month_list = [7, 8, 9]
this_folder = os.getcwd()

# The NetCDF files are written to ./temp; create it so the cell also runs on a
# fresh checkout (open() would otherwise raise FileNotFoundError).
download_dir = os.path.join(this_folder, 'temp')
os.makedirs(download_dir, exist_ok=True)

# Pieces shared by every request: service root and the fixed query string
# (requested variables, bounding box, output format).
ncss_root = "https://ncss.hycom.org/thredds/ncss/GLBv0.08/"
ncss_query = "?var=salinity_bottom&var=water_temp_bottom&north=60.32&west=-170&east=-132.5&south=52.41&horizStride=1&vertCoord=&accept=netcdf4"

# Seconds to wait on the server before giving up instead of hanging the kernel
request_timeout = 300

# One DataFrame per downloaded file; concatenated once after the loop so the
# accumulated data is not re-copied on every iteration.
frames = []

# Loop over each desired year and month, with one progress bar per year
for year in year_list:
    # The dataset path depends only on the year, so build the URL once per year
    if year <= 2015:
        dataset_path = "expt_53.X/data/" + str(year)
    elif year == 2017:
        dataset_path = "expt_57.7"
    else:
        dataset_path = "expt_93.0/ts3z"
    url = ncss_root + dataset_path + ncss_query

    for month in tqdm(month_list, desc=f'{year}'):
        # Define the date string
        date_str = f'{year}-{month:02d}-01T09:00:00Z'
        my_file = os.path.join(download_dir, f'example-{date_str[:10]}.nc4')
        # Add the date parameter to the URL
        url_day = f'{url}&time={date_str}'

        # Download the data; fail loudly on an HTTP error rather than saving an
        # error page as a .nc4 file that xarray cannot read.
        response = requests.get(url_day, timeout=request_timeout)
        response.raise_for_status()
        with open(my_file, "wb") as file:
            file.write(response.content)
        del response  # release the downloaded payload before parsing the file

        # Open the NetCDF file, convert it to a DataFrame, and close the file
        # handle again as soon as the data has been read.
        with xr.open_dataset(my_file) as ds:
            df_temp = ds.to_dataframe().reset_index()
        frames.append(df_temp)

# Combine all snapshots; fall back to an empty frame if nothing was requested
df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

1996: 100% 3/3 [00:06<00:00,  2.22s/it]
1999: 100% 3/3 [00:12<00:00,  4.02s/it]
2003: 100% 3/3 [00:03<00:00,  1.33s/it]
2005: 100% 3/3 [00:03<00:00,  1.18s/it]
2007: 100% 3/3 [00:04<00:00,  1.41s/it]
2009: 100% 3/3 [00:09<00:00,  3.08s/it]
2011: 100% 3/3 [00:09<00:00,  3.30s/it]
2013: 100% 3/3 [00:10<00:00,  3.48s/it]
2015: 100% 3/3 [00:11<00:00,  3.86s/it]
2017: 100% 3/3 [00:01<00:00,  1.57it/s]
2019: 100% 3/3 [01:16<00:00, 25.48s/it]


In [5]:
# Add coordinates rounded to two decimals, plus the year of each timestamp
# formatted as a four-digit string.
for source_col, rounded_col in (("lat", "LAT_rounded"), ("lon", "LON_rounded")):
    df_all[rounded_col] = df_all[source_col].round(2)
df_all["Year"] = df_all["time"].dt.strftime("%Y")

In [6]:
# Average salinity and temperature over all rows that share the same rounded
# coordinates and year, then store the year as an integer.
group_keys = ['LAT_rounded', 'LON_rounded', 'Year']
value_cols = ['salinity_bottom', 'water_temp_bottom']
df_mean = df_all.groupby(group_keys, as_index=False)[value_cols].mean()
df_mean = df_mean.astype({'Year': 'int64'})
df_mean.head()

Unnamed: 0,LAT_rounded,LON_rounded,Year,salinity_bottom,water_temp_bottom
0,52.4,-170.0,1996,34.245998,3.425332
1,52.4,-170.0,1999,34.206333,3.580333
2,52.4,-170.0,2003,34.135334,3.458333
3,52.4,-170.0,2005,34.184338,3.537999
4,52.4,-170.0,2007,34.262669,3.203666


In [7]:
# Save the aggregated oceanographic table next to the notebook
# (the DataFrame index is written too, as to_csv does by default).
df_mean.to_csv(path_or_buf='oceanographic_data.csv')

#### 1.3 Merging the two data sets

In [8]:
# Derive a copy of the survey table with coordinates rounded to two decimals
# and an int64 year; the original frame is left untouched.
wp_rounded = wp_gulf_alaska.assign(
    LAT_rounded=lambda hauls: hauls["LAT"].round(2),
    LON_rounded=lambda hauls: hauls["LON"].round(2),
    Year=lambda hauls: hauls["Year"].astype("int64"),
)
wp_rounded.head()

Unnamed: 0,HaulID,Stratum,LAT,LON,Depth,Year,wtcpue,LAT_rounded,LON_rounded
0,021-198403-001,350,55.98767,-134.59517,346,1984,0.0,55.99,-134.6
1,021-198403-002,251,55.632,-134.229,225,1984,4.7482,55.63,-134.23
2,021-198403-003,151,55.2145,-133.88733,117,1984,1.0413,55.21,-133.89
3,021-198403-006,151,55.133,-133.871,141,1984,1.345,55.13,-133.87
4,021-198403-007,151,55.1255,-134.00167,170,1984,0.203,55.13,-134.0


In [9]:
# NOTE(review): exact-key merge on the rounded coordinates and year, left
# disabled by the author. Presumably too few survey positions coincide exactly
# with a rounded HYCOM grid point for this join to be useful — confirm, then
# either re-enable this cell or remove it.
#df_joined = wp_rounded.merge(df_mean, on=['LAT_rounded', 'LON_rounded', 'Year'])
#df_joined