## Project

In [3]:
import pandas as pd
import numpy as np
import requests
import os
import xarray as xr
from netCDF4 import Dataset
import tempfile
from scipy.spatial.distance import cdist
from tqdm import tqdm
from math import radians, cos, sin, asin, sqrt

### 1. Data Collection

#### 1.1 Biomass data

Spatio-temporal data for walleye pollock in the Gulf of Alaska extracted from the NOAA Fisheries website: https://apps-st.fisheries.noaa.gov/dismap/DisMAP.html#single-species-distributions

In [12]:
wp_gulf_alaska = pd.read_csv("wp_gulf_alaska.csv")
wp_gulf_alaska['Year'] = wp_gulf_alaska['Year'].astype('int64')
wp_gulf_alaska.head()

Unnamed: 0,HaulID,Stratum,LAT,LON,Depth,Year,wtcpue
0,021-198403-001,350,55.98767,-134.59517,346,1984,0.0
1,021-198403-002,251,55.632,-134.229,225,1984,4.7482
2,021-198403-003,151,55.2145,-133.88733,117,1984,1.0413
3,021-198403-006,151,55.133,-133.871,141,1984,1.345
4,021-198403-007,151,55.1255,-134.00167,170,1984,0.203


Columns:
- HaulID / Stratum: characterizes the survey expedition and catch throw.
- LAT / LON: Latitude and Longitude of survey catch
- Depth: Sea depth measured in meters below sea level.
- Year: No other timestamp is provided. However, documentation specify that all surveys have been conducted during summer months.
- wtcpue: survey catch measured in kg per ha.


#### 1.2 Oceanographic data

We web-scrape temperature and salinity geospatial data from HyCOM (Hybrid Coordinate Ocean Model) using their NCSS request URL. Since there is too much data, we have decided to only get a sample from the first day of each summer month (July 1st, August 1st and September 1st). Of course, including more points should improve our model. Then again, we don't have a timestamp associated with each survey, just the year.

We will extract data from this geographic region:
- North limit: 60.32
- South limit: 52.41
- West limit: -170
- East limit: -132.5

Unfortunately, we do not have data prior to 1995. 

### Note: to run the below, create a temp folder in the project working directory

In [5]:
# Define the list of years and months
year_list = [1996, 1999, 2003, 2005, 2007, 2009, 2011, 2013, 2015, 2017, 2019]
month_list = [7, 8, 9]
this_folder = os.getcwd()
data_folder = this_folder + '\\temp'

# create data folder if it doesn't exists 
if not os.path.exists(data_folder):
    os.mkdir(data_folder)

# Create an empty DataFrame to store the results
df_all = pd.DataFrame()

# Loop over each desired year and month, and add a progress bar
for year in year_list:
    for month in tqdm(month_list, desc=f'{year}'):
        # Define the date string
        date_str = f'{year}-{month:02d}-01T09:00:00Z'
        # Define the URL with the fixed parameters
        if year <=2015:
            url = "https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_53.X/data/"+str(year)+"?var=salinity_bottom&var=water_temp_bottom&north=60.32&west=-170&east=-132.5&south=52.41&horizStride=1&vertCoord=&accept=netcdf4"
        elif year == 2017:
            url = "https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_57.7?var=salinity_bottom&var=water_temp_bottom&north=60.32&west=-170&east=-132.5&south=52.41&horizStride=1&vertCoord=&accept=netcdf4"
        else:
            url = "https://ncss.hycom.org/thredds/ncss/GLBv0.08/expt_93.0/ts3z?var=salinity_bottom&var=water_temp_bottom&north=60.32&west=-170&east=-132.5&south=52.41&horizStride=1&vertCoord=&accept=netcdf4"
        
        my_file = os.path.join(this_folder, 'temp', f'example-{date_str[:10]}.nc4')
        # Add the date parameter to the URL
        url_day = f'{url}&time={date_str}'
        # Download the data and save it to a file
        downloaded_obj = requests.get(url_day)
        with open(my_file, "wb") as file:
            file.write(downloaded_obj.content)
            del downloaded_obj
        # Open the NetCDF file and convert it to a pandas DataFrame
        ds = xr.open_dataset(my_file)
        df_temp = ds.to_dataframe().reset_index()
        # Append the DataFrame to the overall DataFrame
        df_all = pd.concat([df_all, df_temp], ignore_index=True)

1996: 100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [01:59<00:00, 39.81s/it]
1999: 100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [00:13<00:00,  4.56s/it]
2003: 100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.27s/it]
2005: 100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.27s/it]
2007: 100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [00:30<00:00, 10.25s/it]
2009: 100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [00:45<00:00, 15.15s/it]
2011: 100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [00:12<00:00,  4.09s/it]
2013: 100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [00:14<00:00,  4.77s/it]
2015: 100%|█████████████████████████████

In [8]:
df_all["Year"] = df_all['time'].dt.strftime('%Y')

In [10]:
df_mean = (df_all
           .groupby(['lat', 'lon', 'Year'])
           [['salinity_bottom', 'water_temp_bottom']]
           .mean()
           .reset_index())
df_mean['Year'] = df_mean['Year'].astype('int64')
df_mean.head()

Unnamed: 0,lat,lon,Year,salinity_bottom,water_temp_bottom
0,52.400002,-170.0,1996,34.245998,3.425332
1,52.400002,-170.0,1999,34.206333,3.580333
2,52.400002,-170.0,2003,34.135334,3.458333
3,52.400002,-170.0,2005,34.184338,3.537999
4,52.400002,-170.0,2007,34.262669,3.203666


#### 1.3 Merging the two data sets

In [24]:
def dist(lat1, long1, lat2, long2):
    """
    https://medium.com/analytics-vidhya/finding-nearest-pair-of-latitude-and-longitude-match-using-python-ce50d62af546
    """
    # convert decimal degrees to radians 
    lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])
    # haversine formula 
    dlon = long2 - long1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

def find_nearest(lat, long, df, target):
    distances = df.apply(lambda row: dist(lat, long, row['lat'], row['lon']), axis=1)
    return df.loc[distances.idxmin(), target]

In [26]:
# test with a subset
wp_19 = wp_rounded[wp_rounded['Year'] == 2019]
df_mean_19 = df_mean[df_mean['Year'] == 2019]

In [27]:
# cell not running
wp_19['salinity_bottom'] = wp_19.apply(lambda row: find_nearest(row['LAT'], row['LON'], df_mean_19, 'salinity_bottom'), axis=1)
wp_19['water_temp_bottom'] = wp_19.apply(lambda row: find_nearest(row['LAT'], row['LON'], df_mean_19, 'water_temp_bottom'), axis=1)

wp_19.head()

KeyboardInterrupt: 