# Carbon and Plankton CorrelationThis notebook combines the carbon observations explored in `carbon_data_exploration.ipynb` with the plankton observations from `jellyfish_plankton_data_exploration.ipynb`.We will:1. Load and clean the plankton observation dataset.2. Derive the shared spatio-temporal domain covered by the plankton data.3. Query the GLODAPv2 carbon dataset (via ERDDAP) over that domain.4. Sample the carbon fields at each plankton observation.5. Calculate correlation statistics and visualize the relationships.

In [1]:
import pandas as pd

import numpy as np

import xarray as xr

from erddapy import ERDDAP

import matplotlib.pyplot as plt

import seaborn as sns


## 1. Load and tidy the plankton observationsWe reuse the same source file handled in the plankton exploration notebook.

In [2]:
plankton_raw = (
    pd.read_csv('../data/JeDI.csv')
      .replace('nd', pd.NA)
)

numeric_cols = [
    'year', 'month', 'day', 'lat', 'lon',
    'count_actual', 'density', 'density_integrated',
    'biovolume', 'biovolume_integrated',
    'weight_wet', 'weight_dry'
]

for col in numeric_cols:
    if col in plankton_raw.columns:
        plankton_raw[col] = pd.to_numeric(plankton_raw[col], errors='coerce')

plankton = plankton_raw.copy()

plankton['time'] = pd.to_datetime(
    dict(year=plankton['year'], month=plankton['month'], day=plankton['day']),
    errors='coerce'
)

plankton = plankton.dropna(subset=['lat', 'lon', 'time']).reset_index(drop=True)

plankton['present'] = plankton['presence_absence'].eq('present').astype(int)

plankton.head()


  pd.read_csv('../data/JeDI.csv')


Unnamed: 0,project_title,sub_project_title,owner_dataset,contact,location_name,date,year,month,day,time_local,...,biovolume,biovolume_integrated,weight_wet,weight_dry,presence_absence,study_type,accompanying_ancillary_data,catch_per_effort,time,present
0,Barium_MediterraneanSea,Barium_MediterraneanSea,Gorsky_G_Stemmann_L,Stemman_L,Northwest_Mediterranean_Sea_Mediterranean_Inte...,3/23/03,2003,3.0,23.0,12:14:00,...,,,,,absent,monitoring_project,no,,2003-03-23,0
1,Barium_MediterraneanSea,Barium_MediterraneanSea,Gorsky_G_Stemmann_L,Stemman_L,Northwest_Mediterranean_Sea_Mediterranean_Inte...,3/23/03,2003,3.0,23.0,12:14:00,...,,,,,absent,monitoring_project,no,,2003-03-23,0
2,Barium_MediterraneanSea,Barium_MediterraneanSea,Gorsky_G_Stemmann_L,Stemman_L,Northwest_Mediterranean_Sea_Mediterranean_Inte...,3/23/03,2003,3.0,23.0,12:14:00,...,,,,,absent,monitoring_project,no,,2003-03-23,0
3,Barium_MediterraneanSea,Barium_MediterraneanSea,Gorsky_G_Stemmann_L,Stemman_L,Northwest_Mediterranean_Sea_Mediterranean_Inte...,3/23/03,2003,3.0,23.0,12:14:00,...,,,,,absent,monitoring_project,no,,2003-03-23,0
4,Barium_MediterraneanSea,Barium_MediterraneanSea,Gorsky_G_Stemmann_L,Stemman_L,Northwest_Mediterranean_Sea_Mediterranean_Inte...,3/23/03,2003,3.0,23.0,12:14:00,...,,,,,absent,monitoring_project,no,,2003-03-23,0


## 2. Determine the shared domainThe ERDDAP query is limited to the spatial and temporal coverage of the plankton observations.

In [3]:
time_start = plankton['time'].min().strftime('%Y-%m-%dT00:00:00Z')
time_end = plankton['time'].max().strftime('%Y-%m-%dT23:59:59Z')

lat_min, lat_max = plankton['lat'].min(), plankton['lat'].max()
lon_min, lon_max = plankton['lon'].min(), plankton['lon'].max()

domain_summary = pd.Series({
    'time_start': time_start,
    'time_end': time_end,
    'lat_min': lat_min,
    'lat_max': lat_max,
    'lon_min': lon_min,
    'lon_max': lon_max,
})

domain_summary


time_start    1871-07-01T00:00:00Z
time_end      2011-09-03T23:59:59Z
lat_min                      -78.5
lat_max                      88.74
lon_min                     -180.0
lon_max                      180.0
dtype: object

## 3. Fetch the carbon fields from GLODAPv2We follow the same pattern used in the carbon exploration notebook, but constrain the query to the plankton domain.

In [5]:
e = ERDDAP(
    server='https://erddap.emodnet.eu/erddap',
    protocol='griddap'
)

e.dataset_id = 'GLODAPv2_2016b_CMEMS'

e.griddap_initialize()

e.variables = ['TCO2', 'TAlk', 'pHts25p0']

e.constraints['time>='] = time_start
e.constraints['time<='] = time_end

# Convert longitudes to 0-360 range expected by the dataset
e.constraints['latitude>='] = float(lat_min)
e.constraints['latitude<='] = float(lat_max)
e.constraints['longitude>='] = float((lon_min + 360) % 360)
e.constraints['longitude<='] = float((lon_max + 360) % 360)

carbon_ds = e.to_xarray()

carbon_ds


HTTPError: Error {
    code=404;
    message="Not Found: Your query produced no matching results. Query error: For variable=TCO2 axis#0=time Constraint=\"[(1871-07-01T00:00:00Z):1:(2011-09-03T23:59:59Z)]\": Start=\"1871-07-01T00:00:00Z\" is less than the axis minimum=2020-01-01T00:00:00Z (and even 1.562058432E9).";
}


## 4. Sample the carbon data at each plankton observationWe restrict the carbon dataset to the surface layer and then interpolate values at the plankton coordinates and timestamps.

In [None]:
carbon_surface = carbon_ds.isel(depth=0)

lon_normalized = (plankton['lon'] + 360) % 360

interp = carbon_surface.interp(
    time=('observation', plankton['time'].values),
    latitude=('observation', plankton['lat'].values),
    longitude=('observation', lon_normalized.values),
)

plankton_carbon = plankton.copy()

plankton_carbon['TCO2'] = interp['TCO2'].values
plankton_carbon['TAlk'] = interp['TAlk'].values
plankton_carbon['pHts25p0'] = interp['pHts25p0'].values

plankton_carbon = plankton_carbon.dropna(subset=['TCO2', 'density'])

plankton_carbon.head()


## 5. Correlation analysisWe calculate Pearson correlations between the carbon variables and selected plankton metrics.

In [None]:
metrics = ['density', 'density_integrated', 'biovolume', 'biovolume_integrated', 'present']
variables = ['TCO2', 'TAlk', 'pHts25p0']

available_cols = [col for col in metrics + variables if col in plankton_carbon.columns]

corr = plankton_carbon[available_cols].corr(method='pearson')

corr.loc[
    [v for v in variables if v in corr.index],
    [m for m in metrics if m in corr.columns],
]


### Pairwise relationshipsA seaborn pair-plot helps visualize the relationships that underpin the correlation values.

In [None]:
pairplot_cols = [col for col in variables + ['density', 'present'] if col in plankton_carbon.columns]

sns.pairplot(plankton_carbon[pairplot_cols].dropna(), kind='reg', corner=True)

plt.suptitle('Carbon vs Plankton metrics', y=1.02)

plt.show()


## 6. Save merged dataset (optional)Saving the merged dataset allows further analysis or sharing of intermediate results.

In [None]:
plankton_carbon.to_csv('../data/plankton_with_carbon.csv', index=False)
