In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import seaborn as sns
from scipy.interpolate import griddata
from plotting import plot_equatorial_pacific
from utils import read_files

The NCEI Marine dataset offers no way to narrow the download to a target area, i.e. the longitudes and latitudes of interest. Because the full dataset is very large, it was not used.

In [None]:
# Load the sample CSV of the NCEI Marine data to inspect its structure.
data = pd.read_csv('data/Marine_CSV_sample.csv')

In [None]:
# List the column names available in the Marine sample.
data.columns

In [None]:
# Show only the position, time and SST columns.
data[['Latitude', 'Longitude', 'Time of Observation', 'Sea Surface Temperature']]

Below is monthly data collected from various stations in a particular area; the area is given by the coordinates in the file name.

In [None]:
# Load the monthly station data for the area encoded in the file name.
# NOTE: this rebinds `data`, replacing the Marine sample loaded earlier.
data = pd.read_csv('data/40_130_30_140.csv')

In [None]:
# List the column names available in the station file.
data.columns

In [None]:
# Show only the station, date, position and SST columns.
data[['STATION', 'DATE', 'LATITUDE', 'LONGITUDE', 'SEA_SURF_TEMP']]

In [None]:
# Average SST for every (date, latitude, longitude) combination.
grouping_keys = ['DATE', 'LATITUDE', 'LONGITUDE']
data.groupby(grouping_keys)['SEA_SURF_TEMP'].mean()

In [None]:
# SST readings taken south of the equator (negative latitude).
data.loc[data['LATITUDE'] < 0, 'SEA_SURF_TEMP']

In [None]:
# Northernmost latitude present in the station file.
data['LATITUDE'].max()

Below is monthly data in NetCDF (.nc) format, downloaded from the https://neo.gsfc.nasa.gov/ website.

In [None]:
# reading the dataset for month of July 2024
# dataset = xr.open_dataset('data/AQUA_MODIS.20240701_20240731.L3m.MO.SST.sst.9km.nc')

In [None]:
# Open the monthly AQUA MODIS SST file (the file name indicates December 2010, 9 km grid).
dataset = xr.open_dataset('data/AQUA_MODIS/LANINA/AQUA_MODIS.20101201_20101231.L3m.MO.SST.sst.9km.nc')

In [None]:
# Inspect the variables contained in the opened dataset.
dataset.variables

The dataset has to be transformed in order to get the data needed.

In [None]:
# Flatten the gridded `sst` variable into a pandas table.
raw_data = dataset['sst'].to_dataframe()

# Keep the study window only: 20°S–20°N, and 130°E eastwards across the date
# line to 80°W. Longitudes are signed here, so the window is two ranges.
tidy_data = (
    raw_data
    .reset_index()  # turn the index levels into ordinary columns
    .loc[lambda grid: grid.lat.between(-20, 20)]
    .loc[lambda grid: grid.lon.between(-180, -80) | grid.lon.between(130, 180)]
    .dropna()  # discard rows with missing values
)

# rounding lats and long to 0.5 deg
def round_to_nearest_half(x):
    """Snap ``x`` to the nearest multiple of 0.5.

    Doubling turns half-steps into whole numbers that ``np.round`` can round
    directly; halving the result maps it back to the original scale.
    """
    doubled = x * 2
    return np.round(doubled) / 2

# Snap both coordinates onto the 0.5° grid.
for coord in ('lat', 'lon'):
    tidy_data[coord] = tidy_data[coord].apply(round_to_nearest_half)

# One averaged SST value per 0.5° cell, with lat/lon back as ordinary columns.
tidy_data = tidy_data.groupby(['lat', 'lon'])['sst'].mean().reset_index()

# The month covered by the file: parse the coverage start attribute and
# normalize the timestamp to midnight.
current_month = pd.to_datetime(dataset.attrs['time_coverage_start']).normalize()

# Label every row with that month.
tidy_data.index = pd.Index([current_month] * len(tidy_data))

In [None]:
# Round SST to one decimal place.
# NOTE(review): Series.round / np.round presumably looked "wrong" earlier because
# `sst` is stored as float32, where values such as 27.3 cannot be represented
# exactly — confirm with `tidy_data.sst.dtype`. Casting to float64 first lets the
# vectorised round give clean values without a per-element Python call.
tidy_data['sst'] = tidy_data['sst'].astype('float64').round(1)

In [None]:
# Re-express signed longitudes on a 0–360 scale: negative (western) values are
# shifted by 360, non-negative values are kept as they are.
lon_signed = tidy_data['lon']
tidy_data['lon'] = np.where(lon_signed >= 0, lon_signed, lon_signed + 360)

In [None]:
# Display the rows ordered by latitude, then longitude (tidy_data itself is not modified).
tidy_data.sort_values(['lat', 'lon'])

In [None]:
# Extract SST together with its `qual_sst` quality variable into one table.
combined_raw = dataset[['sst', 'qual_sst']].to_dataframe()

In [None]:
# Turn the index levels into ordinary columns.
combined_raw = combined_raw.reset_index()

In [None]:
# Count how often each quality value occurs, including missing values.
combined_raw.qual_sst.value_counts(dropna=False)

It is not explicitly documented which flag marks the most reliable observation; however, based on common conventions for quality flags, 0.0 typically indicates the highest-quality (most reliable) observation. For the purposes of this project, the quality of the data is considered sufficient.

In [None]:
# Display the dataset summary.
dataset

In [None]:
# Display the cleaned table.
tidy_data

Let's try to plot the data to see how it looks:

In [None]:
# Filled-contour map of SST over the equatorial Pacific, with the Niño 3.4 box.
fig, ax = plt.subplots(figsize=(30, 10))

# Interpolate the point values onto a regular 100x100 mesh for contouring.
lon_axis = np.linspace(tidy_data.lon.min(), tidy_data.lon.max(), 100)
lat_axis = np.linspace(tidy_data.lat.min(), tidy_data.lat.max(), 100)
lon_grid, lat_grid = np.meshgrid(lon_axis, lat_axis)
sst_grid = griddata(
    (tidy_data.lon, tidy_data.lat), tidy_data.sst, (lon_grid, lat_grid), method='linear'
)
filled = ax.contourf(lon_grid, lat_grid, sst_grid, cmap='jet', levels=20)
fig.colorbar(filled, ax=ax, label='Temperature')

# Longitudes are plotted on a 0–360 scale, so ticks past 180 are labelled as °W.
x_tick = np.arange(130, 290, 10)
x_label = [f'{x}°E' if x <= 180 else f'{360 - x}°W' for x in x_tick]

y_tick = np.arange(-20, 25, 5)
y_label = [f'{np.abs(y)}°S' if y < 0 else f'{np.abs(y)}°N' for y in y_tick]

# Niño 3.4 region (5°S–5°N, 170°W–120°W), i.e. 190–240 on the 0–360 scale.
box = plt.Rectangle((190, -5), 50, 10, linewidth=2, edgecolor='red', facecolor='none')
ax.add_patch(box)

# Equator reference line.
ax.axhline(y=0)

ax.set_xticks(x_tick)
ax.set_xticklabels(x_label)
ax.set_yticks(y_tick)
ax.set_yticklabels(y_label)

ax.set_xlim(130, 280)
ax.set_ylim(-20, 20)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title(f'Equatorial Distribution of SST for {current_month.month_name()}-{current_month.year}')
plt.show()

The anomalies observed in the NE and SW quadrants of the diagram are false measurements taken at the Earth's surface.
We need to extract the Python code that reads a dataset and converts it to a clean and tidy CSV into a function and, in addition, be able to read directories recursively and apply the transformation to all datasets.

The below code is written in a separate module 'utils.py'

In [None]:
# Collect the datasets found under the 2015 El Niño directory.
# NOTE(review): `extract_datasets` is not imported in the import cell (only
# `read_files` is imported from utils) — presumably it lives in utils.py; confirm
# and import it so this cell works after Restart & Run All.
result_list = extract_datasets('data/AQUA_MODIS/ELNINO/2015/')

In [None]:
# NOTE: rebinds `dataset` (earlier the opened xarray dataset) to the result of extract_datasets.
dataset = result_list

In [None]:
# Combine the extracted items (unpacked as positional arguments) into one dataframe.
# NOTE(review): `create_dataframe` is not imported in the import cell — presumably
# from utils.py; confirm. The first argument looks like an output name (an
# 'esno_2015.csv' file is read later in the notebook) — confirm.
df = create_dataframe('esno_2015', *dataset)

After combining the dataframes, I can extract each month's data using a 'for' loop and analyze it:

In [None]:
# Mean SST per index value (rows sharing a timestamp are grouped together).
for month, monthly in df.groupby(df.index):
    monthly_mean = monthly['sst'].mean()
    print(month, monthly_mean)

In [None]:
# Median SST per index value (rows sharing a timestamp are grouped together).
for month, monthly in df.groupby(df.index):
    monthly_median = monthly['sst'].median()
    print(month, monthly_median)

This is of no use. I need to filter the data for the Niño 3.4 region (5°N-5°S, 170°W-120°W).

In [None]:
# Niño 3.4 region: 5°S–5°N AND 170°W–120°W. Both conditions must hold, so the
# latitude and longitude masks are combined with `&`; using `|` would keep every
# row that satisfies either one.
# `% 360` maps signed longitudes (-170..-120) and 0–360 longitudes onto the same
# 190..240 range, so the filter works with either convention.
lon_east = df.lon % 360
df_enso_region = df[df.lat.between(-5, 5) & lon_east.between(190, 240)]

In [None]:
# Display the rows selected for the Niño 3.4 region.
df_enso_region

In [None]:
# Mean (rounded to 2 decimals) and median SST of the selected region, per index value.
for month, monthly in df_enso_region.groupby(df_enso_region.index):
    mean_sst = monthly['sst'].mean().round(2)
    median_sst = monthly['sst'].median()
    print(month, f'Mean: {mean_sst}, Median: {median_sst}')

Let's move the plotting of the equatorial Pacific Ocean into a function.

In [None]:
from plotting import plot_equatorial_pacific

In [None]:
# Reload the saved dataframe from CSV; the first column becomes the index.
df = pd.read_csv('data/AQUA_MODIS_CSV/esno_2015.csv', index_col=0)
# The index is read back as plain values, so convert it to a DatetimeIndex.
df.index = pd.DatetimeIndex(df.index)

In [None]:
# Display the reloaded dataframe.
df

In [None]:
# Plot the reloaded data with the helper imported from the plotting module.
plot_equatorial_pacific(df)

Found out that my extracting function was not working as expected. Fixed it. The dataframe is now saved as expected.

Let's extract La Niña and see how it works:

In [None]:
# Collect the datasets found under the 2010 La Niña directory.
# NOTE(review): `extract_datasets` is not imported in the import cell (only
# `read_files` is imported from utils) — presumably it lives in utils.py; confirm.
lanina = extract_datasets('data/AQUA_MODIS/LANINA/2010/')

In [None]:
# Combine the extracted items into one dataframe; rebinds `lanina` from the
# extract_datasets result to that dataframe.
# NOTE(review): `create_dataframe` is not imported in the import cell — presumably
# from utils.py; confirm.
lanina = create_dataframe('lanina_2010', *lanina)

In [None]:
# Plot the La Niña data with the helper imported from the plotting module.
plot_equatorial_pacific(lanina)

The data before 2002 is provided by a different satellite. 
The data is in CSV format, without any indication of the latitudes and longitudes.

In [None]:
# Load the headerless AVHRR grid (the file name suggests October 1997, 720x360
# cells) and build coordinate axes in 0.5° steps: 360 latitudes and 720 longitudes.
# NOTE(review): this assumes the first row is 90°S and the first column is 180°W,
# and that each value marks the cell's lower/left edge — confirm against the data
# provider's row ordering before trusting the latitudes.
df = pd.read_csv('data/AQUA_MODIS/ELNINO/1997/AVHRR_SST_M_1997-10-01_rgb_720x360.CSV', header=None)
lat = np.arange(-90, 90, 0.5)
lon = np.arange(-180, 180, 0.5)

In [None]:
# Label the columns with the longitude axis (the lengths must match).
df.columns = lon

In [None]:
# Display the grid with longitude-labelled columns.
df

In [None]:
# Attach each row's latitude as a regular column so it can serve as the id column when reshaping.
df['lat'] = lat

In [None]:
# Reshape from wide (one column per longitude) to long: one row per (lat, lon, sst).
df = pd.melt(df, id_vars='lat', var_name='lon', value_name='sst')

In [None]:
# Keep the 20°S–20°N latitude band (bounds inclusive).
df = df.loc[df.lat.between(-20, 20)]

In [None]:
# Keep longitudes from 130°E across the date line to 80°W; with signed
# longitudes that is two inclusive ranges.
df = df.loc[df.lon.between(-180, -80) | df.lon.between(130, 180)]

In [None]:
# Renumber the rows from 0 after filtering, discarding the old index.
df = df.reset_index(drop=True)

In [None]:
# Display the filtered long-format table.
df

It is working as expected. Now to create a python function to read multiple files. Will modify the extract_database() function to read 