In [9]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os, sys
import itertools
import pickle
import h5py

# local imports
# Put the parent directory on sys.path so modules that live one level above this
# notebook can be imported; the membership check keeps re-runs from adding duplicates.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils import *
from gtv import *
from preprocessing import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# SST Observations

#### Download raw data [here](https://drive.google.com/drive/u/1/folders/1jQ3m8DI0m8Avl5dB3I2AacHuW1ZLWwpw)

The file 'SST_Pacific.mat' includes monthly observations of sea surface temperature (in Celsius) over the Pacific Ocean during 01/1940-12/2018, from the COBE SST v2 dataset.

Temperature over land is replaced with "NaN" values.

The Pacific has been defined as:
   - Latitudes: 60S-60N
   - Longitudes: 80E-280E

Specifically, the file 'SST_Pacific.mat' contains:

   - 3D matrix 'SST_Pacif': 121 by 201 by 948
   - vector 'lat': 121 by 1, the latitudinal points over the Pacific
   - vector 'lon': 201 by 1, the longitudinal points over the Pacific
   - 2D matrix 'dates': 948 by 2, the dates of the monthly observations
   
   

In [2]:
# load .mat file
# The file is opened with h5py, so it must be an HDF5-based .mat file.
# Open explicitly read-only: older h5py releases did not default to 'r', and the
# raw data must never be modified by this notebook.
monthly_atmo = {}
with h5py.File('../data/SST_pacific.mat', 'r') as f:
    for k, v in f.items():
        # materialise each dataset as an in-memory array before the file closes
        monthly_atmo[k] = np.array(v)

In [3]:
# flatten 3D array to 2D dataframe
sst, obs_lats, obs_lons = (
    monthly_atmo['SST_Pacif'],
    monthly_atmo['lat'][0],  # first row of the stored coordinate array
    monthly_atmo['lon'][0],
)
# NOTE(review): 79 and 1940 presumably are the number of years and the first year
# (948 months / 12 = 79, record starts 01/1940) - confirm against flatten_series.
flat_df = flatten_series(sst, obs_lats, obs_lons, 'monthly', 79, 1940)
# tag the variable and duplicate the temperature column under the generic name 'val'
flat_df = flat_df.assign(**{'var': 'sst', 'val': lambda frame: frame['temp']})

In [7]:
# drop nulls (over land)
df = flat_df.dropna()

# write the July-October subset of the cleaned observations to disk
summer_months = ['july', 'aug', 'sept', 'oct']
in_summer = df.month.isin(summer_months)
df.loc[in_summer].to_csv('../data/SST_pacific_summer.csv', index=False)

In [10]:
# separately detrend train/test periods
# NOTE(review): the observations are described as ending in 12/2018, so the 2019
# upper bound is presumably harmless - confirm against detrend_and_scale.
train_years = (1940, 1989)
test_years = (1990, 2019)
X_train, fts_obs = detrend_and_scale(df, *train_years)
X_test, _ = detrend_and_scale(df, *test_years)  # feature table from the test period is discarded

# LENS

#### Download raw data [here](https://drive.google.com/drive/u/1/folders/1ddwOm4wIt6A8-_y9KxI0OdSTqTamhK8i)

The file 'surf_temp.mat' includes monthly data of surface temperature (in Kelvin) over the Pacific Ocean during 01/1920-12/2005, for all 40 ensemble members from LENS.

The Pacific has been defined as:

   - Latitudes: 60S-60N
   - Longitudes: 80E-280E
   
Specifically, the file 'surf_temp.mat' contains:

   - 4D matrix 'TS': 128 by 161 by 1032 by 40
   - vector 'lat': 128 by 1, the latitudinal points over the Pacific
   - vector 'lon': 161 by 1, the longitudinal points over the Pacific

In [11]:
# load .mat file
# The path must not be tied to one user's home directory. By default the file is
# expected alongside the other inputs in ../data; set the SURF_TEMP_PATH
# environment variable to read it from somewhere else.
surf_temp_path = os.environ.get('SURF_TEMP_PATH', os.path.join('..', 'data', 'surf_temp.mat'))
surf_temp = {}
with h5py.File(surf_temp_path, 'r') as f:
    for k, v in f.items():
        # materialise each dataset as an in-memory array before the file closes
        surf_temp[k] = np.array(v)

In [12]:
# Inspect the axis order of the loaded array; the description above lists 'TS'
# as 128 by 161 by 1032 by 40, so compare against the shape displayed here.
surf_temp['TS'].shape

(40, 1032, 161, 128)

First, we interpolate each LENS trajectory onto the same latitude/longitude grid as the observations.

In [13]:
from scipy.interpolate import RegularGridInterpolator

# coordinate vectors of the LENS grid (first row of each stored coordinate array)
lats = surf_temp['lat'][0].copy() #128
lons = surf_temp['lon'][0].copy() #161

# define grid to interpolate: axes are (time index, lon, lat)
x, y = lons, lats
z = np.arange(surf_temp['TS'].shape[1])

# create grid to interpolate onto
# Vectorised equivalent of itertools.product(z, obs_lons, obs_lats): with
# indexing='ij' the last axis varies fastest, so rows come out in the same order
# without first building a Python list of tens of millions of tuples.
pts = np.stack(
    np.meshgrid(z, obs_lons, obs_lats, indexing='ij'), axis=-1
).reshape(-1, 3)

# zero-filled output buffer, one slab per trajectory on the observation grid;
# the trajectory count is read from the data instead of being hard-coded
n_trajectories = surf_temp['TS'].shape[0]
iLENS = np.zeros((n_trajectories, z.shape[0], obs_lons.shape[0], obs_lats.shape[0]))

In [14]:
# iterate through trajectories and interpolate
target_shape = (z.shape[0], obs_lons.shape[0], obs_lats.shape[0])
for i in range(40):
    if i % 10 == 0:
        print(i)
    # data cube of the i-th trajectory on the original LENS grid
    lens = surf_temp['TS'][i]
    # bounds_error=False together with fill_value=None lets scipy extrapolate
    # for requested points that fall outside the source grid
    rgi = RegularGridInterpolator(
        (z, x, y),
        lens,
        bounds_error=False,
        fill_value=None,
    )
    # evaluate on the observation grid and restore the (time, lon, lat) layout
    ilens = rgi(pts).reshape(*target_shape)
    iLENS[i] = ilens

0
10
20
30


In [18]:
# Flatten every interpolated trajectory, keep only July-October, tag each row with
# its trajectory id, and concatenate once at the end.
# - DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
#   every iteration; collecting the pieces and calling pd.concat once avoids both.
# - .assign on the filtered rows returns a new frame, so no SettingWithCopyWarning.
# - loop-local names are used so the observation `flat_df` is not overwritten.
summer_months = ['july', 'aug', 'sept', 'oct']
trajectory_frames = []
for i in range(iLENS.shape[0]):
    if i > 0 and i % 10 == 0:
        print(i)
    traj_df = flatten_lens(iLENS[i], obs_lats, obs_lons, 1920)
    traj_df = traj_df.loc[traj_df.month.isin(summer_months)].assign(trajectory=i)
    trajectory_frames.append(traj_df)

lens_df = pd.concat(trajectory_frames, ignore_index=True)

10
20
30


In [26]:
# Detrend/scale each trajectory independently over 1940-1989 and stack the results
# row-wise. The blocks are collected in a list and stacked once, since calling
# np.vstack inside the loop re-copies the accumulated matrix on every iteration.
trajectory_ids = np.sort(lens_df.trajectory.unique())
X_blocks = []
for i in trajectory_ids:
    if i > 0 and i % 10 == 0: print('finished ', i)
    lensi = lens_df[lens_df.trajectory == i]
    Xi, fts_i = detrend_and_scale(lensi, 1940, 1989)
    if not X_blocks:
        # keep the feature table returned for the first trajectory, as before
        fts_lens = fts_i
    X_blocks.append(Xi)
Xlens = np.vstack(X_blocks)
print('done')

finished 10
finished 20
finished 30
done


#### Overlap LENS and Obs 

Land points were removed from the observations but not from LENS, so we keep only the features (grid points) present in both datasets for consistency.

In [114]:
# store indices and merge
# Record each table's own row index as a column, then merge on the columns the
# two feature tables have in common (pd.merge default).
fts_lens['lens_ix'] = fts_lens.index
fts_obs['obs_ix'] = fts_obs.index
fts = pd.merge(fts_lens, fts_obs)


# remove some additional land
northeast_block = (fts.lon > 250) & (fts.lat > 40)
lon_75 = fts.lon == 75
point_a = (fts.lat == -25) & (fts.lon == 145)
point_b = (fts.lat == 35) & (fts.lon == 115)
fts = fts[~(northeast_block | lon_75 | point_a | point_b)]

In [117]:
# Write the design matrices restricted to the shared features, then the feature table.
lens_cols, obs_cols = fts.lens_ix, fts.obs_ix
exports = [
    (Xlens, lens_cols, '../data/Xlens.csv'),
    (X_train, obs_cols, '../data/X_train.csv'),
    (X_test, obs_cols, '../data/X_test.csv'),
]
for matrix, cols, out_path in exports:
    pd.DataFrame(matrix[:, cols]).to_csv(out_path, index=False)
fts.to_csv('../data/sst_cols.csv', index=False)