In [None]:
%matplotlib inline
import xarray as xr
import matplotlib.pyplot as plt
import matplotlib.patches as patches

In [None]:
# Open the SST anomalies file; the path is relative to the notebook's working directory.
sst_path = '../datasets/elnino/cci_sst_anomalies_1981_2018.nc'
sst_dataset = xr.open_dataset(sst_path)
# Bare last expression so the notebook renders the dataset summary.
sst_dataset

In [None]:
# Time coordinate, kept as a module-level name because later plotting code indexes it.
timedata = sst_dataset['time']
# The anomaly variable that the rest of the notebook works with.
sst_anomaly = sst_dataset['sst_anomaly']
# Bare last expression so the notebook renders the variable summary.
sst_anomaly

In [None]:
# Restrict area to the Nino 3.4 box: lat in [-5, 5], lon in [190, 240].
# NOTE(review): these label slices assume an ascending `lat` coordinate and a
# 0-360 `lon` convention -- confirm against the dataset.
nino34 = sst_anomaly.sel(lat=slice(-5, 5), lon=slice(190, 240))

# Compute Nino 3.4 index: average over the box, then take a 5-step rolling mean
# along time. dropna removes the time steps whose rolling mean is NaN (such as
# the leading steps where the window is still incomplete).
nino34_timeseries = nino34.mean('lat').mean('lon')
nino34_index = nino34_timeseries.rolling(time=5).mean().dropna("time")

# Plot Nino 3.4 index timeseries on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 4))
nino34_index.plot(ax=ax)

# Plot threshold lines spanning the full time extent of the index
time_index = nino34_index.get_index('time')
start_time, end_time = time_index[0], time_index[-1]
for level, style in ((0.4, 'dashed'), (0, 'solid'), (-0.4, 'dashed')):
    ax.hlines(level, start_time, end_time, colors='black', linestyles=style)
ax.set_xlabel('date', fontsize=12)
# Plain Unicode literal; same text as decoding the UTF-8 bytes b'Ni\xc3\xb1o-3.4 index'.
ax.set_ylabel('Ni\u00f1o-3.4 index', fontsize=12)
plt.show()

In [None]:
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Generate history dataset: row i holds the `timelength` consecutive index
# values starting at time step i, so the result has shape
# (n_samples, timelength).
tmin = 1
timelength = 12
tmax = tmin + timelength
# Derive the series length from the data instead of hard-coding it, so the
# lagged windows stay equally long if the input period changes.
n_times = nino34_index.sizes['time']
n_samples = n_times - tmax
nino34_hist = np.stack(
    [nino34_index.isel(time=slice(t0, t0 + n_samples)).values for t0 in range(timelength)],
    axis=1,
)
nino34_hist.shape

In [None]:
X = nino34_hist
# Targets: one index value per history row, starting `tmax` steps into the
# series. The stop is derived from the number of rows in X (rather than a
# hard-coded series length) so X and B always have the same number of samples.
B = nino34_index.isel(time=slice(tmax, tmax + X.shape[0]))

# Chronological split: the first `train_size` fraction of samples is used for
# training, the remainder for testing (no shuffling).
train_size = 0.5
nsplit = int(round(X.shape[0] * train_size))
train_X, test_X = X[:nsplit], X[nsplit:]
train_B, test_B = B[:nsplit], B[nsplit:]

In [None]:
# Report the array shape of the training split, the test split and the full set.
for features in (train_X, test_X, X):
    print(features.shape)

In [None]:
# Create the linear model and fit it on the training data in a single step;
# `fit` returns the estimator itself, so `model_lr` is the fitted model.
model_lr = LinearRegression().fit(train_X, train_B)
# Bare last expression so the cell still displays the fitted estimator.
model_lr

In [None]:
def plot_subgrid(f, X, B, tmax, train_size=0.75, samplesize=4):
    """Plot the predictions ``f(X)`` and the true values ``B`` against time.

    Parameters
    ----------
    f : callable
        Takes the feature array ``X`` and returns one prediction per sample
        (for example a fitted model's ``predict`` method).
    X : array
        Feature array; passed to ``f`` unchanged.
    B : array
        True target values, one per sample, drawn on the same time axis as
        the predictions.
    tmax : int
        Start offset of the time axis: the x values are the module-level
        ``timedata[tmax + 4:]``.
    train_size, samplesize
        Accepted so existing calls keep working; not used by the plot.

    Notes
    -----
    Reads the module-level ``timedata``, which must be defined before the
    function is called. ``f(X)`` and ``B`` must have as many entries as
    ``timedata[tmax + 4:]``.
    """
    Bpred = f(X)
    # NOTE(review): the extra offset of 4 presumably accounts for the time
    # steps dropped by the 5-step rolling mean of the index -- confirm.
    times = timedata[tmax + 4:]

    plt.figure(figsize=(20, 6))
    # Only the first panel of a 2x2 grid is used, as in the original layout.
    ax = plt.subplot(2, 2, 1)
    ax.plot(times, Bpred, label='predicted')
    ax.plot(times, B, label='truth')
    # The two lines carry labels, so draw the legend that makes them readable.
    ax.legend()
    ax.set_ylabel(r'$B_1$', fontsize=15)
    ax.set_xlabel('time', fontsize=15)
    plt.show()
    
# Compare the linear model's predictions with the truth over the whole sample set.
plot_subgrid(model_lr.predict, X, B, tmax, train_size=train_size)

In [None]:
from sklearn.metrics import r2_score

# Score the linear model on the held-out test split using R^2.
Bpred = model_lr.predict(test_X)
R2_lr = r2_score(y_true=test_B, y_pred=Bpred)
print(f"Score for linear regression: {R2_lr}")