In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

tqdm.pandas()
from PyEMD import CEEMDAN, Visualisation

## Notes
The final program will likely be a CLI with arguments. Some of the options should be:
- Input data
- Output data
- Whether to interpolate over missing data (initially linear)
- Whether to reject components which are primarily noise (noting that this may not be valid with CEEMDAN)
    - Apriori or aposteriori test? 
- Tolerance when matching components by frequency
- Time range for training/testing/prediction data
- Noise values

In [None]:
# Load data
# Pull the three source tables through the project's loader helpers:
#   shore_df - supplies the 'y' column that is decomposed as the 'output' series
#   hind_df  - every column is decomposed as its own series
#   pca_df   - every column is decomposed as its own series
#              (presumably principal components of the SLP data - confirm against load_SLP)
# TODO - note that we're not interpolating over gaps here
from data_processing.bridge import load_shorecast, load_hindcast, load_SLP

shore_df = load_shorecast()
hind_df = load_hindcast()
pca_df = load_SLP()

In [None]:
# Build the work list of timeseries to decompose.
# Every entry is a (label, source df, source column) tuple; the label is
# also used as the stem of the cached IMF file name.

# The target signal comes first...
series = [('output', shore_df, 'y')]
# ...followed by one entry per hindcast column, then one per SLP/PCA column,
# each labelled with its own column name.
series += [(name, hind_df, name) for name in hind_df.columns]
series += [(name, pca_df, name) for name in pca_df.columns]

In [None]:
# Set up CEEMD driver
# NR: handed to the CEEMDAN constructor in the decomposition cell; intended as
#     the number of noise realisations (trials) per decomposition.
NR = 100
# ns: candidate noise values on a half-open grid (start 0.1, stop 0.5, step 0.05).
#     Only ns[0] is currently used, as CEEMDAN's `epsilon`.
ns = np.arange(0.1, 0.5, .05)

In [None]:
# Individually decompose each signal with CEEMDAN.
# Results are cached as one CSV per series under `folder`; a series whose CSV
# already exists is skipped, so delete the file to force a recompute.
folder = 'output/imfs'
os.makedirs(folder, exist_ok=True)

for label, df, col in series:
    file = f'{folder}/{label}.csv'
    if os.path.exists(file):
        print(f'Skipping {label}')
        continue

    print(f'Decomposing {label}')
    # The constructor keyword is `trials`. The earlier spelling `trails` is not a
    # CEEMDAN parameter, so it fell into **kwargs and NR was never applied (the
    # library default was used instead).
    ceemd = CEEMDAN(trials=NR, epsilon=ns[0], processes=8, parallel=True)
    imfs = ceemd.ceemdan(df[col].to_numpy(), df.index.to_numpy(), progress=True)
    # `imfs` holds one component per row; transpose so each component becomes a
    # column aligned with the original index.
    imfs_df = pd.DataFrame(imfs.T, index=df.index)
    imfs_df.to_csv(file)

In [None]:
# Read every cached IMF table back from disk, keyed by series label.
# The first CSV column is restored as the index.
all_imfs = {
    label: pd.read_csv(f'{folder}/{label}.csv', index_col=0)
    for label, *_ in series
}

In [None]:
## Reject components which are primarily noise
# Note that this might not be valid when applied to the result of CEEMDAN (rather than EMD), so it should be optional
from PyEMD.checks import whitenoise_check

# Iterate over a snapshot because entries of all_imfs are replaced inside the loop.
for label, imf_df in list(all_imfs.items()):
    print(f'Checking {label}')
    # whitenoise_check expects one component per row, hence the transpose.
    sig = whitenoise_check(imf_df.to_numpy().T)  # , test_name='apriori')
    if sig is None:
        # No verdict was returned (PyEMD does this for empty input or input
        # containing NaN), so there is nothing to reject: keep the table as-is.
        print(f'No white-noise result for {label}; keeping all components')
        continue

    # A value of 0 marks a component as not significant (i.e. noise).
    rejects = [k for k, v in sig.items() if v == 0]
    print(f'Rejecting: {rejects}')
    # Keys are 1-based component numbers. Map them to columns by position rather
    # than by label so the lookup stays correct even if columns were already
    # dropped (e.g. when this cell is re-run).
    reject_cols = [imf_df.columns[k - 1] for k in rejects]
    all_imfs[label] = imf_df.drop(columns=reject_cols)

In [None]:
## Match up components in input/output by frequency

In [None]:
## Linear regression of components to output signal

In [None]:
## Recreate output signal