Pre-stored harmonics to fit tide data.

Data from https://tidesandcurrents.noaa.gov/waterlevels.html?id=9414290&units=standard&bdate=20190701&edate=20190801&timezone=GMT&datum=MLLW&interval=6&action=data

In [1]:
import math
import datetime
import pytz
import glob
import functools
import operator
import numpy
import pandas
import matplotlib.pyplot
import matplotlib.pylab
import seaborn
import sklearn.linear_model
import sklearn.metrics
import vtreat.cross_plan

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [2]:
# Load the pre-stored harmonics table and preview its first rows.
# NOTE(review): `harmonics` is not referenced again in the visible cells.
harmonics = pandas.read_csv('harmonics.csv')
harmonics.head()

Unnamed: 0,name,sname,speed,code,i1,i2,i3,i4,i5,i6,phi,nodal,in_hc4,in_hc7,in_hc37,in_hc60,in_hc114
0,aSa,aSa,0.041067,ZZAZZYZ,0,0,1,0,0,-1,0,z,False,False,False,False,False
1,Sa,Sa,0.041069,ZZAZZZZ,0,0,1,0,0,0,0,z,False,False,True,True,True
2,Ssa,Ssa,0.082137,ZZBZZZZ,0,0,2,0,0,0,0,z,False,False,True,True,True
3,Sta,Sta,0.123204,ZZCZZYY,0,0,3,0,0,-1,270,x,False,False,False,False,False
4,MSm,MSm,0.471521,ZAXAZZZ,0,1,-2,1,0,0,0,x,False,False,False,False,False


In [None]:
# Timestamp layout accepted by parse_date, e.g. '2001/01/01 00:00'.
date_fmt = '%Y/%m/%d %H:%M'
# Datetimes built in this notebook are tagged with UTC.
tz = pytz.utc
    
def parse_date(dtstr):
    """Parse a timestamp string into a timezone-aware datetime.

    Args:
        dtstr: text laid out as the module-level ``date_fmt``
            (``'%Y/%m/%d %H:%M'``).

    Returns:
        The parsed ``datetime.datetime`` tagged with the module-level ``tz``.

    Raises:
        ValueError: if ``dtstr`` does not match ``date_fmt``.
    """
    d0 = datetime.datetime.strptime(dtstr, date_fmt)
    # pytz zones should be attached with localize(); replace(tzinfo=...) is
    # only correct for UTC and picks a wrong offset for most other pytz zones.
    return tz.localize(d0)

# Reference instants:
#   base_date_time  - instant the parse sanity-check cell measures seconds from
#   first_date_time - earliest timestamp kept in `tides`
#   cut_date_time   - rows before it are training data, rows on/after it are test data
# Passing a pytz zone through tzinfo= is fine here only because tz is UTC.
base_date_time = datetime.datetime(2001, 1, 1, tzinfo=tz)
first_date_time = datetime.datetime(2019, 6, 1, tzinfo=tz)
cut_date_time = datetime.datetime(2019, 7, 15, tzinfo=tz)

In [None]:
# Show the zone name carried by the reference datetime.
print("TZ NAME: {tz}".format(tz=base_date_time.tzname()))

In [None]:
# Sanity check: parsing the base instant from text should land exactly on
# base_date_time, so the displayed difference should be 0.0 seconds.
d0 = parse_date('2001/01/01 00:00')
(d0 - base_date_time).total_seconds()

In [None]:
# Show the zone name carried by a datetime produced by parse_date.
print("TZ NAME: {tz}".format(tz=d0.tzname()))

In [None]:
# Load the previously prepared tide observations; later cells read the
# 'dt', 'dts' and 'tide feet' columns from this frame.
# NOTE(review): unpickling can execute arbitrary code - only load a file from a trusted source.
tides = pandas.read_pickle('tides.pickle.gz')

In [None]:
# Flag rows strictly before cut_date_time as training data; the rest form the test set.
tides['train'] = tides['dt']<cut_date_time

In [None]:
# Preview the first rows, including the new 'train' flag.
tides.head()

In [None]:
# Discard observations before first_date_time and renumber the rows from zero.
tides = tides.loc[tides['dt'] >= first_date_time, :].reset_index(drop=True)

In [None]:
# Independent copy of the training rows, renumbered from zero.
dtrain = tides.loc[tides['train'], :].copy().reset_index(drop=True)

In [None]:
# Discrete Fourier transform of the training tide heights.
xform = numpy.fft.fft(dtrain['tide feet'])

In [None]:
# Magnitude threshold for "strong" Fourier coefficients: take the 21st-largest
# magnitude (index 20 after sorting in descending order) and round it down to
# a power of ten.
# numpy.log10 is used rather than log(x)/log(10): that quotient can land just
# below an integer for exact powers of ten (e.g. 1000 -> 2.9999999999999996),
# which would floor to a cutoff ten times too small.
# NOTE(review): requires at least 21 training samples.
cutoff = 10**math.floor(numpy.log10(-numpy.sort(-abs(xform))[20]))
cutoff

In [None]:
# Boolean mask of the coefficients at or above the cutoff; display how many were kept.
pick = numpy.abs(xform) >= cutoff
pick.sum()

In [None]:
# Zero every coefficient that did not make the cut, leaving a sparse spectrum.
xform[~pick] = 0j

In [None]:
# Invert the sparse spectrum and keep only the real part of the result.
back = numpy.fft.ifft(xform).real

In [None]:
# Store the truncated-spectrum reconstruction next to the observations.
dtrain['fft approx'] = back

In [None]:
# Scatter of the reconstruction against the observed heights on the training rows.
seaborn.scatterplot(x='fft approx', y='tide feet', 
                    data=dtrain, 
                    alpha=0.5)
# Assigning the return value keeps the notebook from echoing it.
info = matplotlib.pyplot.title("fft approximation on training set")

In [None]:
# R-squared of the truncated-spectrum reconstruction on the training rows.
sklearn.metrics.r2_score(dtrain['tide feet'], dtrain['fft approx'])

In [None]:
# fftfreq reports frequencies in cycles per sample spacing.
# Keep the distinct positive magnitudes of the retained frequencies, in
# ascending order (numpy.unique already returns its result sorted).
freqs = numpy.fft.fftfreq(dtrain.shape[0])
freqs = [f for f in numpy.unique(numpy.abs(freqs[pick])) if f > 0]

In [None]:
# Spacing between consecutive samples, taken from the first two 'dts' values.
# The label lookups 0 and 1 rely on dtrain's index having been reset.
# NOTE(review): presumably 'dts' holds seconds - confirm against how tides was built.
sample_spacing_seconds = dtrain['dts'][1] - dtrain['dts'][0]

In [None]:
# Turn each frequency (cycles per sample) into a period in the units of 'dts'.
periods_seconds = [sample_spacing_seconds/f for f in freqs]

In [None]:
# Build one sine and one cosine feature per retained period on both frames,
# and record the column names (sin first, then cos) as the model inputs.
# NOTE(review): `vars` shadows the builtin of the same name; it is kept
# because later cells refer to it.
vars = []
for ps in periods_seconds:
    suffix = '(second/' + str(ps) + ')'
    vs, vc = 'sin' + suffix, 'cos' + suffix
    for frame in (dtrain, tides):
        angle = 2*numpy.pi*frame['dts']/ps
        frame[vs] = numpy.sin(angle)
        frame[vc] = numpy.cos(angle)
    vars.extend([vs, vc])
    

In [None]:
# Fit a lightly regularized elastic-net regression of tide height on the
# sine/cosine features, using the training rows only.
fitter = sklearn.linear_model.ElasticNet(fit_intercept=True, 
                                         alpha = 1e-4,
                                         max_iter=10000)
fitter.fit(dtrain[vars], dtrain['tide feet'])
# Uncomment to inspect the fitted weights:
#fitter.coef_

In [None]:
# In-sample predictions of the linear model.
dtrain['predict'] = fitter.predict(dtrain[vars])

In [None]:
# Compare the linear model's in-sample predictions with the FFT reconstruction.
seaborn.scatterplot(x='predict', y='fft approx', 
                    data=dtrain, alpha=0.5)
info = matplotlib.pyplot.title("linear model compared to fft approximation on training set")

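Now try to extrapolate: apply the model fit on the training window to every row, including the held-out dates on or after the cut date.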

In [None]:
# Predict for every row of tides, training and test alike.
tides['predict'] = fitter.predict(tides[vars])

In [None]:
# Independent copy of the held-out (non-training) rows, renumbered from zero.
dtest = tides.loc[~tides['train'], :].copy().reset_index(drop=True)

In [None]:
# Observed tide heights over the held-out period.
seaborn.lineplot(data=dtest, x='dt', y='tide feet')
# pyplot.xticks is the same function that pylab re-exports.
info = matplotlib.pyplot.xticks(rotation=45)
info = matplotlib.pyplot.title("test data")

In [None]:
# Extrapolated model output over the held-out period.
seaborn.lineplot(data=dtest, x='dt', y='predict', color='black')
# pyplot.xticks is the same function that pylab re-exports.
info = matplotlib.pyplot.xticks(rotation=45)
info = matplotlib.pyplot.title("prediction in test region")

In [None]:
# Overlay the extrapolated prediction (black) on the recorded series (blue)
# for the held-out rows.
# NOTE(review): this cell plots 'Preliminary (ft)' while the other cells use
# 'tide feet' - confirm that column exists and is the intended one.
test_plot = tides.loc[~tides['train'], :]
for column, shade in (('predict', 'black'), ('Preliminary (ft)', 'blue')):
    seaborn.lineplot(x='dt', y=column, data=test_plot, color=shade, alpha=0.5)
info = matplotlib.pyplot.xticks(rotation=45)
info = matplotlib.pyplot.title("prediction (black) superimposed on test data")

In [None]:
# Scatter of the extrapolated predictions against the observed heights on the test rows.
seaborn.scatterplot(x='predict', y='tide feet', 
                    data=dtest, 
                    alpha=0.5)
info = matplotlib.pyplot.title("predictions on test data")

In [None]:
# Out-of-sample R-squared of the first model on the test rows.
sklearn.metrics.r2_score(dtest['tide feet'], dtest['predict'])

Now try to cross-validate for better regularization parameters.

In [None]:
# Candidate regularization settings: every pairing of a penalty strength
# (powers of ten from 1e-5 to 1e4) with an l1_ratio drawn from
# numpy.arange(0, 1, 0.05).
alphas = [10 ** k for k in range(-5, 5)]
print(alphas)
l1_ratios = numpy.arange(0, 1, 0.05)
print(l1_ratios)
# Flat list ordered by l1_ratio first, then by alpha.
grid = [
    {"alpha": alpha, "l1_ratio": l1_ratio}
    for l1_ratio in l1_ratios
    for alpha in alphas
]
grid[0]

In [None]:
def cross_predict_model(fitter, X, Y, plan):
    """Collect out-of-fold predictions for every split of a cross-validation plan.

    Args:
        fitter: estimator whose ``fit(X, y)`` returns an object with ``predict(X)``.
        X: feature frame, indexed positionally through ``.iloc``.
        Y: outcome values aligned with ``X``, indexed through ``.iloc``.
        plan: sequence of mappings, each with a ``"train"`` and an ``"app"``
            entry holding positional row indices.

    Returns:
        A numpy array with one prediction per row of ``X``. Rows that appear
        in no ``"app"`` set keep their initial value of 0.
    """
    preds = numpy.zeros(X.shape[0])
    for split in plan:
        train_rows, app_rows = split["train"], split["app"]
        # Refit on this split's training rows, then score only its application rows.
        fitted = fitter.fit(X.iloc[train_rows], Y.iloc[train_rows])
        preds[app_rows] = fitted.predict(X.iloc[app_rows])
    return preds

def est_quality(settings, plan, dtrain, mvars, outcome='y', max_iter=1000):
    """Estimate the cross-validated mean squared error of one ElasticNet setting.

    Args:
        settings: mapping with ``"alpha"`` and ``"l1_ratio"`` entries.
        plan: cross-validation plan passed on to ``cross_predict_model``.
        dtrain: frame holding the feature columns and the outcome column.
        mvars: names of the feature columns.
        outcome: name of the outcome column.
        max_iter: iteration limit handed to ElasticNet. The default of 1000
            equals the estimator's own default, so existing calls behave as
            before; pass a larger value to match fits made elsewhere with a
            higher limit.

    Returns:
        Mean of the squared differences between the outcome and its
        out-of-fold predictions.
    """
    fitter = sklearn.linear_model.ElasticNet(alpha=settings["alpha"],
                                             l1_ratio=settings["l1_ratio"],
                                             fit_intercept=True,
                                             max_iter=max_iter)
    preds = cross_predict_model(fitter, dtrain[mvars], dtrain[outcome], plan)
    return numpy.mean((dtrain[outcome] - preds)**2)

In [None]:
# NOTE(review): this `fitter` is never handed to est_quality (which builds its
# own estimator) and is reassigned before the final fit, so it appears unused.
fitter = sklearn.linear_model.ElasticNet(fit_intercept=True, 
                                         max_iter=10000)
# Five-fold plan built from the 'dts' ordering of the training rows.
# NOTE(review): presumably the folds respect time order - confirm against the vtreat docs.
cross_plan = vtreat.cross_plan.order_cross_plan(k_folds=5, order_vector=dtrain['dts'])

In [None]:
%%capture
# Score every grid point by its cross-validated mean squared error.
param_evals = [
    {
        "settings": candidate,
        "loss": est_quality(candidate, cross_plan, dtrain, vars, 'tide feet'),
    }
    for candidate in grid
]

In [None]:
# Lowest cross-validated loss, and every setting that attains it to within 1e-9.
losses = [q["loss"] for q in param_evals]
min_loss = numpy.min(losses)
best_params = [q for q, loss in zip(param_evals, losses) if loss <= min_loss + 1e-9]
best_params

In [None]:
# Refit on all training rows using the first of the best-scoring settings.
# NOTE(review): max_iter=1000 here, whereas the earlier fit cells use
# max_iter=10000 - confirm which limit is intended.
settings = best_params[0]["settings"]
fitter = sklearn.linear_model.ElasticNet(alpha = settings["alpha"], 
                                         l1_ratio = settings["l1_ratio"], 
                                         fit_intercept=True,
                                         max_iter=1000)
# The following cell predicts through `fitter`, not through `model`.
model = fitter.fit(dtrain[vars], dtrain['tide feet'])

In [None]:
# Test-set predictions from the refit model.
dtest['pred2'] = fitter.predict(dtest[vars])

In [None]:
# Scatter of the refit model's predictions against the observed heights on the test rows.
seaborn.scatterplot(x='pred2', y='tide feet', 
                    data=dtest, 
                    alpha=0.5)
info = matplotlib.pyplot.title("pred2 on test data")

In [None]:
# Out-of-sample R-squared of the refit model, for comparison with the first model's score.
sklearn.metrics.r2_score(dtest['tide feet'], dtest['pred2'])

No real change in test-set performance in this case.