In [1]:
# Notebook setup: enable the IPython autoreload extension in mode 2, which
# re-imports modules before each cell execution so edits to local .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn off the jedi-based completer as a workaround for broken tab completion.
%config Completer.use_jedi = False  # if autocomplete is not working

In [63]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import io
from collections import Counter
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process import kernels
from sklearn.linear_model import RidgeCV, QuantileRegressor, GammaRegressor
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVR, SVR, LinearSVC
from sklearn.metrics import average_precision_score

In [4]:
# Load the training table, discard every row that contains a missing value,
# and convert the 'date' column to pandas datetimes.
df = (
    pd.read_csv('../dev/train-data.csv')
    .dropna()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [6]:
# Build the supervised-learning arrays from the daily table `df`.
#
# For every row position `start_day`:
#   * features: every non-date column of the `past_days` rows preceding
#     `start_day`, flattened, followed by the per-column sums of the 'prec-'
#     columns over the same window;
#   * target for each offset `o`: the maximum of 'level-12' over the `o` rows
#     starting at `start_day`.
#
# Results:
#   train_data           -- 2-D array, one feature row per sample
#   test_data_by_offset  -- {offset: column vector of targets, shape (n, 1)}
#   dates                -- list with the 'date' value of each sample's `start_day`
offsets = [1, 2, 3, 7]
past_days = 3

# NOTE(review): this cell previously took the column names from `levels_df`,
# which is not defined anywhere in the notebook and raised NameError on a fresh
# kernel. The windows are slices of `df`, so its columns are used here --
# confirm that `levels_df` was simply an earlier name for this frame.
# Both column lists are loop-invariant, so they are computed once.
value_columns = [c for c in df.columns if c != 'date']
precip_columns = [c for c in df.columns if 'prec-' in c]

train_data = []
test_data_by_offset = {o: [] for o in offsets}
dates = []

# Stop early enough that the longest target window still fits inside `df`.
for start_day in range(past_days, len(df) - max(offsets)):
    x = df.iloc[start_day - past_days:start_day]
    train_data.append(
        np.concatenate([
            # all levels
            x[value_columns].values.flatten(),
            # sum of past precip
            x[precip_columns].sum(axis=0).values.flatten(),
        ])
    )
    dates.append(df.iloc[start_day]['date'])
    for o in offsets:
        yy = df.iloc[start_day:start_day + o]['level-12'].max()
        # dropna() ran at load time, so a non-finite target indicates bad input data
        assert np.isfinite(yy)
        test_data_by_offset[o].append(yy)

train_data = np.array(train_data)
test_data_by_offset = {k: np.array(v).reshape((-1, 1)) for k, v in test_data_by_offset.items()}

In [106]:
# Fit one linear SVM per (threshold, offset) pair and collect the held-out
# predictions in `pred_df` (a list of DataFrames, one per fitted model).
#
#   * threshold is a number -> LinearSVC classifying "target > threshold";
#     the stored prediction is the decision-function score.
#   * threshold is None     -> LinearSVR regressing the target itself.
train_test_cut = -359  # samples from this (negative) position onward form the test split
thresholds = [600, 650, 700, 740, 770, None]

# The feature matrix does not depend on threshold or offset, so split, fit the
# scaler and transform once instead of once per loop iteration.
X_train, X_test = train_data[:train_test_cut], train_data[train_test_cut:]
xs = StandardScaler().fit(X_train)
X_train_scaled = xs.transform(X_train)
X_test_scaled = xs.transform(X_test)
test_dates = dates[train_test_cut:]

pred_df = []
for t in thresholds:
    for o in offsets:
        # data prep
        y_train = test_data_by_offset[o][:train_test_cut]
        y_test = test_data_by_offset[o][train_test_cut:]

        # Sample weights: min-max scale the training target to [0, 1]; the fit
        # calls below pass exp(ws), so larger targets receive larger weights.
        ws = y_train.flatten()
        ws = ws - ws.min()
        ws_span = ws.max()
        if ws_span > 0:
            # a constant target would give 0/0 = NaN weights; in that case the
            # zeros are kept, which yields uniform weights exp(0) = 1
            ws = ws / ws_span

        if t is None:
            sv = LinearSVR(C=25.0, dual=False, epsilon=0.001, loss="squared_epsilon_insensitive", tol=1e-05)
            sv.fit(X_train_scaled, y_train.ravel(), sample_weight=np.exp(ws))
            ps = sv.predict(X_test_scaled)
            observed = y_test.ravel()

            print('offset', o)
            print('  abs error', np.mean(np.abs(ps - observed)))
            print('  sq error', np.mean((ps - observed)**2))

        else:
            # Binary labels get their own names so y_train / y_test keep
            # holding the continuous targets.
            labels_train = (y_train.ravel() > t).astype(int)
            observed = (y_test.ravel() > t).astype(int)
            if not observed.any():
                # no positive test sample: nothing to evaluate for this pair
                continue
            if np.unique(labels_train).size < 2:
                # LinearSVC cannot be fitted on single-class training labels
                continue

            sv = LinearSVC(C=25.0, dual=False, loss="squared_hinge", tol=1e-05)
            sv.fit(X_train_scaled, labels_train, sample_weight=np.exp(ws))
            ps = sv.decision_function(X_test_scaled)

            print('offset', o, 'threshold', t)
            print('  aps:', average_precision_score(observed, ps))
            print('  acc', np.mean((observed > 0.5) == (ps > 0)))

        pred_df.append(pd.DataFrame({
            'date': test_dates,
            'prediction': ps,
            'observed': observed,
            'offset': o,
            'threshold': t,
        }))

offset 1 threshold 600
  aps: 0.9036045107698254
  acc 0.9832869080779945
offset 2 threshold 600
  aps: 0.8412551596249387
  acc 0.9721448467966574
offset 3 threshold 600
  aps: 0.7656784588025463
  acc 0.9526462395543176
offset 7 threshold 600
  aps: 0.6178095403765815
  acc 0.8885793871866295
offset 1 threshold 650
  aps: 0.8257813242139262
  acc 0.9860724233983287
offset 2 threshold 650
  aps: 0.6360486162799605
  acc 0.9693593314763231
offset 3 threshold 650
  aps: 0.5062281232941422
  acc 0.9526462395543176
offset 7 threshold 650
  aps: 0.42200889559853494
  acc 0.9108635097493036
offset 1 threshold 700
  aps: 0.6844827586206896
  acc 0.9944289693593314
offset 2 threshold 700
  aps: 0.5001082251082251
  acc 0.9888579387186629
offset 3 threshold 700
  aps: 0.3895835419774942
  acc 0.9832869080779945
offset 7 threshold 700
  aps: 0.21534943845805482
  acc 0.9610027855153204
offset 1 threshold 740
  aps: 0.325
  acc 0.9944289693593314
offset 2 threshold 740
  aps: 0.5178571428571428


In [91]:
# Stack the per-(threshold, offset) prediction frames collected in `pred_df`
# into one long table. Each frame keeps its own row index, so index values
# repeat across groups.
pdf = pd.concat(pred_df)

In [92]:
# Store the dates as plain strings; the table is later written with json.dump,
# which cannot encode pandas Timestamp objects.
pdf = pdf.assign(date=pdf['date'].astype(str))

In [93]:
# Turn the prediction table into a list of {column name: value} records,
# one dict per row; the row index is not included.
column_names = list(pdf.columns)
rows = []
for record in pdf.itertuples(index=False, name=None):
    rows.append(dict(zip(column_names, record)))

In [94]:
import json

In [95]:
# Persist the prediction records as a single JSON array.
with open('/tmp/predictions.json', mode='w') as out_file:
    json.dump(rows, fp=out_file)

In [96]:
# Display the combined prediction table (a bare last expression is rendered
# as the cell's output).
pdf

Unnamed: 0,date,prediction,observed,offset,threshold
0,2020-01-01,-2.683016,0.0,1,600
1,2020-01-02,-2.795412,0.0,1,600
2,2020-01-03,-2.786946,0.0,1,600
3,2020-01-04,-2.532901,0.0,1,600
4,2020-01-05,-2.647502,0.0,1,600
...,...,...,...,...,...
354,2020-12-20,456.215546,472.0,7,
355,2020-12-21,447.023315,472.0,7,
356,2020-12-22,463.574813,472.0,7,
357,2020-12-23,474.282589,472.0,7,


In [97]:
# List the distinct values of the 'threshold' column; None marks the rows
# produced by the regression models.
pdf['threshold'].unique()

array([600, 650, 700, 740, 770, None], dtype=object)