In [0]:
# Execution environment selector: 'colab' enables the Colab-specific setup
# (package installation here, Google Drive mount and /gdrive paths below);
# any other value selects the local paths.
ENV = 'colab'  # 'colab' -> run on Google Colab
if ENV == 'colab':
    # Shell escape: install the extra packages this notebook imports.
    !pip install -q PyDrive imbalanced-learn ipdb hyperopt

# Rete neurale per extreme returns su 2 azioni - con sentiment

Questo notebook contiene la parte di rete neurale per confronto con l'analisi statistica. Qui faremo *solo* l'ottimizzazione degli iperparametri; l'addestramento finale con i parametri ottimali trovati sarà fatto in un altro notebook.

Il flusso è il seguente:

- [x] utilizzo del dataset *S&P500* con la massima ampiezza storica disponibile (2005 - 2018)
- [x] calcolo dei log returns
- [x] selezione di due stocks, quelle con la minima e la massima volatilità nel training set considerato
- [x] creazione estremi al 95%
- [x] oversampling con due possibili strategie: replicare le istanze positive, o replicarle con aggiunta di rumore gaussiano
- [x] addestramento rete con hyperparameter optimization
- [x] ripetizione di ottimizzazione iperparametri per tutte e due le azioni con aggiunta di sentiment

Nell'altro notebook dovrò fare:
- [ ] utilizzo stesse metriche (ROC, KSS, Precision, Recall, Utility) che nel paper
- [ ] confronto con i risultati del modello probabilistico
- [ ] conclusioni

In [0]:
import os
import gc
import time
import datetime
from typing import Any, Dict, List, Tuple, Union
import pickle
import copy
import pprint
import uuid

import numpy as np
import pandas as pd
import pandas.testing as pt
import sklearn.metrics as sm
import sklearn.preprocessing as skpp
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler
import hyperopt as hy
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK

import keras

import matplotlib.pyplot as pl
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import seaborn as sns

import ipdb

%pdb on

In [0]:
if ENV == 'colab':
    # Colab only: mount Google Drive under /gdrive, the root used by the
    # Colab data/experiment paths defined further down.
    from google.colab import drive
    drive.mount('/gdrive')

Un po' di dichiarazioni utili per il seguito

In [0]:
# Labels of the two stocks under study; they are the keys of `stock_codes`.
stock_type = ['min_vol', 'max_vol']
# Kinds of return for which a threshold is computed: positive, negative, absolute.
return_type = ['pos', 'neg', 'abs']
# Label under which the extreme-return thresholds are stored.
q_type = '95'
rs = 42  # random state
MAX_EPOCHS = 1000

stock_codes = {
    'min_vol': '9CE4C7',
    'max_vol': 'E28F22'
}  # already found in Paper-azioni.ipynb

stock_colors = {
    'min_vol': 'palegoldenrod',
    'max_vol': 'coral',
}

# Each date is the first trading day available in the data for that month.
# NOTE(review): 'subprime-crisis' and 'subprime-crisis-start' map to the same date.
split_dates = {
    'subprime-crisis': datetime.datetime(2007, 1, 3), # subprime crisis
    'subprime-crisis-start': datetime.datetime(2007, 1, 3), # subprime crisis
    'subprime-crisis-halfway': datetime.datetime(2008, 9, 2),
    'subprime-crisis-end': datetime.datetime(2010, 1, 4),
    'eu-debt': datetime.datetime(2011, 1, 3), # EU sovereign debt crisis
    'eu-debt-halfway': datetime.datetime(2012, 1, 3), # EU sovereign debt crisis
    'last_train': datetime.datetime(2017, 1, 3), 
}

## 1. Importazione dei dati 

Per importare i dati dobbiamo caricarli, e poi usare la strategia "taglia-e-cuci" usata in `Paper-azioni.ipynb`.

In [0]:
# Root folders for the input data and for this experiment's outputs.
if ENV == 'colab':
    # Paths inside the Google Drive mounted at /gdrive.
    data_path = '/gdrive/My Drive/OptiRisk Thesis/data'
    base_path = '/gdrive/My Drive/OptiRisk Thesis/experiments/11_final_experiment'
else:
    # NOTE(review): absolute, user-specific local paths; adjust when running elsewhere.
    data_path = "/Users/pietro/Google Drive/OptiRisk Thesis/data"
    base_path = "/Users/pietro/Google Drive/OptiRisk Thesis/experiments/11_final_experiment"

# Input files: prices/volumes CSV, directory of technical-analysis features,
# and the sentiment impact CSV.
prices_path = os.path.join(data_path, 'prices', 'adjusted_prices_volume.csv')
ta_dir = os.path.join(data_path, 'technical_features', 'features_all_years')
impact_path = os.path.join(data_path, 'sentiment', 'impactFinal.csv')
print(f"BASE path: {base_path}")
print(f"TA dir: {ta_dir}")

Conversione delle date e settaggio dell'index del dataframe

In [0]:
# Load the price/volume table and index it by date.
# The `date` column (format YYYYMMDD) is parsed and assigned as a whole new
# column: `prices.loc[:, 'date'] = ...` writes into the existing column and, on
# pandas >= 2, does not reliably turn it into datetime64. The frame is
# re-indexed with `set_index` (which also removes the column) instead of being
# mutated in place, so the cell gives the same result when re-run.
prices = pd.read_csv(prices_path)
prices['date'] = pd.to_datetime(prices['date'], format="%Y%m%d")
prices = prices.set_index('date')
prices.head()

Trasformiamola in una serie temporale, ogni riga una data, ogni colonna un'azione.

I prezzi:

In [0]:
# Wide table of closing prices: rows follow the current (date) index, one
# column per `ravenpackId`.
prices_ts = prices.pivot(columns='ravenpackId', values='close')
# Drop every stock (column) that has at least one missing price.
prices_ts_no_nan = prices_ts.dropna(axis='columns', how='any', inplace=False)
prices_ts_no_nan.head()

I volumi:

In [0]:
# Wide table of traded volumes, same layout as the price table.
volume_ts = prices.pivot(columns='ravenpackId', values='volume')
# Keep exactly the stocks retained in `prices_ts_no_nan`.
# NOTE(review): missing volumes are not dropped here, only the column set is matched.
volume_ts_no_nan = volume_ts.loc[:, prices_ts_no_nan.columns]
pt.assert_index_equal(prices_ts_no_nan.columns, volume_ts_no_nan.columns, check_names=False)
volume_ts_no_nan.head()

Ora carico l'impact score:

In [0]:
# Load the sentiment impact table and index it by date.
# As for the prices, the `date` column (format YYYYMMDD) is replaced as a whole
# rather than through `.loc[:, 'date'] = ...` (which on pandas >= 2 does not
# reliably change the column dtype), and the index is set without in-place
# mutation so the cell can be re-run safely.
impact = pd.read_csv(impact_path)
impact['date'] = pd.to_datetime(impact['date'], format="%Y%m%d")
impact = impact.set_index('date')
impact.head()

In [0]:
# Wide table of sentiment impacts. Pivoting two value columns yields
# two-level columns addressed as ('PosImpact' | 'NegImpact', companyId).
impact_ts = impact.pivot(columns='companyId', values=['PosImpact', 'NegImpact'])
# Drop every (impact, company) column that has at least one missing value.
impact_ts_no_nan = impact_ts.dropna(axis='columns', how='any', inplace=False)
impact_ts_no_nan.head()

Ora calcoliamo i log-returns, le direzioni, i volumi e gli impact:

In [0]:
# One-day log-returns; the first row (NaN after differencing) is discarded.
log_returns = np.log(prices_ts_no_nan).diff(periods=1).iloc[1:, :]
# One-day price differences, used later to derive the up/down direction.
directions_ts_no_nan = prices_ts_no_nan.diff(periods=1).iloc[1:, :]
# Drop the first row of prices and volumes so they share the returns' index.
# NOTE(review): these two lines overwrite their own inputs, so re-running this
# cell alone removes one more row each time.
prices_ts_no_nan = prices_ts_no_nan.iloc[1:, :]
volume_ts_no_nan = volume_ts_no_nan.iloc[1:, :]

# Restrict the sentiment data to the price dates (label-based selection).
impact_ts_no_nan = impact_ts_no_nan.loc[prices_ts_no_nan.index]

# Sanity checks: every table must share the same date index.
pt.assert_index_equal(prices_ts_no_nan.index, volume_ts_no_nan.index)
pt.assert_index_equal(prices_ts_no_nan.index, log_returns.index)
pt.assert_index_equal(prices_ts_no_nan.index, directions_ts_no_nan.index)
pt.assert_index_equal(prices_ts_no_nan.index, impact_ts_no_nan.index)

Mi conviene creare una funzione che standardizzi le features, visto che poi ne avrò più di una (es: returns + volume).

In [0]:
def only_train_notime(feature: pd.Series) -> pd.Series:
    """Return only the rows of ``feature`` that fall in the training periods.

    A row is kept when its index value is strictly before
    ``split_dates['subprime-crisis-halfway']``, or lies in the half-open
    interval ``[split_dates['eu-debt-halfway'], split_dates['last_train'])``.
    """
    index = feature.index

    before_first_test = index < split_dates['subprime-crisis-halfway']
    between_tests = np.logical_and(
        index >= split_dates['eu-debt-halfway'],
        index < split_dates['last_train'],
    )
    in_training = np.logical_or(before_first_test, between_tests)

    return feature[in_training]

def standardize(feature: pd.Series) -> pd.Series:
    """Scale a feature with statistics estimated on the training rows only.

    A ``RobustScaler`` is fitted on the training part of ``feature`` (as
    selected by ``only_train_notime``, i.e. ignoring the time ordering) and
    then applied to the whole series. The result keeps the index of ``feature``.
    """
    # scikit-learn expects 2-D input: one column, one row per observation
    train_column = only_train_notime(feature).values.reshape(-1, 1)
    full_column = feature.values.reshape(-1, 1)

    scaler = skpp.RobustScaler().fit(train_column)
    scaled_values = scaler.transform(full_column).flatten()

    return pd.Series(data=scaled_values, index=feature.index)

Ora creo i thresholds:

In [0]:
# Build the (non-standardized) training returns and, from them, the thresholds
# that define an extreme return.
lr_train_notime = dict()
lr_test_notime = dict()
returns_train_notime = dict()

# Training data are taken in "cut-and-sew" mode: only the training periods,
# stitched together regardless of the gap between them.
for s_type, s_code in stock_codes.items():
    # training set
    lr_current = log_returns.loc[:, s_code]
    lr_train_notime[s_type] = only_train_notime(lr_current)

    # Training returns, all expressed as POSITIVE magnitudes
    # (negative returns are sign-flipped).
    returns_train_notime[s_type] = {
        'pos': lr_train_notime[s_type][lr_train_notime[s_type] > 0.0],
        'neg': -(lr_train_notime[s_type][lr_train_notime[s_type] < 0.0]),
        'abs': lr_train_notime[s_type].abs()
    }

# Quantile level derived from the `q_type` label ('95' -> 0.95), so that the
# key under which a threshold is stored always matches the quantile used.
quantile_level = float(q_type) / 100.0

# thresholds[s_type][ret_type][q_type] -> quantile of the training returns
thresholds = {
    s_type: {
        ret_type: {
            q_type: returns_train_notime[s_type][ret_type].quantile(quantile_level)
        }
        for ret_type in return_type
    }
    for s_type in stock_type
}

ed infine creo i DataFrame e gli arrays che contengono tutti gli estremi e tutti i dati.

Le features che qui utilizziamo sono:

- log-returns standardizzati
- volume scambiato standardizzato
- tutte le features di TA che ci sono nel white paper di Douglas

In [0]:
# Technical-analysis features to load; each is stored in `<name>.h5` under `ta_dir`.
feature_names = [
    'adx', 'aroon_down', 'aroon_up', 'atr', 'bb_lower', 'bb_middle', 'bb_upper',
    'cci', 'cmo', 'ema5', 'ema10', 'ema15', 'macd', 'rsi', 'sma5', 'sma10', 'sma15',
]

feature_paths = [os.path.join(ta_dir, name + '.h5') for name in feature_names]

features = dict()
first_allowable_dates = dict()  # dates from which features and returns can be taken

# Features scaled with `standardize` (statistics fitted on the training rows).
# NOTE(review): 'roc' is listed here but is not in `feature_names`, so it is never loaded.
to_standardize = {
    'sma5', 'sma10', 'sma15',
    'ema5', 'ema10', 'ema15',
    'macd',
    'bb_lower', 'bb_middle', 'bb_upper',
    'roc', 'atr', 'cci', 'adx',
    }

# Features rescaled by dividing by a fixed constant instead.
to_divide = {
    'rsi': 100.0,
    'aroon_down': 100.0,
    'aroon_up': 100.0,
    'cmo': 100.0,
}

for s_type, s_code in stock_codes.items():
    print(f"Stock type: {s_type}")
    print("-"*30)
    features[s_type] = dict()

    for feature_name, feature_path in zip(feature_names, feature_paths):
        # NOTE(review): the file is read again for every stock.
        feature = pd.read_hdf(feature_path)

        if feature_name in to_standardize:
            print(f"standardizing {feature_name}")
            feature_transformed = standardize(feature.loc[:, s_code])
            features[s_type][feature_name] = feature_transformed
        elif feature_name in to_divide.keys():
            print(f"dividing {feature_name}")
            features[s_type][feature_name] = feature.loc[:, s_code] / to_divide[feature_name]
        else:
            # every feature must have an explicit scaling rule
            raise ValueError(f"unknown feature {feature_name}")

    # positive and negative sentiment impact, used as-is (no scaling applied here)
    print("adding positive and negative sentiment impact")
    features[s_type]['pos_impact'] = impact_ts_no_nan.loc[:, ('PosImpact', s_code)]
    features[s_type]['neg_impact'] = impact_ts_no_nan.loc[:, ('NegImpact', s_code)]
    
    
    print("-" * 30)
    print("")

In [0]:
extremes_all = dict()  # keys: s_type, q_type
data_all = dict()  # keys: s_type
volumes = dict()  # keys: s_type
directions_all = dict()  # keys: s_type

for s_type, s_code in stock_codes.items():
    # log-returns, standardized for use as an input feature
    lr = log_returns.loc[:, s_code]
    lr_transformed = standardize(lr)

    # traded volumes, standardized
    stock_volume = volume_ts_no_nan.loc[:, s_code]
    volume_transformed = standardize(stock_volume)
    volumes[s_type] = volume_transformed

    # full feature list: returns, volume, technical features, sentiment impacts
    all_features = [lr_transformed, volume_transformed] + \
              [features[s_type][name] for name in feature_names] + \
              [features[s_type]['pos_impact'], features[s_type]['neg_impact']]

    # all the features side by side in a single DataFrame, one column per feature
    tmp_df = pd.concat(
        all_features,
        axis=1,
        keys=['log_return', 'volume'] + feature_names + ['pos_impact', 'neg_impact']
    )

    # drop every date on which at least one feature is missing
    tmp_df = tmp_df.dropna(axis='index', how='any')
    
    data_all[s_type] = tmp_df
    extremes_all[s_type] = dict()
    
    # A day is extreme when the raw log-return is at/above the positive
    # threshold or at/below minus the negative threshold.
    ext = np.logical_or(
        lr >= thresholds[s_type]['pos'][q_type],
        lr <= -thresholds[s_type]['neg'][q_type],
    )
    
    # NOTE(review): extremes (and directions below) keep the full
    # `log_returns` index, whereas `data_all[s_type]` has lost the rows dropped
    # by `dropna` above; they must be aligned on dates before being indexed
    # positionally together.
    extremes_all[s_type][q_type] = pd.Series(data=ext, index=log_returns.index)
    
    # directions: 1 when the price difference is positive, 0 otherwise
    direction = (directions_ts_no_nan.loc[:, s_code] > 0.0).astype(np.int8)
    directions_all[s_type] = direction

## 2. Creazione dataset train-test per TensorFlow

Ora che ho i thresholds, posso creare il dataset vero e proprio, cioè:

- X: cubo dati
- y: estremo si/no

Per prima cosa, creo delle funzioni che mi creano i dati:

In [0]:
# Operates on a 2-D numpy array; rolling_window_xyd converts pandas objects
# to arrays before calling it.
def rolling_window(data: np.ndarray,
                   start: int,
                   end: int,
                   lookback: int) -> np.ndarray:
    """
    Stack consecutive look-back windows of ``data`` into a 3-D array.

    Window ``i`` holds the ``lookback`` rows ending at row ``start + i``
    (inclusive), so the last window ends at row ``end - 1``.

    Parameters
    ----------
    data: np.ndarray of shape (n_timepoints, n_features)
        one row for each time point and one column for each feature

    start: int
        row on which the first window ends

    end: int
        row at which the windows stop; data[end] is **excluded**

    lookback: int
        number of rows in each window

    Returns
    -------
    X: np.ndarray
        array of shape (end - start, lookback, n_features), same dtype as data
    """
    assert lookback < data.shape[0]  # window must be shorter than the data
    assert start - lookback + 1 >= 0  # first window must not start before row 0

    n_windows = end - start
    windows = np.zeros((n_windows, lookback, data.shape[1]), dtype=data.dtype)

    for row in range(n_windows):
        # exclusive upper bound of the slice for this window
        stop = start + 1 + row
        windows[row, :, :] = data[stop - lookback:stop, :]

    return windows


# Tested.
def rolling_window_xyd(data: Union[pd.Series, pd.DataFrame],
                      targets: List[pd.Series],
                      start: int,
                      end: int,
                      lookback: int) -> Tuple[np.ndarray, List[np.ndarray], pd.Series]:
    """
    Build the input windows, the targets and the target dates in one call.

    ``X`` comes from ``rolling_window(data, start, end, lookback)``; every
    target is sliced positionally over ``start + 1 .. end`` (inclusive), i.e.
    one row after the end of the matching window. The returned dates are
    those of the first target over the same slice.

    Raises ``TypeError`` when ``data`` is not a pandas Series/DataFrame, when
    ``targets`` is not a list of Series, or when a target is not indexed by a
    ``DatetimeIndex``.
    """
    if isinstance(data, pd.DataFrame):
        values = data.values
    elif isinstance(data, pd.Series):
        # a single feature becomes a one-column 2-D array
        values = data.values.reshape(-1, 1)
    else:
        raise TypeError("data should be a pandas Series or Dataframe")

    X = rolling_window(values, start, end, lookback)

    if not isinstance(targets, list):
        raise TypeError("target must be a list of pandas Series")
    if any(not isinstance(target, pd.Series) for target in targets):
        raise TypeError("all targets should be pandas Series")
    if any(not isinstance(target.index, pd.DatetimeIndex) for target in targets):
        raise TypeError("index of target should be a pandas DatetimeIndex")

    target_rows = slice(start + 1, end + 1)
    y = [target.values[target_rows] for target in targets]
    dates = pd.Series(data=targets[0].index[target_rows])

    return X, y, dates


# Tested.
def create_Xyd(returns: Union[pd.Series, pd.DataFrame],
               extremes: pd.Series,
               directions: pd.Series,
               lookback: int) -> Tuple[
    np.ndarray, np.ndarray, List[np.ndarray], List[np.ndarray], np.ndarray, np.ndarray
]:
    """
    Create the X, y and dates arrays for the ANN.

    The timeline of ``returns`` is cut at three dates taken from
    ``split_dates``: ``'subprime-crisis-halfway'``, ``'eu-debt-halfway'`` and
    ``'last_train'``. The first and third segments form the training set, the
    second and fourth the test set; the two pieces of each set are
    concatenated in chronological order.

    Parameters
    ----------
    returns: pd.Series or pd.DataFrame
        input features, one row per date; its index must contain the three
        split dates

    extremes, directions: pd.Series
        targets, indexed by date. Segment boundaries are positions in
        ``returns``, so a target whose index differs from ``returns.index``
        is first aligned to it by date.

    lookback: int
        number of past rows in every input window

    Returns
    -------
    X_train, X_test: np.ndarray of shape (n_samples, lookback, n_features)
    y_train, y_test: list of two arrays, ``[extremes, directions]``
    dates_train, dates_test: np.ndarray with the date of every target

    Raises
    ------
    ValueError
        if a target does not cover every date in ``returns``
    """
    def _aligned(target, label):
        # Non-Series targets are passed through untouched so that
        # rolling_window_xyd reports them with its own TypeError.
        if not isinstance(target, pd.Series) or target.index.equals(returns.index):
            return target
        missing = returns.index.difference(target.index)
        if len(missing) > 0:
            raise ValueError(
                f"{label} has no value for {len(missing)} of the dates in returns"
            )
        # Without this, positions computed on `returns` would silently pick
        # labels belonging to other dates.
        return target.loc[returns.index]

    targets = [_aligned(extremes, 'extremes'), _aligned(directions, 'directions')]

    test_start_1 = returns.index.get_loc(split_dates['subprime-crisis-halfway'])
    test_end_1 = returns.index.get_loc(split_dates['eu-debt-halfway'])
    test_start_2 = returns.index.get_loc(split_dates['last_train'])

    def _segment(start, end):
        return rolling_window_xyd(returns, targets, start=start, end=end, lookback=lookback)

    def _stitch(first, second):
        X_a, y_a, dates_a = first
        X_b, y_b, dates_b = second
        assert len(y_a) == len(y_b)

        X = np.concatenate([X_a, X_b])
        y = [np.concatenate([y_a[i], y_b[i]]) for i in range(len(y_a))]
        dates = pd.concat([dates_a, dates_b], axis=0, ignore_index=True).values

        assert X.shape[0] == dates.shape[0]
        assert all(yy.shape[0] == X.shape[0] for yy in y)
        return X, y, dates

    # TRAIN
    # The first segment starts at lookback - 1, the first row with a complete
    # window behind it. Every later segment starts at a split position and
    # its first windows reach back into the rows of the previous segment.
    X_train, y_train, dates_train = _stitch(
        _segment(lookback - 1, test_start_1),
        _segment(test_end_1, test_start_2),
    )

    # TEST
    # The last segment ends at the final row, which is used only as a target.
    X_test, y_test, dates_test = _stitch(
        _segment(test_start_1, test_end_1),
        _segment(test_start_2, returns.shape[0] - 1),
    )

    return X_train, X_test, y_train, y_test, dates_train, dates_test


def split_stratified(X: np.ndarray,
                     y: List[np.ndarray],
                     dates: np.ndarray,
                     test_size=0.2,
                     random_state=rs,
                     verbose=False):
    """
    Split a dataset in a stratified fashion on the target variable y[0].

    A single shuffled, stratified split is drawn; the same row indices are
    then applied to ``X``, to every array in ``y`` and to ``dates``.

    Returns ``X_train, X_validation, y_train, y_validation, dates_train,
    dates_validation``.
    """
    assert X.ndim == 3

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

    # The splitter only needs something with one entry per sample;
    # stratification itself is driven by y[0].
    placeholder = np.zeros(X.shape[0], dtype=np.int8)

    if verbose:
        for position, target in enumerate(y):
            for value, count in zip(*np.unique(target, return_counts=True)):
                print(f"y[{position}] has {count} elements of class {value}")

    train_index, test_index = next(splitter.split(placeholder, y[0]))

    def take(rows):
        return X[rows], [yy[rows] for yy in y], dates[rows]

    X_train, y_train, dates_train = take(train_index)
    X_validation, y_validation, dates_validation = take(test_index)

    return X_train, X_validation, y_train, y_validation, dates_train, dates_validation


def oversample_mtl(X: np.ndarray, y: List[np.ndarray], random_state=rs, dt=np.float32):
    """Balance the classes of ``y[0]`` by random oversampling.

    The oversampler is run on row numbers rather than on the data, and the
    resampled row numbers are then used to pick rows from ``X`` and from every
    target in ``y``, so all targets stay aligned with the inputs.

    Returns the resampled ``X`` and the list of resampled targets.
    """
    assert X.dtype == dt
    assert X.ndim == 3
    assert isinstance(y, list)
    assert all(yy.ndim == 1 for yy in y) and all(yy.dtype == dt for yy in y)

    sampler = RandomOverSampler(random_state=random_state)

    n_rows = X.shape[0]
    row_ids = np.arange(n_rows).reshape(n_rows, 1)

    # only the resampled row numbers are needed; the labels are re-derived below
    resampled_ids, _ = sampler.fit_resample(row_ids, y[0])
    picked = resampled_ids.flatten()

    return X[picked], [yy[picked] for yy in y]

## 3. Addestramento rete

Visto che serve l'ottimizzazione degli iperparametri, è importante avere una funzione da ottimizzare con Hyperopt. Aggiungiamo le funzioni che servono.

La prima crea il modello:

In [0]:
def create_model_mtl(space: Dict[str, Any],
                     bptt: int,
                     n_features: int) -> keras.models.Model:
    """Build the two-headed LSTM model described by ``space['layers']``.

    The input has shape ``(bptt, n_features)``. It goes through a stack of
    LSTM layers, the first of which always has ``n_features`` units, and then
    into two softmax heads with two units each, named ``'extreme'`` and
    ``'up_down'``.

    Keys read from ``space['layers']``: ``'input_dropout'`` and
    ``'num_layers'`` (``'how_many'``, plus ``'n_cells_2'`` / ``'n_cells_3'``
    for the deeper stacks). ``'n_cells_1'`` is not read.
    """
    layer_cfg = space['layers']
    depth_cfg = layer_cfg['num_layers']

    input_dropout = float(layer_cfg['input_dropout'])
    assert input_dropout >= 0.0 and input_dropout <= 1.0

    n_layers = int(depth_cfg['how_many'])
    assert n_layers <= 2 and n_layers > 0

    # build the model
    model_input = keras.Input(shape=(bptt, n_features), name='model_input')

    if n_layers == 1:
        # dropout is passed only when it is actually requested
        dropout_kwargs = {'dropout': input_dropout} if input_dropout > 0.0 else {}
        x = keras.layers.LSTM(n_features, **dropout_kwargs)(model_input)
    elif n_layers == 2:
        # NOTE(review): input_dropout is not applied in the multi-layer stacks.
        n_cells_2 = int(depth_cfg['n_cells_2'])
        x = keras.layers.LSTM(n_features, return_sequences=True)(model_input)
        x = keras.layers.LSTM(n_cells_2)(x)
    elif n_layers == 3:
        # NOTE(review): unreachable while the assertion above caps n_layers at 2.
        n_cells_2 = int(depth_cfg['n_cells_2'])
        n_cells_3 = int(depth_cfg['n_cells_3'])
        x = keras.layers.LSTM(n_features, return_sequences=True)(model_input)
        x = keras.layers.LSTM(n_cells_2, return_sequences=True)(x)
        x = keras.layers.LSTM(n_cells_3)(x)

    # two classification heads sharing the same LSTM representation
    heads = [
        keras.layers.Dense(2, activation='softmax', name='extreme')(x),
        keras.layers.Dense(2, activation='softmax', name='up_down')(x),
    ]

    return keras.Model(inputs=model_input, outputs=heads, name='MTL_model')

La seconda valuta le performance e trova i TP, FP, TN, FN:

In [0]:
def get_tptnfpfn(y_true, y_pred, labels=[0, 1]) -> Dict[str, int]:
    """Count true/false positives and negatives of ``y_pred`` against ``y_true``.

    The counts are read from the confusion matrix built with ``labels``
    (rows are true classes, columns are predicted classes), taking the second
    label as the positive class. Returns a dict with keys
    ``'tp'``, ``'fp'``, ``'tn'`` and ``'fn'``.
    """
    matrix = sm.confusion_matrix(y_true, y_pred, labels=labels)

    # (true class, predicted class) cell holding each count
    cells = {
        'tp': (1, 1),
        'fp': (0, 1),
        'tn': (0, 0),
        'fn': (1, 0),
    }

    return {name: matrix[cell] for name, cell in cells.items()}


def compute_performance(tp, tn, fp, fn, weighted=False):
    """Average the F1 scores of the two classes from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn: int
        true positives, true negatives, false positives, false negatives
        (plain Python ints and numpy integers are both accepted)

    weighted: bool
        if True, average the two F1 scores weighted by class support, so the
        majority class counts more; otherwise take their plain mean

    Returns
    -------
    float
        the averaged F1 score, or NaN whenever a precision, a recall or an F1
        score is undefined because its denominator is zero (for instance one
        class is absent, or never predicted, or has no true positives)
    """
    def _ratio(numerator, denominator):
        # An undefined ratio is reported as NaN. This keeps plain Python
        # numbers from raising ZeroDivisionError and numpy numbers from
        # emitting a RuntimeWarning on 0/0.
        if denominator > 0:
            return numerator / denominator
        return np.nan

    support_1 = tp + fn
    support_0 = tn + fp

    prec_1 = _ratio(tp, tp + fp)
    prec_0 = _ratio(tn, tn + fn)
    rec_1 = _ratio(tp, support_1)
    rec_0 = _ratio(tn, support_0)

    if any(np.isnan(x) for x in (prec_1, prec_0, rec_1, rec_0)):
        return np.nan

    # Precision and recall of a class are both zero when it has no true
    # positives; its F1 is then 0/0 and is reported as NaN as well.
    f1_1 = _ratio(2.0 * (prec_1 * rec_1), prec_1 + rec_1)
    f1_0 = _ratio(2.0 * (prec_0 * rec_0), prec_0 + rec_0)

    if np.isnan(f1_1) or np.isnan(f1_0):
        return np.nan

    if weighted:  # the majority class weighs more
        return np.average([f1_0, f1_1], weights=[support_0, support_1])

    return np.mean([f1_0, f1_1])

La terza mi crea il dataframe dei risultati che poi posso analizzare:

In [0]:
def get_new_results() -> pd.DataFrame:
    """Return an empty results table with one column per recorded field."""
    run_columns = ['dataset', 'optimizer', 'start_time', 'experiment_id', 'trial']
    hyperparameter_columns = [
        'bptt', 'lr', 'n_layers', 'n_cells_1', 'n_cells_2', 'n_cells_3', 'dropout',
    ]
    loss_columns = ['loss_extreme', 'loss_up_down', 'loss_volume']
    f1_columns = ['f1_validation_extreme', 'f1_validation_up_down']
    confusion_columns = [
        'tp_extreme', 'tn_extreme', 'fp_extreme', 'fn_extreme',
        'tp_up_down', 'tn_up_down', 'fp_up_down', 'fn_up_down',
    ]
    training_columns = [
        'train_epochs', 'es_min_delta', 'es_patience', 'use_class_weight', 'output',
    ]

    all_columns = (
        run_columns
        + hyperparameter_columns
        + loss_columns
        + f1_columns
        + confusion_columns
        + training_columns
    )

    return pd.DataFrame(data=None, columns=all_columns)

Infine l'ultima esegue l'esperimento.

In [0]:
def run_experiment(space,
                   stock_type: str,
                   max_epochs: int,
                   data: Union[pd.Series, pd.DataFrame],
                   extremes: pd.Series,
                   directions: pd.Series,
                   n_runs: int,
                   verbose: int,
                   results_path: str):
    """Fai girare un singolo esperimento.
    
    Parameters
    ----------
    space: 
        hyperopt search space
        
    stock_type:
        type of stock to use
    
    max_epochs: int
        number of max epochs to tun the model for
        
    data: pd.Series of shape (n_timepoints,), or pd.DataFrame of shape (n_timepoints, n_features)
        data containing returns, volume and all other things, where every row is
        a timepoint and every column a different feature
        
    extremes: pd.Series of shape (n_timepoints,)
        target for the extremes, binary 1/0, 
        
    directions: pd.Series of shape (n_timepoints,)
        target for the directions
        
    n_runs: int
        how many times to run a model with the same structure to get a reliable
        estimate of the loss function
        
    verbose: int
        verbosity for Keras
        
    results_path: str or path
        path where to save the intermediate runs
    """
    sigmoid_or_softmax = 'softmax'
    if data.ndim == 1:
        n_features = 1
    else:
        n_features = data.shape[1]

    lookback = bptt = int(space['bptt'])
    batch_size = data.shape[0]
    
    # 1. creazione dataset per questo lookback
    X_trv, X_test, y_trv, y_test, dates_trv, dates_test = create_Xyd(
        data.astype(np.float32),
        extremes.astype(np.float32),
        directions.astype(np.float32),
        lookback=lookback
    )

    # divido in train-validation
    X_train, X_validation, y_train, y_validation, dates_train, dates_validation = split_stratified(
        X_trv,
        y_trv,
        dates_trv,
        test_size=0.2,
        verbose=True
    )

    # bilancio con oversampling della classe di minoranza (1)
    X_train_bal, y_train_bal = oversample_mtl(X_train, y_train)  # bal = balanced
    
    # 2. creo il file dei risultati
    if not os.path.exists(results_path):
        print(f"Creating new result file at {results_path}")
        results = get_new_results()
    else:
        print(f"Loading result file at {results_path}")
        results = pd.read_csv(results_path)

    # 3. creo le variabili che servono per il training (dati e parametri)
    print(space)
    try:
        use_class_weight = space['use_class_weight']
        if use_class_weight:
            print("Using class weight for training")
    except KeyError:
        use_class_weight = False
        
    
    y_train_bal_cat = [keras.utils.to_categorical(yy, num_classes=2) for yy in y_train_bal]
    y_validation_cat = [keras.utils.to_categorical(yy, num_classes=2) for yy in y_validation]
    y_test_cat = [keras.utils.to_categorical(yy, num_classes=2) for yy in y_test]

    # 4. inizializza le loss a 0 e crea i tempi di inizio e l'id esperimento
    train_losses: List[float] = []
    val_losses = []
    test_losses = []

    optimizer_name = space['optimizer']['name']
    assert optimizer_name in {'adam', 'adadelta'}

    start_time = int(round(time.time()))
    experiment_id = str(uuid.uuid4())

    # 5. addestra e testa il modello per n_runs volte
    for i in range(n_runs):
        model = create_model_mtl(space, lookback, n_features)

        # 5.1 crea l'optimizer
        if optimizer_name == 'adam':
            learning_rate = space['optimizer']['lr']
            optimizer = keras.optimizers.Adam(lr=learning_rate)
        elif optimizer_name == 'adadelta':
            optimizer = 'adadelta'
            learning_rate = 1.0
        else:
            raise ValueError(f"Invalid optimizer name {optimizer_name}")

        if i == 0:
            print(
                f"Hyper parameters: bptt={bptt}, optimizer={optimizer_name}, learning_rate={learning_rate}"
            )
            model.summary()

        # 5.2 compila il modello
        model.compile(
            optimizer=optimizer,
            loss=['categorical_crossentropy', 'categorical_crossentropy'],
        )

        # 5.3 parametri per l'Early Stopping
        min_delta = float(space['early_stop']['min_delta'])
        patience = int(space['early_stop']['patience'])

        early_stop_cb = keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=min_delta,
            patience=patience,
            restore_best_weights=True)

        print(f"Iteration {i}")
        print("Fitting model")

#         if use_class_weight:
#             print(f"Class weights: {class_weights}")
#             history: keras.callbacks.History = model.fit(
#                 x=X_train,
#                 y=y_t,
#                 epochs=max_epochs,
#                 batch_size=batch_size,
#                 validation_data=(X_validation, y_v),
#                 callbacks=[early_stop_cb],
#                 shuffle=True,
#                 verbose=verbose,
#                 class_weight=class_weights)
#         else:
#             history: keras.callbacks.History = model.fit(  # type: ignore
#                 x=X_train,
#                 y=y_t,
#                 epochs=max_epochs,
#                 batch_size=batch_size,
#                 validation_data=(X_validation, y_v),
#                 callbacks=[early_stop_cb],
#                 shuffle=True,
#                 verbose=verbose)

        # 5.4 addestra il modello
        history: keras.callbacks.History = model.fit(  # type: ignore
            x=X_train_bal,
            y=y_train_bal_cat,
            epochs=max_epochs,
            batch_size=batch_size,
            validation_data=(X_validation, y_validation_cat),
            callbacks=[early_stop_cb],
            shuffle=True,
            verbose=verbose
        )

        # 5.5 valutalo su training e validation
        train_epochs = len(history.history['loss'])

        min_index = np.argmin(history.history['val_loss'])

        curr_train_loss_total = history.history['loss'][min_index]
        curr_train_loss_extreme = history.history['extreme_loss'][min_index]
        curr_train_loss_up_down = history.history['up_down_loss'][min_index]
        train_losses.append(curr_train_loss_total)

        curr_validation_loss_total = history.history['val_loss'][min_index]
        curr_validation_loss_extreme = history.history['val_extreme_loss'][
            min_index]
        curr_validation_loss_up_down = history.history['val_up_down_loss'][
            min_index]

        # 5.6 valutalo su test set
        current_test_loss = model.evaluate(X_test, y_test_cat, batch_size=batch_size, verbose=0)

        curr_test_loss_total, curr_test_loss_extreme, curr_test_loss_up_down = current_test_loss
        test_losses.append(curr_test_loss_total)

        # 5.7 salva train, validation e test set performance
        # train
        print("Evaluating the model on the training set")
        probs = model.predict(X_train_bal, batch_size=batch_size)
        y_pred_extreme, y_pred_up_down = [np.argmax(p, axis=1) for p in probs]
        
        y_true_extreme, y_true_up_down = y_train_bal

        r_extreme = get_tptnfpfn(y_true_extreme, y_pred_extreme)
        r_up_down = get_tptnfpfn(y_true_up_down, y_pred_up_down)

        row = pd.Series({
            'dataset': 'train',
            'optimizer': optimizer_name,
            'start_time': start_time,
            'experiment_id': experiment_id,
            'trial': i,
            'bptt': bptt,
            'lr': learning_rate,
            'n_layers': space['layers']['num_layers']['how_many'],
            'n_cells_1': space['layers']['num_layers']['n_cells_1'],
            'n_cells_2': space['layers']['num_layers']['n_cells_2'],
            'dropout': space['layers']['input_dropout'],
            'loss_extreme': curr_train_loss_extreme,
            'loss_up_down': curr_train_loss_up_down,
            'loss_volume': np.nan,
            'f1_validation_extreme': np.nan,
            'f1_validation_up_down': np.nan,
            'tp_extreme': r_extreme['tp'],
            'tn_extreme': r_extreme['tn'],
            'fp_extreme': r_extreme['fp'],
            'fn_extreme': r_extreme['fn'],
            'tp_up_down': r_up_down['tp'],
            'tn_up_down': r_up_down['tn'],
            'fp_up_down': r_up_down['fp'],
            'fn_up_down': r_up_down['fn'],
            'train_epochs': train_epochs,
            'es_min_delta': min_delta,
            'es_patience': patience,
            'use_class_weight': use_class_weight,
            'output': sigmoid_or_softmax,
        })
        results = results.append(row, ignore_index=True)

        # validation
        print("Evaluating the model on the validation set")
        probs = model.predict(X_validation, batch_size=batch_size)
        y_pred_extreme, y_pred_up_down = [np.argmax(p, axis=1) for p in probs]
        
        y_true_extreme, y_true_up_down = y_validation
        
        r_extreme = get_tptnfpfn(y_true_extreme, y_pred_extreme)
        r_up_down = get_tptnfpfn(y_true_up_down, y_pred_up_down)

        # ora voglio ottimizzare rispetto all'F1 score della classe +1 -->
        # utilizzo questo come metrica
        f1_extreme = compute_performance(r_extreme['tp'], r_extreme['tn'],
                                         r_extreme['fp'], r_extreme['fn'])
        f1_up_down = compute_performance(r_up_down['tp'], r_up_down['tn'],
                                         r_up_down['fp'], r_up_down['fn'])

#         # se l'f1 è zero, allora l'esperimento è fallito
#         if np.isnan(f1_extreme) or np.isnan(f1_up_down):
#             return {
#                 'status': STATUS_FAIL,
#                 'loss': 1e+30,
#             }

        val_losses.append(curr_validation_loss_total)

        row = pd.Series({
            'dataset': 'validation',
            'optimizer': optimizer_name,
            'start_time': start_time,
            'experiment_id': experiment_id,
            'trial': i,
            'bptt': bptt,
            'lr': learning_rate,
            'n_layers': space['layers']['num_layers']['how_many'],
            'n_cells_1': space['layers']['num_layers']['n_cells_1'],
            'n_cells_2': space['layers']['num_layers']['n_cells_2'],
            'dropout': space['layers']['input_dropout'],
            'loss_extreme': curr_validation_loss_extreme,
            'loss_up_down': curr_validation_loss_up_down,
            'loss_volume': np.nan,
            'f1_validation_extreme': f1_extreme,
            'f1_validation_up_down': f1_up_down,
            'tp_extreme': r_extreme['tp'],
            'tn_extreme': r_extreme['tn'],
            'fp_extreme': r_extreme['fp'],
            'fn_extreme': r_extreme['fn'],
            'tp_up_down': r_up_down['tp'],
            'tn_up_down': r_up_down['tn'],
            'fp_up_down': r_up_down['fp'],
            'fn_up_down': r_up_down['fn'],
            'train_epochs': train_epochs,
            'es_min_delta': min_delta,
            'es_patience': patience,
            'use_class_weight': use_class_weight,
            'output': sigmoid_or_softmax,
        })
        results = results.append(row, ignore_index=True)

        # testing
        print("Evaluating the model on the test set")
        probs = model.predict(X_test, batch_size=batch_size)
        y_pred_extreme, y_pred_up_down = [np.argmax(p, axis=1) for p in probs]
        
        y_true_extreme, y_true_up_down = y_test

        r_extreme = get_tptnfpfn(y_true_extreme, y_pred_extreme)
        r_up_down = get_tptnfpfn(y_true_up_down, y_pred_up_down)

        row = pd.Series({
            'dataset': 'test',
            'optimizer': optimizer_name,
            'start_time': start_time,
            'experiment_id': experiment_id,
            'trial': i,
            'bptt': bptt,
            'lr': learning_rate,
            'n_layers': space['layers']['num_layers']['how_many'],
            'n_cells_1': space['layers']['num_layers']['n_cells_1'],
            'n_cells_2': space['layers']['num_layers']['n_cells_2'],
            'dropout': space['layers']['input_dropout'],
            'loss_extreme': curr_test_loss_extreme,
            'loss_up_down': curr_test_loss_up_down,
            'loss_volume': np.nan,
            'f1_validation_extreme': np.nan,
            'f1_validation_up_down': np.nan,
            'tp_extreme': r_extreme['tp'],
            'tn_extreme': r_extreme['tn'],
            'fp_extreme': r_extreme['fp'],
            'fn_extreme': r_extreme['fn'],
            'tp_up_down': r_up_down['tp'],
            'tn_up_down': r_up_down['tn'],
            'fp_up_down': r_up_down['fp'],
            'fn_up_down': r_up_down['fn'],
            'train_epochs': train_epochs,
            'es_min_delta': min_delta,
            'es_patience': patience,
            'use_class_weight': use_class_weight,
            'output': sigmoid_or_softmax,
        })
        results = results.append(row, ignore_index=True)

        # scrivo sul CSV
        print("Scrivo i risultati")
        results.to_csv(results_path, index=None)

        # cancello la memoria
        keras.backend.clear_session()
        gc.collect()

    # 6. media delle loss
    train_loss_mean = np.mean(train_losses)
    train_loss_variance = np.var(train_losses)

    val_loss_mean = np.mean(val_losses)
    val_loss_variance = np.var(val_losses)

    test_loss_mean = np.mean(test_losses)
    test_loss_variance = np.var(test_losses)

    # 7. return del dict di loss
    return {
        'status': STATUS_OK,
        'loss': val_loss_mean,
        'loss_variance': val_loss_variance,
        'true_loss': test_loss_mean,
        'true_loss_variance': test_loss_variance,
        'train_loss_mean': train_loss_mean,
        'train_loss_variance': train_loss_variance,
    }

### 3.1 Azione con minima volatilità - ottimizzazione iperparametri

Cominciamo con l'azione meno volatile. Per prima cosa, bisogna trovare la struttura migliore della rete, quindi usiamo l'ottimizzazione degli iperparametri per farlo.

In [0]:
# Stock selector for this section; it keys the per-stock lookups below.
s_type = 'min_vol'
results_path = os.path.join(
    base_path,
    'results',
    f"results_{s_type}_with_sentiment.csv",
)
print(f"RESULTS path: {results_path}")


def objective_fn(space, **kwargs):
    """Hyperopt objective: evaluate one sampled configuration via `run_experiment`.

    Args:
        space: the hyperparameter sample handed over by hyperopt.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        The value returned by `run_experiment` for this configuration.

    Note:
        `s_type`, `results_path`, `MAX_EPOCHS` and the `*_all` containers are
        read from the notebook namespace when the function is called, not when
        this cell is executed.
    """
    experiment_settings = dict(
        stock_type=s_type,
        max_epochs=MAX_EPOCHS,
        data=data_all[s_type],
        extremes=extremes_all[s_type]['95'],
        directions=directions_all[s_type],
        n_runs=5,
        verbose=0,
        results_path=results_path,
    )
    return run_experiment(space, **experiment_settings)

Ora creo lo spazio di ricerca per questa azione:

In [0]:
# Optimizer: either Adam with a sampled learning rate, or Adadelta with no
# additional hyperparameter.
adam_branch = {
    'name': 'adam',
    'lr': hp.uniform('lr_adam', low=1e-5, high=1e-2),
}
adadelta_branch = {
    'name': 'adadelta',
}
optimizer_space = hp.choice('opt_name', [adam_branch, adadelta_branch])

# Architecture: two layers. The first layer size is held fixed at 21 (it was
# previously searched with quniform(low=10, high=100, q=2)); only the second
# layer size and the input dropout rate are sampled.
# NOTE(review): hp.quniform samples are floats per hyperopt's docs - confirm
# that the consumer casts them wherever an integer is required.
cells_space = {
    'how_many': 2,
    'n_cells_1': 21,
    'n_cells_2': hp.quniform('number_of_cells_2', low=5, high=20, q=1),
}
layer_space = {
    'num_layers': cells_space,
    'input_dropout': hp.uniform('dropout_kill_rate', low=0.0, high=0.4),
}

# Early stopping: patience and minimum improvement threshold.
early_stop_space = {
    'patience': hp.quniform('early_stop_patience', low=5, high=25, q=1),
    'min_delta': hp.quniform('early_stop_min_delta', low=1e-4, high=1e-2, q=2e-4),
}

# Complete search space passed to `fmin`.
opt_space = {
    'optimizer': optimizer_space,
    'layers': layer_space,
    'bptt': hp.quniform('bptt_len', low=10, high=120, q=1),
    'early_stop': early_stop_space,
    'use_class_weight': False,
}

Ora facciamo la ricerca degli iperparametri:

In [0]:
print("COMINCIAMO\n")
n_trials = 50

# Set to True to resume from the Trials object pickled by a previous execution.
load_trials = False
trials_filename = os.path.join(
    base_path, 'results', f"trials_{s_type}_with_sentiment_2_layers.pickle"
)
best_filename = os.path.join(
    base_path, 'results', f"best_{s_type}_with_sentiment_2_layers.pickle"
)

if not load_trials:
    print("Starting new experiment with new trials object")
    trials = hy.Trials()
else:
    print("Loading trials from last execution")
    try:
        # pickle.load can execute arbitrary code: only open files written by
        # this notebook.
        with open(trials_filename, 'rb') as infile:
            trials = pickle.load(infile)
            print(f"len(trials) = {len(trials)}")
    except FileNotFoundError:
        print("Trials file not found, starting with new trial object")
        trials = hy.Trials()

# The evaluation budget is cumulative: trials already stored plus `n_trials`
# new ones.
total_evals = len(trials) + n_trials

best = fmin(
    objective_fn,
    opt_space,
    algo=tpe.suggest,
    max_evals=total_evals,
    trials=trials,
)

# Persist the full search history first, then the best configuration found.
for pickle_path, payload in ((trials_filename, trials), (best_filename, best)):
    with open(pickle_path, 'wb') as outfile:
        pickle.dump(payload, outfile)

print("FINITO")

### 3.2 Azione massima volatilità - ottimizzazione iperparametri

Ora, facciamo la stessa cosa per l'azione con più volatilità. Ricorda che qui stiamo solo cercando la configurazione ottimale della rete e degli iperparametri, mentre il vero training-testing lo farò in un altro notebook.

In [0]:
# Stock selector for this section; it keys the per-stock lookups below.
s_type = 'max_vol'
results_path = os.path.join(
    base_path,
    'results',
    f"results_{s_type}_with_sentiment.csv",
)
print(f"RESULTS path: {results_path}")


def objective_fn(space, **kwargs):
    """Hyperopt objective for this section's stock, delegating to `run_experiment`.

    Args:
        space: the hyperparameter sample handed over by hyperopt.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        The value returned by `run_experiment` for this configuration.

    Note:
        This definition replaces any `objective_fn` defined by an earlier cell.
        `s_type`, `results_path`, `MAX_EPOCHS` and the `*_all` containers are
        looked up in the notebook namespace at call time.
    """
    run_kwargs = dict(
        stock_type=s_type,
        max_epochs=MAX_EPOCHS,
        data=data_all[s_type],
        extremes=extremes_all[s_type]['95'],
        directions=directions_all[s_type],
        n_runs=5,
        verbose=0,
        results_path=results_path,
    )
    return run_experiment(space, **run_kwargs)

In [0]:
# Optimizer: either Adam with a sampled learning rate, or Adadelta with no
# additional hyperparameter.
adam_branch = {
    'name': 'adam',
    'lr': hp.uniform('lr_adam', low=1e-5, high=1e-2),
}
adadelta_branch = {
    'name': 'adadelta',
}
optimizer_space = hp.choice('opt_name', [adam_branch, adadelta_branch])

# Architecture: two layers. The first layer size is held fixed at 21 (it was
# previously searched with quniform(low=10, high=100, q=2)); only the second
# layer size and the input dropout rate are sampled.
# NOTE(review): hp.quniform samples are floats per hyperopt's docs - confirm
# that the consumer casts them wherever an integer is required.
cells_space = {
    'how_many': 2,
    'n_cells_1': 21,
    'n_cells_2': hp.quniform('number_of_cells_2', low=5, high=20, q=1),
}
layer_space = {
    'num_layers': cells_space,
    'input_dropout': hp.uniform('dropout_kill_rate', low=0.0, high=0.4),
}

# Early stopping: patience and minimum improvement threshold.
early_stop_space = {
    'patience': hp.quniform('early_stop_patience', low=5, high=25, q=1),
    'min_delta': hp.quniform('early_stop_min_delta', low=1e-4, high=1e-2, q=2e-4),
}

# Complete search space passed to `fmin`.
opt_space = {
    'optimizer': optimizer_space,
    'layers': layer_space,
    'bptt': hp.quniform('bptt_len', low=10, high=120, q=1),
    'early_stop': early_stop_space,
    'use_class_weight': False,
}

In [0]:
print("COMINCIAMO\n")
n_trials = 50

# True: resume from the Trials object pickled by a previous execution, if any.
load_trials = True
trials_filename = os.path.join(
    base_path, 'results', f"trials_{s_type}_with_sentiment_2_layers.pickle"
)
best_filename = os.path.join(
    base_path, 'results', f"best_{s_type}_with_sentiment_2_layers.pickle"
)

if not load_trials:
    print("Starting new experiment with new trials object")
    trials = hy.Trials()
else:
    print("Loading trials from last execution")
    try:
        # pickle.load can execute arbitrary code: only open files written by
        # this notebook.
        with open(trials_filename, 'rb') as infile:
            trials = pickle.load(infile)
            print(f"len(trials) = {len(trials)}")
    except FileNotFoundError:
        print("Trials file not found, starting with new trial object")
        trials = hy.Trials()

# The evaluation budget is cumulative: trials already stored plus `n_trials`
# new ones.
total_evals = len(trials) + n_trials

best = fmin(
    objective_fn,
    opt_space,
    algo=tpe.suggest,
    max_evals=total_evals,
    trials=trials,
)

# Persist the full search history first, then the best configuration found.
for pickle_path, payload in ((trials_filename, trials), (best_filename, best)):
    with open(pickle_path, 'wb') as outfile:
        pickle.dump(payload, outfile)

print("FINITO")

## Scratch

Ora vedo come variare il threshold per ottenere le curve ROC

In [0]:
def loss_function(theta, recall, fpr):
    """Weighted loss L = theta * (1 - recall) + (1 - theta) * fpr.

    Args:
        theta: weight of the missed-detection term; must lie in [0, 1].
        recall: true positive rate.
        fpr: false positive rate.

    Returns:
        The weighted sum of the miss rate (1 - recall) and the false positive rate.

    Raises:
        AssertionError: if `theta` is outside [0, 1].
    """
    assert 0.0 <= theta <= 1.0

    miss_term = theta * (1 - recall)
    false_alarm_term = (1 - theta) * fpr
    return miss_term + false_alarm_term


def utility_function(theta, loss):
    """Utility U = min(theta, 1 - theta) - loss.

    Args:
        theta: the weight that was used to compute `loss`.
        loss: a loss value, e.g. the output of `loss_function`.

    Returns:
        The smaller of `theta` and `1 - theta`, minus `loss`.
    """
    reference_level = min(theta, 1 - theta)
    return reference_level - loss


def to_binary(prob: np.ndarray, thresh: float):
    """Turn probabilities into 0/1 predictions using a decision threshold.

    Args:
        prob: array of probabilities.
        thresh: decision threshold; must lie in [0, 1].

    Returns:
        An int8 array holding 1 where `prob >= thresh` and 0 elsewhere.

    Raises:
        AssertionError: if `thresh` is outside [0, 1].
    """
    assert 0.0 <= thresh <= 1.0

    above_threshold = prob >= thresh
    return above_threshold.astype(np.int8)


def recall_fpr_kss_precision(y_true, y_pred):
    """Compute recall, false positive rate, KSS score and precision.

    Both inputs are interpreted element-wise as booleans (non-zero means
    positive) to build the confusion counts.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted binary labels, aligned with `y_true`.

    Returns:
        A tuple `(recall, fpr, kss, precision)` where
        recall = TP / (TP + FN), fpr = FP / (FP + TN),
        precision = TP / (TP + FP) and kss = recall - fpr.
        A ratio whose denominator is zero (e.g. precision when nothing is
        predicted positive) is NaN, and so is `kss` when it depends on one.
    """
    not_true = np.logical_not(y_true)
    not_pred = np.logical_not(y_pred)

    tp = np.sum(np.logical_and(y_true, y_pred))
    tn = np.sum(np.logical_and(not_true, not_pred))
    fp = np.sum(np.logical_and(not_true, y_pred))
    fn = np.sum(np.logical_and(y_true, not_pred))

    # A zero denominator always comes with a zero numerator here, so the
    # result is NaN. Make that explicit rather than letting NumPy emit a
    # RuntimeWarning for every degenerate threshold of a sweep.
    with np.errstate(divide='ignore', invalid='ignore'):
        recall = tp / (tp + fn)
        fpr = fp / (fp + tn)
        precision = tp / (tp + fp)

    kss = recall - fpr

    return recall, fpr, kss, precision

In [0]:
# Candidate decision thresholds: 0.000, 0.001, ... up to (excluding) 1.
threshold_step = 1e-3
w_t = np.arange(0, 1, threshold_step)

# Weight passed to the loss / utility functions during the threshold sweep.
theta = 0.5

In [0]:
def optimize_wt(w, theta, probabilities, y_true, verbose=False):
    """Sweep decision thresholds and record the metrics obtained at each one.

    For every threshold in `w` the class-1 probabilities are binarized with
    `to_binary` and scored with `recall_fpr_kss_precision`, `loss_function`
    and `utility_function`. The function does not pick a threshold itself:
    callers select one from the returned arrays (e.g. `np.argmax(utilities)`).

    Args:
        w: sequence of thresholds to evaluate; each must lie in [0, 1]
            (enforced by `to_binary`).
        theta: weight in [0, 1] forwarded to the loss and utility functions.
        probabilities: array of class-1 probabilities.
        y_true: binary ground-truth labels aligned with `probabilities`.
        verbose: when True, print progress every 200 thresholds.

    Returns:
        A tuple `(recalls, fprs, ksss, precisions, losses, utilities)` of
        float64 arrays, each with one entry per threshold in `w`.
    """
    n_thresholds = len(w)

    recalls = np.zeros((n_thresholds, ), dtype=np.float64)
    fprs = np.zeros_like(recalls)
    ksss = np.zeros_like(recalls)
    precisions = np.zeros_like(recalls)
    losses = np.zeros_like(recalls)
    utilities = np.zeros_like(recalls)

    for i, thresh in enumerate(w):
        if verbose and i % 200 == 0:
            # Report against the thresholds actually being swept, not the
            # notebook-level `w_t` array.
            print(f"iteration {i} / {n_thresholds}")

        # `to_binary` already returns int8, so no further cast is needed.
        y_pred = to_binary(probabilities, thresh)
        recall, fpr, kss, precision = recall_fpr_kss_precision(y_true, y_pred)
        loss = loss_function(theta, recall, fpr)
        utility = utility_function(theta, loss)

        recalls[i] = recall
        precisions[i] = precision
        ksss[i] = kss
        fprs[i] = fpr
        losses[i] = loss
        utilities[i] = utility

    if verbose:
        print("Finished!")

    return recalls, fprs, ksss, precisions, losses, utilities

In [0]:
# NOTE(review): `model`, `X_train`, `X_validation`, `y_train`, `y_validation`
# and `batch_size` must already exist in the notebook namespace when this cell
# runs - confirm where they are defined.

# `model.predict` returns one array per output head (index 0: extreme,
# index 1: up/down). Run it once per split and take column 1 (the class-1
# probability) of each head, rather than predicting twice per split.
predictions_train = model.predict(X_train, batch_size=batch_size)
probabilities_extreme_train = predictions_train[0][:, 1]
probabilities_up_down_train = predictions_train[1][:, 1]

predictions_validation = model.predict(X_validation, batch_size=batch_size)
probabilities_extreme_validation = predictions_validation[0][:, 1]
probabilities_up_down_validation = predictions_validation[1][:, 1]

# Threshold sweep for the ROC curves of the extreme task
(recalls_train, fprs_train, ksss_train,
 precisions_train, losses_train, utilities_train) = optimize_wt(
    w_t, theta, probabilities_extreme_train, y_train[0].astype(np.int8)
)

(recalls_validation, fprs_validation, ksss_validation,
 precisions_validation, losses_validation, utilities_validation) = optimize_wt(
    w_t, theta, probabilities_extreme_validation, y_validation[0].astype(np.int8)
)

# Threshold sweep for the ROC curves of the up/down task
(recalls_train_ud, fprs_train_ud, ksss_train_ud,
 precisions_train_ud, losses_train_ud, utilities_train_ud) = optimize_wt(
    w_t, theta, probabilities_up_down_train, y_train[1].astype(np.int8)
)

(recalls_validation_ud, fprs_validation_ud, ksss_validation_ud,
 precisions_validation_ud, losses_validation_ud, utilities_validation_ud) = optimize_wt(
    w_t, theta, probabilities_up_down_validation, y_validation[1].astype(np.int8)
)

In [0]:
def _plot_roc_with_best(axis, fprs, recalls, utilities, color, label):
    """Draw one ROC curve on `axis` and mark its utility-maximizing point.

    Args:
        axis: matplotlib axes to draw on.
        fprs: false positive rate per threshold.
        recalls: recall per threshold, aligned with `fprs`.
        utilities: utility per threshold, aligned with `fprs`; its argmax
            selects the point highlighted with a square marker.
        color: color used for both the curve and the marker.
        label: legend label of the curve; the marker is labelled
            "<label> - best".

    Returns:
        The `(fpr, recall)` arrays sorted by increasing FPR, as plotted.
    """
    order = np.argsort(fprs)
    sorted_fprs = fprs[order]
    sorted_recalls = recalls[order]
    axis.plot(sorted_fprs, sorted_recalls, color=color, label=label)

    i_best = np.argmax(utilities)
    axis.plot(
        fprs[i_best],
        recalls[i_best],
        marker='s',
        markersize=5,
        color=color,
        label=f"{label} - best",
    )
    return sorted_fprs, sorted_recalls


fig, ax = pl.subplots(nrows=1, ncols=2, figsize=(20, 10))
fig.suptitle(f"{s_type} stock", fontsize=16)

# First panel: EXTREMES (train, then validation)
x_extreme, y_extreme = _plot_roc_with_best(
    ax[0], fprs_train, recalls_train, utilities_train, 'navy', 'train'
)
_plot_roc_with_best(
    ax[0], fprs_validation, recalls_validation, utilities_validation,
    'forestgreen', 'validation'
)
ax[0].set_title("Extreme prediction")

# Second panel: UP-DOWN (train, then validation).
# The best point is chosen with the up-down utilities: using the utilities of
# the extreme task here would mark a threshold optimized for the wrong output.
_plot_roc_with_best(
    ax[1], fprs_train_ud, recalls_train_ud, utilities_train_ud, 'navy', 'train'
)
_plot_roc_with_best(
    ax[1], fprs_validation_ud, recalls_validation_ud, utilities_validation_ud,
    'forestgreen', 'validation'
)
ax[1].set_title("Up-Down prediction")

# Diagonal of the random classifier, plus shared axis formatting
for a in ax:
    a.plot([0, 1], [0, 1], color='black', linewidth=0.5)
    a.legend(loc='lower right', fontsize=14)
    a.set_xlim([0, 1.1])
    a.set_ylim([0, 1.1])
    a.set_xlabel('FPR', fontsize=16)
    a.set_ylabel('Recall', fontsize=16)

sns.despine()