In [0]:
# Execution environment selector: 'colab' enables the Colab-specific setup
# (package installation here, Google Drive mount and Drive paths further down).
ENV = 'colab'  # any other value selects the local paths defined later
if ENV == 'colab':
    # NOTE(review): packages are installed without version pins; consider pinning
    # them so that the notebook stays reproducible.
    !pip install -q PyDrive imbalanced-learn ipdb hyperopt

# Rete neurale per extreme returns su 2 azioni - con sentiment

Questo notebook contiene la parte di rete neurale per confronto con l'analisi statistica. Qui faremo *solo* l'ottimizzazione degli iperparametri; l'addestramento finale con i parametri ottimali trovati sarà fatto in un altro notebook.

Il flusso è il seguente:

- [x] utilizzo del dataset *S&P500* con la massima ampiezza storica disponibile (2005 - 2018)
- [x] calcolo dei log returns
- [x] selezione di due stocks, quelle con la minima e la massima volatilità nel training set considerato
- [x] creazione estremi al 95%
- [x] oversampling con due possibili strategie: replicare le istanze positive, o replicarle con aggiunta di rumore gaussiano
- [x] addestramento rete con hyperparameter optimization
- [x] ripetizione di ottimizzazione iperparametri per tutte e due le azioni con aggiunta di sentiment

Nell'altro notebook dovrò fare:
- [ ] utilizzo stesse metriche (ROC, KSS, Precision, Recall, Utility) che nel paper
- [ ] confronto con i risultati del modello probabilistico
- [ ] conclusioni

In [0]:
import os
import gc
import time
import datetime
from typing import Any, Dict, List, Tuple, Union
import pickle
import copy
import pprint
import uuid

import numpy as np
import pandas as pd
import pandas.testing as pt
import scipy.integrate
import sklearn.metrics as sm
import sklearn.preprocessing as skpp
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler
import hyperopt as hy
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK

import keras

import matplotlib.pyplot as pl
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import seaborn as sns

import ipdb

%pdb on

In [0]:
# On Colab, mount Google Drive under /gdrive so that the data and experiment
# folders referenced by the path definitions below are reachable.
if ENV == 'colab':
    # Colab-only module, hence imported here instead of in the import cell.
    from google.colab import drive
    drive.mount('/gdrive')

Un po' di dichiarazioni utili per il seguito

In [0]:
# Keys used throughout the notebook to index the nested result dictionaries.
stock_type = ['min_vol', 'max_vol']
return_type = ['pos', 'neg', 'abs']
impact_type = ['no_sentiment', 'with_sentiment']
# Label of the quantile level, used as a dictionary key for thresholds/extremes.
q_type = '95'
rs = 42  # random state
# Upper bound on training epochs passed to train_model (early stopping may end sooner).
MAX_EPOCHS = 1000

# Identifiers of the two selected stocks (columns of the price/feature tables).
stock_codes = {
    'min_vol': '9CE4C7',
    'max_vol': 'E28F22'
}  # already identified in Paper-azioni.ipynb

# Plot colors, one per stock.
stock_colors = {
    'min_vol': 'palegoldenrod',
    'max_vol': 'coral',
}

# Plot colors, one per dataset split.
dataset_colors = {
    'train': 'navy',
    'validation': 'forestgreen',
}

# Each date is the first day available in the data for that month.
# NOTE(review): 'subprime-crisis' and 'subprime-crisis-start' map to the same date.
split_dates = {
    'subprime-crisis': datetime.datetime(2007, 1, 3), # subprime crisis
    'subprime-crisis-start': datetime.datetime(2007, 1, 3), # subprime crisis
    'subprime-crisis-halfway': datetime.datetime(2008, 9, 2),
    'subprime-crisis-end': datetime.datetime(2010, 1, 4),
    'eu-debt': datetime.datetime(2011, 1, 3), # EU sovereign debt crisis
    'eu-debt-halfway': datetime.datetime(2012, 1, 3), # EU sovereign debt crisis
    'last_train': datetime.datetime(2017, 1, 3), 
}

## 1. Importazione dei dati 

Per importare i dati dobbiamo caricarli, e poi usare la strategia "taglia-e-cuci" usata in `Paper-azioni.ipynb`.

In [0]:
# Root folders for the input data and for this experiment's outputs.
# NOTE(review): the non-Colab branch hard-codes an absolute, user-specific path.
if ENV == 'colab':
    data_path = '/gdrive/My Drive/OptiRisk Thesis/data'
    base_path = '/gdrive/My Drive/OptiRisk Thesis/experiments/11_final_experiment'
else:
    data_path = "/Users/pietro/Google Drive/OptiRisk Thesis/data"
    base_path = "/Users/pietro/Google Drive/OptiRisk Thesis/experiments/11_final_experiment"

# Input locations: adjusted prices/volumes, technical-analysis features (one
# .h5 file per feature) and the sentiment impact scores.
prices_path = os.path.join(data_path, 'prices', 'adjusted_prices_volume.csv')
ta_dir = os.path.join(data_path, 'technical_features', 'features_all_years')
impact_path = os.path.join(data_path, 'sentiment', 'impactFinal.csv')
print(f"BASE path: {base_path}")
print(f"TA dir: {ta_dir}")

Conversione delle date e settaggio dell'index del dataframe

In [0]:
# Load the adjusted prices/volumes table and index it by date.
prices = pd.read_csv(prices_path)
# Assign the parsed column directly: `prices.loc[:, 'date'] = ...` tries to write
# the values in place (pandas >= 2.0) and may therefore keep the original column
# dtype instead of producing a proper datetime64 column.
prices['date'] = pd.to_datetime(prices['date'], format="%Y%m%d")
# Move 'date' from the columns to the index (the index keeps the name 'date').
prices = prices.set_index('date')
prices.head()

Trasformiamola in una serie temporale, ogni riga una data, ogni colonna un'azione.

I prezzi:

In [0]:
# Wide table of closing prices: one row per date, one column per 'ravenpackId'.
prices_ts = prices.pivot(columns='ravenpackId', values='close')
# Keep only the stocks that have a price on every date.
prices_ts_no_nan = prices_ts.dropna(axis='columns', how='any', inplace=False)
prices_ts_no_nan.head()

I volumi:

In [0]:
# Wide table of traded volumes: one row per date, one column per 'ravenpackId'.
volume_ts = prices.pivot(columns='ravenpackId', values='volume')
# Restrict the volumes to the stocks retained in the price table.
volume_ts_no_nan = volume_ts.loc[:, prices_ts_no_nan.columns]
# Sanity check: prices and volumes must cover the same stocks, in the same order.
pt.assert_index_equal(prices_ts_no_nan.columns, volume_ts_no_nan.columns, check_names=False)
volume_ts_no_nan.head()

Ora carico l'impact score:

In [0]:
# Load the sentiment impact scores and index them by date.
impact = pd.read_csv(impact_path)
# Assign the parsed column directly: `impact.loc[:, 'date'] = ...` tries to write
# the values in place (pandas >= 2.0) and may therefore keep the original column
# dtype instead of producing a proper datetime64 column.
impact['date'] = pd.to_datetime(impact['date'], format="%Y%m%d")
# Move 'date' from the columns to the index (the index keeps the name 'date').
impact = impact.set_index('date')
impact.head()

In [0]:
# Wide table of impact scores: one row per date and a two-level column index
# (impact kind, 'companyId') for the positive and negative impact.
impact_ts = impact.pivot(columns='companyId', values=['PosImpact', 'NegImpact'])
# Keep only the (impact kind, company) columns without missing values.
impact_ts_no_nan = impact_ts.dropna(axis='columns', how='any', inplace=False)
impact_ts_no_nan.head()

Ora calcoliamo i log-returns, le direzioni, i volumi e gli impact:

In [0]:
# Daily log-returns; the first row is dropped because the first difference is undefined.
log_returns = np.log(prices_ts_no_nan).diff(periods=1).iloc[1:, :]
# Daily price differences, later used to derive the up/down direction target.
directions_ts_no_nan = prices_ts_no_nan.diff(periods=1).iloc[1:, :]
# Drop the first row of prices and volumes too, so that all tables share the same dates.
# NOTE(review): these reassignments are not idempotent - re-running this cell
# drops one more row each time; re-run from the pivot cells instead.
prices_ts_no_nan = prices_ts_no_nan.iloc[1:, :]
volume_ts_no_nan = volume_ts_no_nan.iloc[1:, :]

# Select the impact rows by the price dates, in the same order.
impact_ts_no_nan = impact_ts_no_nan.loc[prices_ts_no_nan.index]

# Sanity checks: every table must be indexed by exactly the same dates.
pt.assert_index_equal(prices_ts_no_nan.index, volume_ts_no_nan.index)
pt.assert_index_equal(prices_ts_no_nan.index, log_returns.index)
pt.assert_index_equal(prices_ts_no_nan.index, directions_ts_no_nan.index)
pt.assert_index_equal(prices_ts_no_nan.index, impact_ts_no_nan.index)

Mi conviene creare una funzione che standardizzi le features, visto che poi ne avrò più di una (es: returns + volume).

In [0]:
def only_train_notime(feature: pd.Series) -> pd.Series:
    """Select the training observations of a date-indexed Series.

    The training period is made of two disjoint windows: everything strictly
    before ``split_dates['subprime-crisis-halfway']``, plus everything from
    ``split_dates['eu-debt-halfway']`` (included) up to
    ``split_dates['last_train']`` (excluded).
    """
    dates = feature.index

    in_first_window = dates < split_dates['subprime-crisis-halfway']
    from_second_start = dates >= split_dates['eu-debt-halfway']
    before_second_end = dates < split_dates['last_train']

    keep = in_first_window | (from_second_start & before_second_end)
    return feature[keep]

def standardize(feature: pd.Series) -> pd.Series:
    """Scale a feature with a RobustScaler fitted on its training part only.

    The scaler statistics are computed on the observations returned by
    ``only_train_notime`` (the temporal ordering is irrelevant here) and are
    then applied to the whole series. The result keeps the input index.
    """
    # fit on the training observations, as a single-column matrix
    train_column = only_train_notime(feature).values.reshape(-1, 1)
    scaler = skpp.RobustScaler().fit(train_column)

    # transform the full series with the training statistics
    scaled_column = scaler.transform(feature.values.reshape(-1, 1))

    return pd.Series(data=scaled_column.flatten(), index=feature.index)

Ora creo i thresholds:

In [0]:
# ora creo i dati per i returns (non standardizzati), i thresholds e i volumi (standardizzati)
lr_train_notime = dict()
lr_test_notime = dict()
returns_train_notime = dict()

# aggiungiamo i dati in modalità taglia-e-cuci
for s_type, s_code in stock_codes.items():
    # training set
    lr_current = log_returns.loc[:, s_code]
    lr_train_notime[s_type] = only_train_notime(lr_current)
    
    # returns train, tutti POSITIVI
    returns_train_notime[s_type] = {
        'pos': lr_train_notime[s_type][lr_train_notime[s_type] > 0.0],
        'neg': -(lr_train_notime[s_type][lr_train_notime[s_type] < 0.0]),
        'abs': lr_train_notime[s_type].abs()
    }

    

# ora creo i threshold
thresholds = {
    s_type: {
        ret_type: {
            q_type: returns_train_notime[s_type][ret_type].quantile(0.95)
        }
        for ret_type in return_type
    }
    for s_type in stock_type
}

ed infine creo i DataFrame e gli arrays che contengono tutti gli estremi e tutti i dati.

Le features che qui utilizziamo sono:

- log-returns standardizzati
- volume scambiato standardizzato
- tutte le features di TA che ci sono nel white paper di Douglas

In [0]:
# Technical-analysis features to load; each one is stored in '<name>.h5' inside ta_dir.
feature_names = [
    'adx', 'aroon_down', 'aroon_up', 'atr', 'bb_lower', 'bb_middle', 'bb_upper',
    'cci', 'cmo', 'ema5', 'ema10', 'ema15', 'macd', 'rsi', 'sma5', 'sma10', 'sma15',
]

feature_paths = [os.path.join(ta_dir, name + '.h5') for name in feature_names]

# features[stock type][feature name] -> Series for that stock
features = dict()
# NOTE(review): first_allowable_dates is never filled in the visible code.
first_allowable_dates = dict()  # dates from which features and returns can be taken

# Features rescaled with `standardize` (scaler fitted on the training period).
# NOTE(review): 'roc' is listed here but is not in feature_names, so it is never loaded.
to_standardize = {
    'sma5', 'sma10', 'sma15',
    'ema5', 'ema10', 'ema15',
    'macd',
    'bb_lower', 'bb_middle', 'bb_upper',
    'roc', 'atr', 'cci', 'adx',
    }

# Features simply divided by a constant (feature name -> divisor).
to_divide = {
    'rsi': 100.0,
    'aroon_down': 100.0,
    'aroon_up': 100.0,
    'cmo': 100.0,
}

for s_type, s_code in stock_codes.items():
    print(f"Stock type: {s_type}")
    print("-"*30)
    features[s_type] = dict()

    for feature_name, feature_path in zip(feature_names, feature_paths):
        # NOTE(review): every file is read again for each stock; the read does
        # not depend on the stock and could be done once per feature.
        feature = pd.read_hdf(feature_path)

        if feature_name in to_standardize:
            print(f"standardizing {feature_name}")
            feature_transformed = standardize(feature.loc[:, s_code])
            features[s_type][feature_name] = feature_transformed
        elif feature_name in to_divide.keys():
            print(f"dividing {feature_name}")
            features[s_type][feature_name] = feature.loc[:, s_code] / to_divide[feature_name]
        else:
            # every feature must have an explicit scaling rule
            raise ValueError(f"unknown feature {feature_name}")

    # positive and negative sentiment impact, used as they are (no rescaling)
    print("adding positive and negative sentiment impact")
    features[s_type]['pos_impact'] = impact_ts_no_nan.loc[:, ('PosImpact', s_code)]
    features[s_type]['neg_impact'] = impact_ts_no_nan.loc[:, ('NegImpact', s_code)]
    
    
    print("-" * 30)
    print("")

In [0]:
# Per-stock containers for the model inputs and targets.
extremes_all = dict()  # keys: s_type, q_type
data_all = dict()  # keys: s_type
volumes = dict()  # keys: s_type
directions_all = dict()  # keys: s_type

for s_type, s_code in stock_codes.items():
    # log-returns: raw (for the extremes) and standardized (as a feature)
    lr = log_returns.loc[:, s_code]
    lr_transformed = standardize(lr)

    # traded volumes, standardized
    stock_volume = volume_ts_no_nan.loc[:, s_code]
    volume_transformed = standardize(stock_volume)
    volumes[s_type] = volume_transformed

    # returns, volume, technical features and the two sentiment impacts
    all_features = [lr_transformed, volume_transformed] + \
              [features[s_type][name] for name in feature_names] + \
              [features[s_type]['pos_impact'], features[s_type]['neg_impact']]

    # all the features in a single DataFrame, one column per feature
    tmp_df = pd.concat(
        all_features,
        axis=1,
        keys=['log_return', 'volume'] + feature_names + ['pos_impact', 'neg_impact']
    )

    # drop every date on which at least one feature is missing
    tmp_df = tmp_df.dropna(axis='index', how='any')
    
    data_all[s_type] = tmp_df
    extremes_all[s_type] = dict()
    
    # a day is extreme when the raw log-return reaches the positive threshold
    # or falls to (minus) the negative threshold
    ext = np.logical_or(
        lr >= thresholds[s_type]['pos'][q_type],
        lr <= -thresholds[s_type]['neg'][q_type],
    )
    
    # NOTE(review): the extremes (and the directions below) are indexed by the
    # full log_returns dates, while data_all had the rows with missing values
    # dropped; consumers that slice these targets by position must first align
    # them on the data_all index.
    extremes_all[s_type][q_type] = pd.Series(data=ext, index=log_returns.index)
    
    # directions: 1 when the price increased with respect to the previous day, 0 otherwise
    direction = (directions_ts_no_nan.loc[:, s_code] > 0.0).astype(np.int8)
    directions_all[s_type] = direction

## 2. Creazione dataset train-test per TensorFlow

Ora che ho i thresholds, posso creare il dataset vero e proprio, cioè:

- X: cubo dati
- y: estremo si/no

Per prima cosa, creo delle funzioni che mi creano i dati:

In [0]:
# Works on 2-D numpy arrays (one row per time point, one column per feature);
# Series and DataFrames must go through rolling_window_xyd.
def rolling_window(data: np.ndarray,
                   start: int,
                   end: int,
                   lookback: int) -> np.ndarray:
    """
    Create the rolling windows of data, one window for each index between
    start and end, each one spanning lookback time points.

    Window number i covers the rows ``start + i - lookback + 1 .. start + i``
    (both included), i.e. the first window ends at row ``start`` and the last
    one ends at row ``end - 1``.

    Parameters
    ----------
    data: np.ndarray of shape (n_timepoints, n_features)
        the data, containing one row for each time point and one column for each feature

    start: int
        index of the last row of the first window

    end: int
        index where the whole thing ends, data[end] is **excluded**

    lookback: int
        length of the lookback period

    Returns
    -------
    X: np.ndarray
        array of shape (n_points, lookback, n_features), with n_points = end - start
    """
    assert lookback < data.shape[0]  # sane lookback
    assert start - lookback + 1 >= 0  # the first window must not start before row 0
    # a window running past the data would be shorter than lookback
    assert end <= data.shape[0]

    n_features = data.shape[1]
    n_points = end - start

    X = np.zeros((n_points, lookback, n_features), dtype=data.dtype)

    # t is the (excluded) end of each window, hence the shifted range
    for i, t in enumerate(range(start + 1, end + 1)):
        X[i, :, :] = data[t - lookback:t, :]

    return X


def rolling_window_xyd(data: Union[pd.Series, pd.DataFrame],
                      targets: List[pd.Series],
                      start: int,
                      end: int,
                      lookback: int) -> Tuple[np.ndarray, List[np.ndarray], pd.Series]:
    """
    Create X, y and dates in a single shot.
    The returned dates are relative to the y array(s).

    Each window of X ends at position t - 1 of data and is paired with the
    target values (and date) at position t, for t in start + 1 .. end.

    When data is indexed by dates, every target is first aligned on the data
    index, so that positions in data and in the targets refer to the same
    date even if the targets cover more dates than data (for instance because
    rows with missing features were dropped from data only).

    Parameters
    ----------
    data: pd.Series or pd.DataFrame
        one row per time point, one column per feature

    targets: list of pd.Series
        target series, each one indexed by a pandas DatetimeIndex

    start, end, lookback: int
        see rolling_window; positions refer to the rows of data

    Returns
    -------
    X: np.ndarray of shape (end - start, lookback, n_features)
    y: list of np.ndarray, one array of shape (end - start,) per target
    dates: pd.Series with the dates of the targets

    Raises
    ------
    TypeError
        if data or targets have the wrong type
    ValueError
        if a target does not contain all the dates of data
    """
    if isinstance(data, pd.Series):
        my_data = data.values.reshape(-1, 1)
    elif isinstance(data, pd.DataFrame):
        my_data = data.values
    else:
        raise TypeError("data should be a pandas Series or Dataframe")

    X = rolling_window(my_data, start, end, lookback)

    if not isinstance(targets, list):
        raise TypeError("target must be a list of pandas Series")
    if not all(isinstance(t, pd.Series) for t in targets):
        raise TypeError("all targets should be pandas Series")
    if not all(isinstance(t.index, pd.DatetimeIndex) for t in targets):
        raise TypeError("index of target should be a pandas DatetimeIndex")

    # Align the targets on the dates of data; data without a DatetimeIndex
    # keeps the purely positional pairing.
    aligned_targets = []
    for t in targets:
        if isinstance(data.index, pd.DatetimeIndex) and not t.index.equals(data.index):
            missing_dates = data.index.difference(t.index)
            if len(missing_dates) > 0:
                raise ValueError(
                    f"target is missing {len(missing_dates)} of the dates present in data"
                )
            t = t.reindex(data.index)
        aligned_targets.append(t)

    y = [t.values[start + 1:end + 1] for t in aligned_targets]
    dates = pd.Series(data=aligned_targets[0].index[start + 1: end + 1])

    return X, y, dates


# TESTED: works
def create_Xyd(returns: Union[pd.Series, pd.DataFrame],
               extremes: pd.Series,
               directions: pd.Series,
               lookback: int) -> Tuple[
    np.ndarray, np.ndarray, List[np.ndarray], List[np.ndarray], pd.Series, pd.Series
]:
    """
    Create the X, y and dates arrays for the ANN.

    The positions of three dates of ``split_dates`` in ``returns.index`` delimit
    four consecutive segments: train (from the first usable window up to
    'subprime-crisis-halfway'), test (up to 'eu-debt-halfway'), train (up to
    'last_train') and test (up to the last row). The two train segments and the
    two test segments are concatenated.

    Returns
    -------
    X_train, X_test: np.ndarray
        rolling windows, as built by rolling_window_xyd
    y_train, y_test: list of np.ndarray
        [extremes, directions] targets for each set
    dates_train, dates_test:
        dates of the targets, as arrays of values
    """
    positions = returns.index
    first_test_start = positions.get_loc(split_dates['subprime-crisis-halfway'])
    first_test_end = positions.get_loc(split_dates['eu-debt-halfway'])
    second_test_start = positions.get_loc(split_dates['last_train'])

    def build_segment(segment_start: int, segment_end: int):
        """Windows, targets and dates for one contiguous segment."""
        return rolling_window_xyd(
            returns,
            [extremes, directions],
            start=segment_start,
            end=segment_end,
            lookback=lookback
        )

    def stitch(first_segment, second_segment):
        """Concatenate two segments and check that the sizes are consistent."""
        X_first, y_first, dates_first = first_segment
        X_second, y_second, dates_second = second_segment

        X_joined = np.concatenate([X_first, X_second])
        y_joined = [
            np.concatenate([y_first[k], y_second[k]])
            for k in range(len(y_first))
        ]
        dates_joined = pd.concat(
            [dates_first, dates_second], axis=0, ignore_index=True
        ).values

        assert X_joined.shape[0] == dates_joined.shape[0]
        assert all(yy.shape[0] == X_joined.shape[0] for yy in y_joined)
        return X_joined, y_joined, dates_joined

    # TRAIN: the very first segment starts at the first position with a full window
    train_first = build_segment(lookback - 1, first_test_start)
    train_second = build_segment(first_test_end, second_test_start)
    assert len(train_first[1]) == len(train_second[1])
    X_train, y_train, dates_train = stitch(train_first, train_second)

    # TEST: the last segment runs up to the last available row
    test_first = build_segment(first_test_start, first_test_end)
    test_second = build_segment(second_test_start, returns.shape[0] - 1)
    X_test, y_test, dates_test = stitch(test_first, test_second)

    return X_train, X_test, y_train, y_test, dates_train, dates_test


def split_stratified(X: np.ndarray,
                     y: List[np.ndarray],
                     dates: np.ndarray,
                     test_size=0.2,
                     random_state=rs,
                     verbose=False):
    """
    Split a dataset in a stratified fashion on the target variable y[0].

    A single random StratifiedShuffleSplit is drawn on y[0]; the resulting
    indices are applied to X, to every array in y and to dates.

    Returns
    -------
    X_train, X_validation, y_train, y_validation, dates_train, dates_validation
    """
    assert X.ndim == 3

    if verbose:
        # class counts of every target, before splitting
        for position, target in enumerate(y):
            classes, class_counts = np.unique(target, return_counts=True)
            for cls, count in zip(classes, class_counts):
                print(f"y[{position}] has {count} elements of class {cls}")

    # only the number of samples matters to the splitter, so a dummy
    # placeholder is passed instead of the 3-D data
    placeholder = np.zeros(X.shape[0], dtype=np.int8)
    shuffle_split = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    train_index, test_index = next(shuffle_split.split(placeholder, y[0]))

    return (
        X[train_index],
        X[test_index],
        [target[train_index] for target in y],
        [target[test_index] for target in y],
        dates[train_index],
        dates[test_index],
    )


def oversample_mtl(X: np.ndarray, y: List[np.ndarray], random_state=rs, dt=np.float32):
    """Balance a multi-target dataset by random oversampling driven by y[0].

    The RandomOverSampler is run on the sample positions (not on the 3-D data)
    using y[0] as the class label; the selected positions are then used to
    pick the rows of X and of every target in y.

    Returns
    -------
    X_resampled: np.ndarray
    y_resampled: list of np.ndarray
    """
    assert X.dtype == dt
    assert X.ndim == 3
    assert isinstance(y, list) and all(yy.ndim == 1 for yy in y) and all(yy.dtype == dt for yy in y)

    # resample the row positions, one column as required by the sampler
    n_rows = X.shape[0]
    row_positions = np.arange(n_rows).reshape(n_rows, 1)
    sampler = RandomOverSampler(random_state=random_state)
    picked_positions, _ = sampler.fit_resample(row_positions, y[0])
    picked = picked_positions.flatten()

    return X[picked], [target[picked] for target in y]

## 3. Addestramento reti

Creo delle funzioni che mi aiutino ad addestrare e valutare i diversi modelli:

In [0]:
def create_model_mtl(space: Dict[str, Any],
                     bptt: int,
                     n_features: int) -> keras.models.Model:
    """Create a model using the parameters in the search space.

    The network is a stack of 1 to 3 LSTM layers followed by two softmax
    heads sharing the same representation: 'extreme' (extreme return yes/no)
    and 'up_down' (direction of the move).

    Parameters
    ----------
    space: dict
        search space sample; the keys read are space['layers']['input_dropout']
        and space['layers']['num_layers'] with 'how_many' (1, 2 or 3) and, for
        deeper stacks, 'n_cells_2' / 'n_cells_3'

    bptt: int
        number of time steps of each input window

    n_features: int
        number of input features; it is also the size of the first LSTM layer

    Returns
    -------
    model: keras.models.Model
        the (uncompiled) model, with outputs [extreme, up_down]
    """
    layers_cfg = space['layers']

    input_dropout = float(layers_cfg['input_dropout'])
    assert input_dropout >= 0.0 and input_dropout <= 1.0

    depth_cfg = layers_cfg['num_layers']
    n_layers = int(depth_cfg['how_many'])
    # up to 3 layers are implemented below (the previous bound of 2 made the
    # 3-layer branch unreachable)
    assert n_layers <= 3 and n_layers > 0

    # build the model
    model_input = keras.Input(shape=(bptt, n_features), name='model_input')

    if n_layers == 1:
        # dropout=0.0 is the LSTM default, so no special case is needed for it
        x = keras.layers.LSTM(n_features, dropout=input_dropout)(model_input)
    elif n_layers == 2:
        # NOTE(review): input_dropout is applied only in the single-layer
        # configuration - confirm that this is intended
        n_cells_2 = int(depth_cfg['n_cells_2'])
        x = keras.layers.LSTM(n_features, return_sequences=True)(model_input)
        x = keras.layers.LSTM(n_cells_2)(x)
    else:
        n_cells_2 = int(depth_cfg['n_cells_2'])
        n_cells_3 = int(depth_cfg['n_cells_3'])
        x = keras.layers.LSTM(n_features, return_sequences=True)(model_input)
        x = keras.layers.LSTM(n_cells_2, return_sequences=True)(x)
        x = keras.layers.LSTM(n_cells_3)(x)

    output_is_extreme = keras.layers.Dense(
        2, activation='softmax', name='extreme')(x)
    output_is_up_down = keras.layers.Dense(
        2, activation='softmax', name='up_down')(x)

    model = keras.Model(
        inputs=model_input,
        outputs=[output_is_extreme, output_is_up_down],
        name='MTL_model')

    return model

Creo una funzione che crea ed addestra il modello che voglio:

In [0]:
def train_model(space,
                max_epochs: int,
                data: Union[pd.Series, pd.DataFrame],
                extremes: pd.Series,
                directions: pd.Series,
                verbose: int):
    """Train a model with the supplied parameters

    The data are cut in windows of space['bptt'] time points and divided in
    train+validation and test sets (create_Xyd); train and validation are then
    separated with a stratified split and the training set alone is balanced
    by oversampling. The model is fitted with early stopping on the
    validation loss.

    Parameters
    ----------
    space:
        hyperopt search space sample; the keys read here are 'bptt',
        'optimizer' ('name' and, for adam, 'lr'), 'early_stop' ('min_delta',
        'patience') and, optionally, 'use_class_weight'; 'layers' is read by
        create_model_mtl

    max_epochs: int
        number of max epochs to run the model for

    data: pd.Series of shape (n_timepoints,), or pd.DataFrame of shape (n_timepoints, n_features)
        data containing returns, volume and all other things, where every row is
        a timepoint and every column a different feature

    extremes: pd.Series of shape (n_timepoints,)
        target for the extremes, binary 1/0,

    directions: pd.Series of shape (n_timepoints,)
        target for the directions

    verbose: int
        verbosity for Keras

    Returns
    -------
    model: keras.Model
        the trained model
    history: keras.history
        training history of the model

    X_train_bal: np.ndarray
        balanced training set

    X_validation: np.ndarray
        the validation set

    X_test: np.ndarray
        the test set

    y_train_bal: list of np.ndarray
        balanced targets for the training set

    y_validation: list of np.ndarray
        targets for the validation set

    y_test: list of np.ndarray
        targets for the test set
    """
    if data.ndim == 1:
        n_features = 1
    else:
        n_features = data.shape[1]

    lookback = int(space['bptt'])
    # one batch as large as the number of time points in data
    batch_size = data.shape[0]

    # 1. build the dataset for this lookback
    X_trv, X_test, y_trv, y_test, dates_trv, dates_test = create_Xyd(
        data.astype(np.float32),
        extremes.astype(np.float32),
        directions.astype(np.float32),
        lookback=lookback
    )

    # split in train and validation
    X_train, X_validation, y_train, y_validation, dates_train, dates_validation = split_stratified(
        X_trv,
        y_trv,
        dates_trv,
        test_size=0.2,
        verbose=True
    )

    # 2. balance by oversampling the minority class (1)
    X_train_bal, y_train_bal = oversample_mtl(X_train, y_train)  # bal = balanced

    # 3. training targets, one-hot encoded for the two softmax heads
    if space.get('use_class_weight', False):
        # the flag is accepted for compatibility with the search space, but no
        # class weights are passed to the fit below
        print("'use_class_weight' is set, but class weights are NOT applied by train_model")

    y_train_bal_cat = [keras.utils.to_categorical(yy, num_classes=2) for yy in y_train_bal]
    y_validation_cat = [keras.utils.to_categorical(yy, num_classes=2) for yy in y_validation]

    # 4. check the optimizer name
    optimizer_name = space['optimizer']['name']
    assert optimizer_name in {'adam', 'adadelta'}

    # 5. create the model
    model = create_model_mtl(space, lookback, n_features)

    # 5.1 create the optimizer
    if optimizer_name == 'adam':
        # NOTE(review): `lr` is the historical name of this argument; recent
        # Keras versions call it `learning_rate` - adapt it to the installed version
        optimizer = keras.optimizers.Adam(lr=space['optimizer']['lr'])
    elif optimizer_name == 'adadelta':
        optimizer = 'adadelta'
    else:
        raise ValueError(f"Invalid optimizer name {optimizer_name}")

    model.summary()

    # 5.2 compile the model, same loss for both outputs
    model.compile(
        optimizer=optimizer,
        loss=['categorical_crossentropy', 'categorical_crossentropy'],
    )

    # 5.3 early stopping parameters
    min_delta = float(space['early_stop']['min_delta'])
    patience = int(space['early_stop']['patience'])

    early_stop_cb = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=min_delta,
        patience=patience,
        restore_best_weights=True)

    # 5.4 training
    print("Fitting model")
    history: keras.callbacks.History = model.fit(  # type: ignore
        x=X_train_bal,
        y=y_train_bal_cat,
        epochs=max_epochs,
        batch_size=batch_size,
        validation_data=(X_validation, y_validation_cat),
        callbacks=[early_stop_cb],
        shuffle=True,
        verbose=verbose
    )

    return model, history, X_train_bal, X_validation, X_test, y_train_bal, y_validation, y_test

creo anche i dict dove salvare i risultati, mentre le funzioni di valutazione delle performance le metto alla fine, nel punto 4.

In [0]:
# Result containers: every dictionary has one (initially empty) entry per
# impact type, later filled per stock type.
ann_probabilities = dict()
for imp_type in impact_type:
    ann_probabilities[imp_type] = dict()

# Hyper-parameter spaces used for the final training.
best_spaces = copy.deepcopy(ann_probabilities)

# Independent deep copies, so that the containers do not share inner dictionaries.
y_true = copy.deepcopy(ann_probabilities)
recalls = copy.deepcopy(ann_probabilities)
precisions = copy.deepcopy(ann_probabilities)
fprs = copy.deepcopy(ann_probabilities)
ksss = copy.deepcopy(ann_probabilities)
losses = copy.deepcopy(ann_probabilities)
utilities = copy.deepcopy(ann_probabilities)

### 3.1 Azione con minima volatilità - NO sentiment - rete migliore

Cominciamo con l'azione meno volatile.

La migliore combinazione di iperparametri è:

- 19 neuroni di entrata
- optimizer: Adadelta
- early stopping con parametri:
    - patience: 14
    - min delta: 6e-4
- dropout in input con rate 0.07024542157284551
- lookback: 53

In [0]:
# Experiment selection: least volatile stock, without sentiment features.
s_type = 'min_vol'
imp_type = 'no_sentiment'  # one of 'no_sentiment', 'with_sentiment'
# Output path of the trained model and input path of the best hyper-parameters.
model_path = os.path.join(base_path, 'results', 'models', f"final_model_{s_type}_{imp_type}.h5")
best_filename = os.path.join(base_path, 'results', f"best_{s_type}_{imp_type}.pickle")

# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open(best_filename, 'rb') as infile:
    best = pickle.load(infile)

print(f"MODEL path: {model_path}")
print(f"BEST path: {best_filename}")
print(f"BEST: {best}")

# Rebuild the nested search-space structure expected by train_model and
# create_model_mtl from the flat dictionary of best values.
best_spaces[imp_type][s_type] = {
    'bptt': best['bptt_len'],
    'early_stop': {
        'min_delta': best['early_stop_min_delta'],
        'patience': best['early_stop_patience']
    },
    'layers': {
        'input_dropout': best['dropout_kill_rate'],
        # single LSTM layer: the n_cells_* entries are not read in this configuration
        'num_layers': {
            'how_many': 1,
            'n_cells_1': 0,
            'n_cells_2': 0
        }
    },
    'optimizer': {
        'name': 'adadelta'
    },
    'use_class_weight': False
}

Ora creo i dati per questo esperimento

In [0]:
# Rebuild the features and the datasets WITHOUT the sentiment impact columns.
# NOTE(review): this cell repeats the earlier feature/dataset construction with
# the sentiment parts commented out; a shared function with a flag would avoid
# the duplication.
feature_names = [
    'adx', 'aroon_down', 'aroon_up', 'atr', 'bb_lower', 'bb_middle', 'bb_upper',
    'cci', 'cmo', 'ema5', 'ema10', 'ema15', 'macd', 'rsi', 'sma5', 'sma10', 'sma15',
]

feature_paths = [os.path.join(ta_dir, name + '.h5') for name in feature_names]

# features[stock type][feature name] -> Series for that stock
features = dict()
first_allowable_dates = dict()  # dates from which features and returns can be taken

# Features rescaled with `standardize` (scaler fitted on the training period).
to_standardize = {
    'sma5', 'sma10', 'sma15',
    'ema5', 'ema10', 'ema15',
    'macd',
    'bb_lower', 'bb_middle', 'bb_upper',
    'roc', 'atr', 'cci', 'adx',
    }

# Features simply divided by a constant (feature name -> divisor).
to_divide = {
    'rsi': 100.0,
    'aroon_down': 100.0,
    'aroon_up': 100.0,
    'cmo': 100.0,
}

# The loop variable is st_type (not s_type), so the experiment selection made
# in the previous cell is not overwritten.
for st_type, s_code in stock_codes.items():
    print(f"Stock type: {st_type}")
    print("-"*30)
    features[st_type] = dict()

    for feature_name, feature_path in zip(feature_names, feature_paths):
        feature = pd.read_hdf(feature_path)

        if feature_name in to_standardize:
            print(f"standardizing {feature_name}")
            feature_transformed = standardize(feature.loc[:, s_code])
            features[st_type][feature_name] = feature_transformed
        elif feature_name in to_divide.keys():
            print(f"dividing {feature_name}")
            features[st_type][feature_name] = feature.loc[:, s_code] / to_divide[feature_name]
        else:
            # every feature must have an explicit scaling rule
            raise ValueError(f"unknown feature {feature_name}")

    # positive and negative impact: disabled for the no-sentiment experiment
#    print("adding positive and negative sentiment impact")
#    features[st_type]['pos_impact'] = impact_ts_no_nan.loc[:, ('PosImpact', s_code)]
#    features[st_type]['neg_impact'] = impact_ts_no_nan.loc[:, ('NegImpact', s_code)]   
    
    print("-" * 30)
    print("")


# Per-stock containers for the model inputs and targets.
extremes_all = dict()  # keys: s_type, q_type
data_all = dict()  # keys: s_type
volumes = dict()  # keys: s_type
directions_all = dict()  # keys: s_type

for st_type, s_code in stock_codes.items():
    # log-returns: raw (for the extremes) and standardized (as a feature)
    lr = log_returns.loc[:, s_code]
    lr_transformed = standardize(lr)

    # traded volumes, standardized
    stock_volume = volume_ts_no_nan.loc[:, s_code]
    volume_transformed = standardize(stock_volume)
    volumes[st_type] = volume_transformed

    # returns, volume and technical features (sentiment impacts left out)
    all_features = [lr_transformed, volume_transformed] + \
              [features[st_type][name] for name in feature_names] #+ \
              #[features[st_type]['pos_impact'], features[st_type]['neg_impact']]

    # all the features in a single DataFrame, one column per feature
    tmp_df = pd.concat(
        all_features,
        axis=1,
        keys=['log_return', 'volume'] + feature_names #+ ['pos_impact', 'neg_impact']
    )

    # drop every date on which at least one feature is missing
    tmp_df = tmp_df.dropna(axis='index', how='any')
    
    data_all[st_type] = tmp_df
    extremes_all[st_type] = dict()
    
    # a day is extreme when the raw log-return reaches the positive threshold
    # or falls to (minus) the negative threshold
    ext = np.logical_or(
        lr >= thresholds[st_type]['pos'][q_type],
        lr <= -thresholds[st_type]['neg'][q_type],
    )
    
    # NOTE(review): extremes and directions keep the full log_returns dates,
    # while data_all had the rows with missing values dropped.
    extremes_all[st_type][q_type] = pd.Series(data=ext, index=log_returns.index)
    
    # directions: 1 when the price increased with respect to the previous day, 0 otherwise
    direction = (directions_ts_no_nan.loc[:, s_code] > 0.0).astype(np.int8)
    directions_all[st_type] = direction
addestro quindi il modello

In [0]:
# Train the final model for the experiment selected by (imp_type, s_type).
model, history, X_train_bal, X_validation, X_test, y_train_bal, y_validation, y_test = train_model(
    best_spaces[imp_type][s_type],
    MAX_EPOCHS,
    data_all[s_type],
    extremes_all[s_type][q_type],
    directions_all[s_type],
    verbose=2
)

# NOTE(review): the destination folders ('results/models' and 'datasets' under
# base_path) are not created here - presumably they already exist; verify.
print(f"\n\nSaving model to {model_path}")
model.save(model_path)

# save what is needed for the comparison in a numpy file
npz_path = os.path.join(base_path, 'datasets', f'final_{s_type}_{imp_type}.npz')
print(f"Saving data to {npz_path}")
# the y_* entries are lists of arrays (extremes, directions)
np.savez(npz_path, **{
    'X_train': X_train_bal,
    'X_validation': X_validation,
    'X_test': X_test,
    'y_train': y_train_bal,
    'y_validation': y_validation,
    'y_test': y_test,
})

print("Adding to y_true")
# store the true targets and the predicted probabilities of this experiment
y_true[imp_type][s_type] = {
    'train': y_train_bal,
    'validation': y_validation,
    'test': y_test
}

print("Computing probabilities...")
# each set is predicted in a single batch as large as the set itself
ann_probabilities[imp_type][s_type] = {
    'train': model.predict(X_train_bal, batch_size=X_train_bal.shape[0]),
    'validation': model.predict(X_validation, batch_size=X_validation.shape[0]),
    'test': model.predict(X_test, batch_size=X_test.shape[0]),
}
print(f"Done for '{imp_type}' with stock '{s_type}'")

### 3.2 Azione con massima volatilità - NO sentiment - rete migliore

Passiamo ora all'azione più volatile.

La migliore combinazione di iperparametri è:

- 19 neuroni di entrata
- optimizer: Adadelta
- early stopping con parametri:
    - patience: 19
    - min delta: 6e-4
- dropout in input con rate 0.08215272160724836
- lookback: 34

In [0]:
# Experiment selection: most volatile stock, without sentiment features.
s_type = 'max_vol'
imp_type = 'no_sentiment'  # one of 'no_sentiment', 'with_sentiment'
# Output path of the trained model and input path of the best hyper-parameters.
model_path = os.path.join(base_path, 'results', 'models', f"final_model_{s_type}_{imp_type}.h5")
best_filename = os.path.join(base_path, 'results', f"best_{s_type}_{imp_type}.pickle")

# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open(best_filename, 'rb') as infile:
    best = pickle.load(infile)

print(f"MODEL path: {model_path}")
print(f"BEST path: {best_filename}")
print(f"BEST: {best}")

# Rebuild the nested search-space structure expected by train_model and
# create_model_mtl from the flat dictionary of best values.
best_spaces[imp_type][s_type] = {
    'bptt': best['bptt_len'],
    'early_stop': {
        'min_delta': best['early_stop_min_delta'],
        # NOTE(review): the patience is fixed to 30 here instead of using the
        # optimized value best['early_stop_patience']
        'patience': 30, #best['early_stop_patience']
    },
    'layers': {
        'input_dropout': best['dropout_kill_rate'],
        # single LSTM layer: the n_cells_* entries are not read in this configuration
        'num_layers': {
            'how_many': 1,
            'n_cells_1': 0,
            'n_cells_2': 0
        }
    },
    'optimizer': {
        'name': 'adadelta'
    },
    'use_class_weight': False
}

In [0]:
# Build, for every stock in stock_codes, the input feature table (WITHOUT sentiment columns),
# the extreme-return labels and the direction labels used by train_model.

# Technical indicators; each one is read from '<ta_dir>/<name>.h5'.
feature_names = [
    'adx', 'aroon_down', 'aroon_up', 'atr', 'bb_lower', 'bb_middle', 'bb_upper',
    'cci', 'cmo', 'ema5', 'ema10', 'ema15', 'macd', 'rsi', 'sma5', 'sma10', 'sma15',
]

feature_paths = [os.path.join(ta_dir, name + '.h5') for name in feature_names]

features = dict()
# dates from which features and returns can be taken (initialised but not filled in this cell)
first_allowable_dates = dict()  # date in cui posso prendere le feature e i returns

# Indicators passed through standardize(); 'roc' is listed here but is not in feature_names.
to_standardize = {
    'sma5', 'sma10', 'sma15',
    'ema5', 'ema10', 'ema15',
    'macd',
    'bb_lower', 'bb_middle', 'bb_upper',
    'roc', 'atr', 'cci', 'adx',
    }

# Indicators divided by a constant instead of being standardized.
to_divide = {
    'rsi': 100.0,
    'aroon_down': 100.0,
    'aroon_up': 100.0,
    'cmo': 100.0,
}

for st_type, s_code in stock_codes.items():
    print(f"Stock type: {st_type}")
    print("-"*30)
    features[st_type] = dict()

    for feature_name, feature_path in zip(feature_names, feature_paths):
        # the indicator file is re-read for every stock; only column s_code is kept
        feature = pd.read_hdf(feature_path)

        if feature_name in to_standardize:
            print(f"standardizing {feature_name}")
            feature_transformed = standardize(feature.loc[:, s_code])
            features[st_type][feature_name] = feature_transformed
        elif feature_name in to_divide.keys():
            print(f"dividing {feature_name}")
            features[st_type][feature_name] = feature.loc[:, s_code] / to_divide[feature_name]
        else:
            # every feature must belong to exactly one of the two groups above
            raise ValueError(f"unknown feature {feature_name}")

    # positive and negative sentiment impact: disabled in this no-sentiment cell
#    print("adding positive and negative sentiment impact")
#    features[st_type]['pos_impact'] = impact_ts_no_nan.loc[:, ('PosImpact', s_code)]
#    features[st_type]['neg_impact'] = impact_ts_no_nan.loc[:, ('NegImpact', s_code)]   
    
    print("-" * 30)
    print("")


extremes_all = dict()  # keys: s_type, q_type
data_all = dict()  # keys: s_type
volumes = dict()  # keys: s_type
directions_all = dict()  # keys: s_type

for st_type, s_code in stock_codes.items():
    # log returns of the stock
    lr = log_returns.loc[:, s_code]
    lr_transformed = standardize(lr)

    # traded volumes
    stock_volume = volume_ts_no_nan.loc[:, s_code]
    volume_transformed = standardize(stock_volume)
    volumes[st_type] = volume_transformed

    # technical features (the sentiment columns are commented out here)
    all_features = [lr_transformed, volume_transformed] + \
              [features[st_type][name] for name in feature_names] #+ \
              #[features[st_type]['pos_impact'], features[st_type]['neg_impact']]

    # all the features side by side in a single DataFrame, one column per key
    tmp_df = pd.concat(
        all_features,
        axis=1,
        keys=['log_return', 'volume'] + feature_names #+ ['pos_impact', 'neg_impact']
    )

    # keep only the rows where every feature is available
    tmp_df = tmp_df.dropna(axis='index', how='any')
    
    data_all[st_type] = tmp_df
    extremes_all[st_type] = dict()
    
    # A day is extreme when the raw (non-standardized) log return is at or above the positive
    # threshold, or at or below minus the negative threshold, for quantile q_type.
    ext = np.logical_or(
        lr >= thresholds[st_type]['pos'][q_type],
        lr <= -thresholds[st_type]['neg'][q_type],
    )
    
    extremes_all[st_type][q_type] = pd.Series(data=ext, index=log_returns.index)
    
    # directions: 1 where the value in directions_ts_no_nan is strictly positive, else 0
    direction = (directions_ts_no_nan.loc[:, s_code] > 0.0).astype(np.int8)
    directions_all[st_type] = direction

In [0]:
# Train the final network for the current (imp_type, s_type) with the selected hyper-parameters.
# train_model is defined elsewhere in the notebook; it returns the model, its history and the splits.
model, history, X_train_bal, X_validation, X_test, y_train_bal, y_validation, y_test = train_model(
    best_spaces[imp_type][s_type],
    MAX_EPOCHS,
    data_all[s_type],
    extremes_all[s_type][q_type],
    directions_all[s_type],
    verbose=2
)

print(f"\n\nSaving model to {model_path}")
model.save(model_path)

# save what is needed for the later comparison into a numpy .npz archive
npz_path = os.path.join(base_path, 'datasets', f'final_{s_type}_{imp_type}.npz')
print(f"Saving data to {npz_path}")
np.savez(npz_path, **{
    'X_train': X_train_bal,
    'X_validation': X_validation,
    'X_test': X_test,
    'y_train': y_train_bal,
    'y_validation': y_validation,
    'y_test': y_test,
})

print("Adding to y_true")
# store the ground-truth labels; ann_probabilities is filled in right below
y_true[imp_type][s_type] = {
    'train': y_train_bal,
    'validation': y_validation,
    'test': y_test
}

print("Computing probabilities...")
# batch_size is set to the number of samples, so each split is predicted in a single batch
ann_probabilities[imp_type][s_type] = {
    'train': model.predict(X_train_bal, batch_size=X_train_bal.shape[0]),
    'validation': model.predict(X_validation, batch_size=X_validation.shape[0]),
    'test': model.predict(X_test, batch_size=X_test.shape[0]),
}
print(f"Done for '{imp_type}' with stock '{s_type}'")

### 3.3 Azione con minima volatilità - WITH sentiment - rete migliore

Cominciamo con l'azione meno volatile.

La migliore combinazione di iperparametri è:

- 21 neuroni di entrata (19 + 2 sentiment)
- optimizer: Adam, con lr: 0.008646840787170844
- early stopping con parametri:
    - patience: 21
    - min delta: 0.0078000000000000005
- dropout in input con rate 0.031203212683827974
- lookback: 75

In [0]:
# Select the stock / sentiment combination handled by the following cells.
s_type = 'min_vol'
imp_type = 'with_sentiment'  # one of 'no_sentiment', 'with_sentiment'
model_path = os.path.join(base_path, 'results', 'models', f"final_model_{s_type}_{imp_type}.h5")
best_filename = os.path.join(base_path, 'results', f"best_{s_type}_{imp_type}_2_layers.pickle")

# Load the pickled result of the hyper-parameter search for this combination.
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
with open(best_filename, 'rb') as infile:
    best = pickle.load(infile)

print(f"MODEL path: {model_path}")
print(f"BEST path: {best_filename}")
print(f"BEST: {best}")

# Rebuild the nested search-space dictionary passed to train_model from the flat `best` dict.
best_spaces[imp_type][s_type] = {
    'bptt': best['bptt_len'],
    'early_stop': {
        'min_delta': best['early_stop_min_delta'],
        'patience': best['early_stop_patience'],
    },
    'layers': {
        'input_dropout': best['dropout_kill_rate'],
        'num_layers': {
            'how_many': 2,
            # first layer size is hard-coded; only the second one comes from the search result
            'n_cells_1': 21,
            'n_cells_2': best['number_of_cells_2'],
        }
    },
    # NOTE(review): the section text lists Adam with a learning rate, while 'adadelta'
    # is configured here - confirm which optimizer is intended.
    'optimizer': {
        'name': 'adadelta',
    },
    'use_class_weight': False
}

In [0]:
# Build, for every stock in stock_codes, the input feature table (WITH the two sentiment
# columns), the extreme-return labels and the direction labels used by train_model.

# Technical indicators; each one is read from '<ta_dir>/<name>.h5'.
feature_names = [
    'adx', 'aroon_down', 'aroon_up', 'atr', 'bb_lower', 'bb_middle', 'bb_upper',
    'cci', 'cmo', 'ema5', 'ema10', 'ema15', 'macd', 'rsi', 'sma5', 'sma10', 'sma15',
]

feature_paths = [os.path.join(ta_dir, name + '.h5') for name in feature_names]

features = dict()
# dates from which features and returns can be taken (initialised but not filled in this cell)
first_allowable_dates = dict()  # date in cui posso prendere le feature e i returns

# Indicators passed through standardize(); 'roc' is listed here but is not in feature_names.
to_standardize = {
    'sma5', 'sma10', 'sma15',
    'ema5', 'ema10', 'ema15',
    'macd',
    'bb_lower', 'bb_middle', 'bb_upper',
    'roc', 'atr', 'cci', 'adx',
    }

# Indicators divided by a constant instead of being standardized.
to_divide = {
    'rsi': 100.0,
    'aroon_down': 100.0,
    'aroon_up': 100.0,
    'cmo': 100.0,
}

for st_type, s_code in stock_codes.items():
    print(f"Stock type: {st_type}")
    print("-"*30)
    features[st_type] = dict()

    for feature_name, feature_path in zip(feature_names, feature_paths):
        # the indicator file is re-read for every stock; only column s_code is kept
        feature = pd.read_hdf(feature_path)

        if feature_name in to_standardize:
            print(f"standardizing {feature_name}")
            feature_transformed = standardize(feature.loc[:, s_code])
            features[st_type][feature_name] = feature_transformed
        elif feature_name in to_divide.keys():
            print(f"dividing {feature_name}")
            features[st_type][feature_name] = feature.loc[:, s_code] / to_divide[feature_name]
        else:
            # every feature must belong to exactly one of the two groups above
            raise ValueError(f"unknown feature {feature_name}")

    # positive and negative sentiment impact, taken as-is (no standardization or scaling here)
    print("adding positive and negative sentiment impact")
    features[st_type]['pos_impact'] = impact_ts_no_nan.loc[:, ('PosImpact', s_code)]
    features[st_type]['neg_impact'] = impact_ts_no_nan.loc[:, ('NegImpact', s_code)]   
    
    print("-" * 30)
    print("")


extremes_all = dict()  # keys: s_type, q_type
data_all = dict()  # keys: s_type
volumes = dict()  # keys: s_type
directions_all = dict()  # keys: s_type

for st_type, s_code in stock_codes.items():
    # log returns of the stock
    lr = log_returns.loc[:, s_code]
    lr_transformed = standardize(lr)

    # traded volumes
    stock_volume = volume_ts_no_nan.loc[:, s_code]
    volume_transformed = standardize(stock_volume)
    volumes[st_type] = volume_transformed

    # technical features followed by the two sentiment columns
    all_features = [lr_transformed, volume_transformed] + \
              [features[st_type][name] for name in feature_names] + \
              [features[st_type]['pos_impact'], features[st_type]['neg_impact']]

    # all the features side by side in a single DataFrame, one column per key
    tmp_df = pd.concat(
        all_features,
        axis=1,
        keys=['log_return', 'volume'] + feature_names + ['pos_impact', 'neg_impact']
    )

    # keep only the rows where every feature is available
    tmp_df = tmp_df.dropna(axis='index', how='any')
    
    data_all[st_type] = tmp_df
    extremes_all[st_type] = dict()
    
    # A day is extreme when the raw (non-standardized) log return is at or above the positive
    # threshold, or at or below minus the negative threshold, for quantile q_type.
    ext = np.logical_or(
        lr >= thresholds[st_type]['pos'][q_type],
        lr <= -thresholds[st_type]['neg'][q_type],
    )
    
    extremes_all[st_type][q_type] = pd.Series(data=ext, index=log_returns.index)
    
    # directions: 1 where the value in directions_ts_no_nan is strictly positive, else 0
    direction = (directions_ts_no_nan.loc[:, s_code] > 0.0).astype(np.int8)
    directions_all[st_type] = direction

In [0]:
# Train the final network for the current (imp_type, s_type) with the selected hyper-parameters.
# train_model is defined elsewhere in the notebook; it returns the model, its history and the splits.
model, history, X_train_bal, X_validation, X_test, y_train_bal, y_validation, y_test = train_model(
    best_spaces[imp_type][s_type],
    MAX_EPOCHS,
    data_all[s_type],
    extremes_all[s_type][q_type],
    directions_all[s_type],
    verbose=2
)

print(f"\n\nSaving model to {model_path}")
model.save(model_path)

# save what is needed for the later comparison into a numpy .npz archive
npz_path = os.path.join(base_path, 'datasets', f'final_{s_type}_{imp_type}.npz')
print(f"Saving data to {npz_path}")
np.savez(npz_path, **{
    'X_train': X_train_bal,
    'X_validation': X_validation,
    'X_test': X_test,
    'y_train': y_train_bal,
    'y_validation': y_validation,
    'y_test': y_test,
})

print("Adding to y_true")
# store the ground-truth labels; ann_probabilities is filled in right below
y_true[imp_type][s_type] = {
    'train': y_train_bal,
    'validation': y_validation,
    'test': y_test
}

print("Computing probabilities...")
# batch_size is set to the number of samples, so each split is predicted in a single batch
ann_probabilities[imp_type][s_type] = {
    'train': model.predict(X_train_bal, batch_size=X_train_bal.shape[0]),
    'validation': model.predict(X_validation, batch_size=X_validation.shape[0]),
    'test': model.predict(X_test, batch_size=X_test.shape[0]),
}
print(f"Done for '{imp_type}' with stock '{s_type}'")

### 3.4 Azione con massima volatilità - WITH sentiment - rete migliore

Cominciamo con l'azione più volatile.

La migliore combinazione di iperparametri è:

- 21 neuroni di entrata (19 + 2 sentiment)
- optimizer: Adadelta
- early stopping con parametri:
    - patience: 23
    - min delta: 0.006
- dropout in input con rate 0.05756409954550673
- lookback: 11

In [0]:
# Select the stock / sentiment combination handled by the following cells.
s_type = 'max_vol'
imp_type = 'with_sentiment'  # one of 'no_sentiment', 'with_sentiment'
model_path = os.path.join(base_path, 'results', 'models', f"final_model_{s_type}_{imp_type}.h5")
# NOTE(review): the pickle name ends in '_2_layers' while 'how_many' below is 1 - confirm
# that this is the intended search result for a single-layer network.
best_filename = os.path.join(base_path, 'results', f"best_{s_type}_{imp_type}_2_layers.pickle")

# Load the pickled result of the hyper-parameter search for this combination.
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
with open(best_filename, 'rb') as infile:
    best = pickle.load(infile)

print(f"MODEL path: {model_path}")
print(f"BEST path: {best_filename}")
print(f"BEST: {best}")

# Rebuild the nested search-space dictionary passed to train_model from the flat `best` dict.
best_spaces[imp_type][s_type] = {
    'bptt': best['bptt_len'],
    'early_stop': {
        'min_delta': best['early_stop_min_delta'],
        'patience': best['early_stop_patience'],
    },
    'layers': {
        'input_dropout': best['dropout_kill_rate'],
        'num_layers': {
            'how_many': 1,
            'n_cells_1': 0,
            'n_cells_2': 0
        }
    },
    'optimizer': {
        'name': 'adadelta',
    },
    'use_class_weight': False
}

In [0]:
# Build, for every stock in stock_codes, the input feature table (WITH the two sentiment
# columns), the extreme-return labels and the direction labels used by train_model.

# Technical indicators; each one is read from '<ta_dir>/<name>.h5'.
feature_names = [
    'adx', 'aroon_down', 'aroon_up', 'atr', 'bb_lower', 'bb_middle', 'bb_upper',
    'cci', 'cmo', 'ema5', 'ema10', 'ema15', 'macd', 'rsi', 'sma5', 'sma10', 'sma15',
]

feature_paths = [os.path.join(ta_dir, name + '.h5') for name in feature_names]

features = dict()
# dates from which features and returns can be taken (initialised but not filled in this cell)
first_allowable_dates = dict()  # date in cui posso prendere le feature e i returns

# Indicators passed through standardize(); 'roc' is listed here but is not in feature_names.
to_standardize = {
    'sma5', 'sma10', 'sma15',
    'ema5', 'ema10', 'ema15',
    'macd',
    'bb_lower', 'bb_middle', 'bb_upper',
    'roc', 'atr', 'cci', 'adx',
    }

# Indicators divided by a constant instead of being standardized.
to_divide = {
    'rsi': 100.0,
    'aroon_down': 100.0,
    'aroon_up': 100.0,
    'cmo': 100.0,
}

for st_type, s_code in stock_codes.items():
    print(f"Stock type: {st_type}")
    print("-"*30)
    features[st_type] = dict()

    for feature_name, feature_path in zip(feature_names, feature_paths):
        # the indicator file is re-read for every stock; only column s_code is kept
        feature = pd.read_hdf(feature_path)

        if feature_name in to_standardize:
            print(f"standardizing {feature_name}")
            feature_transformed = standardize(feature.loc[:, s_code])
            features[st_type][feature_name] = feature_transformed
        elif feature_name in to_divide.keys():
            print(f"dividing {feature_name}")
            features[st_type][feature_name] = feature.loc[:, s_code] / to_divide[feature_name]
        else:
            # every feature must belong to exactly one of the two groups above
            raise ValueError(f"unknown feature {feature_name}")

    # positive and negative sentiment impact, taken as-is (no standardization or scaling here)
    print("adding positive and negative sentiment impact")
    features[st_type]['pos_impact'] = impact_ts_no_nan.loc[:, ('PosImpact', s_code)]
    features[st_type]['neg_impact'] = impact_ts_no_nan.loc[:, ('NegImpact', s_code)]   
    
    print("-" * 30)
    print("")


extremes_all = dict()  # keys: s_type, q_type
data_all = dict()  # keys: s_type
volumes = dict()  # keys: s_type
directions_all = dict()  # keys: s_type

for st_type, s_code in stock_codes.items():
    # log returns of the stock
    lr = log_returns.loc[:, s_code]
    lr_transformed = standardize(lr)

    # traded volumes
    stock_volume = volume_ts_no_nan.loc[:, s_code]
    volume_transformed = standardize(stock_volume)
    volumes[st_type] = volume_transformed

    # technical features followed by the two sentiment columns
    all_features = [lr_transformed, volume_transformed] + \
              [features[st_type][name] for name in feature_names] + \
              [features[st_type]['pos_impact'], features[st_type]['neg_impact']]

    # all the features side by side in a single DataFrame, one column per key
    tmp_df = pd.concat(
        all_features,
        axis=1,
        keys=['log_return', 'volume'] + feature_names + ['pos_impact', 'neg_impact']
    )

    # keep only the rows where every feature is available
    tmp_df = tmp_df.dropna(axis='index', how='any')
    
    data_all[st_type] = tmp_df
    extremes_all[st_type] = dict()
    
    # A day is extreme when the raw (non-standardized) log return is at or above the positive
    # threshold, or at or below minus the negative threshold, for quantile q_type.
    ext = np.logical_or(
        lr >= thresholds[st_type]['pos'][q_type],
        lr <= -thresholds[st_type]['neg'][q_type],
    )
    
    extremes_all[st_type][q_type] = pd.Series(data=ext, index=log_returns.index)
    
    # directions: 1 where the value in directions_ts_no_nan is strictly positive, else 0
    direction = (directions_ts_no_nan.loc[:, s_code] > 0.0).astype(np.int8)
    directions_all[st_type] = direction

In [0]:
# Train the final network for the current (imp_type, s_type) with the selected hyper-parameters.
# train_model is defined elsewhere in the notebook; it returns the model, its history and the splits.
model, history, X_train_bal, X_validation, X_test, y_train_bal, y_validation, y_test = train_model(
    best_spaces[imp_type][s_type],
    MAX_EPOCHS,
    data_all[s_type],
    extremes_all[s_type][q_type],
    directions_all[s_type],
    verbose=2
)

print(f"\n\nSaving model to {model_path}")
model.save(model_path)

# save what is needed for the later comparison into a numpy .npz archive
npz_path = os.path.join(base_path, 'datasets', f'final_{s_type}_{imp_type}.npz')
print(f"Saving data to {npz_path}")
np.savez(npz_path, **{
    'X_train': X_train_bal,
    'X_validation': X_validation,
    'X_test': X_test,
    'y_train': y_train_bal,
    'y_validation': y_validation,
    'y_test': y_test,
})

print("Adding to y_true")
# store the ground-truth labels; ann_probabilities is filled in right below
y_true[imp_type][s_type] = {
    'train': y_train_bal,
    'validation': y_validation,
    'test': y_test
}

print("Computing probabilities...")
# batch_size is set to the number of samples, so each split is predicted in a single batch
ann_probabilities[imp_type][s_type] = {
    'train': model.predict(X_train_bal, batch_size=X_train_bal.shape[0]),
    'validation': model.predict(X_validation, batch_size=X_validation.shape[0]),
    'test': model.predict(X_test, batch_size=X_test.shape[0]),
}
print(f"Done for '{imp_type}' with stock '{s_type}'")

## 4. Analisi dei risultati

Analizziamo allora i risultati per confrontarli con quelli delle probabilità.

In [0]:
def loss_function(theta, recall, fpr):
    """Weighted loss L = theta * (1 - recall) + (1 - theta) * fpr.

    ``theta`` must lie in [0, 1]; it weights the miss rate (1 - recall)
    against the false positive rate.
    """
    assert 0.0 <= theta <= 1.0

    miss_rate = 1 - recall
    fpr_weight = 1 - theta
    return theta * miss_rate + fpr_weight * fpr


def utility_function(theta, loss):
    """Utility U = min(theta, 1 - theta) - loss."""
    complement = 1 - theta
    # same tie-breaking as min(theta, complement): theta wins unless complement is strictly smaller
    ceiling = complement if complement < theta else theta
    return ceiling - loss


def to_binary(prob: np.ndarray, thresh: float):
    """Turn probabilities into 0/1 labels (int8): 1 where ``prob >= thresh``.

    ``thresh`` must lie in [0, 1].
    """
    assert 0.0 <= thresh <= 1.0

    at_or_above = prob >= thresh
    return at_or_above.astype(np.int8)


#def recall_fpr_kss_precision(y_true, y_pred):
#    """Compute recall, fpr and KSS score."""
#    tp = np.sum(np.logical_and(y_true, y_pred))
#    tn = np.sum(np.logical_and(
#        np.logical_not(y_true),
#        np.logical_not(y_pred)
#    ))
#    fp = np.sum(np.logical_and(
#        y_pred
#        np.logical_not(y_true),
#    ))
#    fn = np.sum(np.logical_and(
#        y_true,
#        np.logical_not(y_pred)
#    ))
#    
#    recall = tp / (tp + fn)  # TP / (TP + FN)
#    fpr = fp / (fp + tn)  # FP / (FP + TN)
#    precision = tp / (tp + fp)
#    
#    kss = recall - fpr
#    
#    return recall, fpr, kss, precision


def recall_fpr_kss_precision(y_true, y_pred):
    """Compute recall, false positive rate, KSS score and precision for binary labels.

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1 labels with the same number of elements.
        Entries that are neither 0 nor 1 are ignored, as with
        ``confusion_matrix(..., labels=[0, 1])``.

    Returns
    -------
    (recall, fpr, kss, precision), where ``kss = recall - fpr``.
    A ratio whose denominator is zero (e.g. precision when nothing is predicted
    positive) is returned as ``nan`` explicitly, without a numpy RuntimeWarning.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    truth = np.asarray(y_true).ravel()
    pred = np.asarray(y_pred).ravel()
    if truth.shape != pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {truth.shape[0]} and {pred.shape[0]}"
        )

    truth_pos, truth_neg = truth == 1, truth == 0
    pred_pos, pred_neg = pred == 1, pred == 0

    # the four confusion-matrix cells, counted directly (this function is called once
    # per candidate threshold, so avoiding per-call validation overhead matters)
    tp = int(np.count_nonzero(truth_pos & pred_pos))
    tn = int(np.count_nonzero(truth_neg & pred_neg))
    fp = int(np.count_nonzero(truth_neg & pred_pos))
    fn = int(np.count_nonzero(truth_pos & pred_neg))

    def _ratio(numerator, denominator):
        # undefined ratios are reported as nan instead of relying on a 0/0 division
        return numerator / denominator if denominator else np.nan

    recall = _ratio(tp, tp + fn)  # TP / (TP + FN)
    fpr = _ratio(fp, fp + tn)  # FP / (FP + TN)
    precision = _ratio(tp, tp + fp)  # TP / (TP + FP)

    kss = recall - fpr

    return recall, fpr, kss, precision

In [0]:
def optimize_wt(w, theta, probabilities, y_true, verbose=False):
    """Evaluate every candidate threshold in ``w`` on the class-1 probabilities.

    For each threshold the probabilities are binarised with ``to_binary`` and scored
    against ``y_true``. The function does not pick the best threshold itself: it
    returns the full metric curves so the caller can choose (e.g. argmax of utility).

    Parameters
    ----------
    w : sequence of candidate thresholds, each in [0, 1].
    theta : weight in [0, 1] forwarded to ``loss_function`` / ``utility_function``.
    probabilities : predicted probabilities to be thresholded.
    y_true : ground-truth binary labels.
    verbose : when True, print progress every 200 thresholds.

    Returns
    -------
    (recalls, fprs, ksss, precisions, losses, utilities): six float64 arrays with
    one entry per threshold, in the order of ``w``.
    """
    n_thresholds = len(w)

    recalls = np.zeros((n_thresholds, ), dtype=np.float64)
    fprs = np.zeros_like(recalls)
    ksss = np.zeros_like(recalls)
    precisions = np.zeros_like(recalls)
    losses = np.zeros_like(recalls)
    utilities = np.zeros_like(recalls)

    for i, thresh in enumerate(w):
        if verbose and i % 200 == 0:
            # report against the thresholds actually passed in, not a notebook-level global
            print(f"iteration {i} / {n_thresholds}")

        y_pred = to_binary(probabilities, thresh)  # already int8
        recall, fpr, kss, precision = recall_fpr_kss_precision(y_true, y_pred)
        loss = loss_function(theta, recall, fpr)
        utility = utility_function(theta, loss)

        recalls[i] = recall
        precisions[i] = precision
        ksss[i] = kss
        fprs[i] = fpr
        losses[i] = loss
        utilities[i] = utility

    if verbose:
        print("Finished!")

    return recalls, fprs, ksss, precisions, losses, utilities

In [0]:
# First sweep the decision threshold w_t for every model, then apply the chosen one.
# The order of dataset_type matters: 'validation' must be processed before 'test',
# because the test branch reads best_indexes[...]['validation'].
dataset_type = ['train', 'validation', 'test']
w_t = np.arange(0, 1, 1e-3)  # candidate thresholds 0.000, 0.001, ..., 0.999
theta = 0.5

best_indexes = {
    imp_type: {
        s_type: dict()
        for s_type in stock_type
    }
    for imp_type in impact_type
}

# NOTE: recalls, fprs, ksss, precisions, losses and utilities are not created in this cell;
# they must already exist as dicts keyed by imp_type.

# for each impact type (sentiment yes/no)
for imp_type in impact_type:

    # for each stock type (min_vol/max_vol)
    for s_type in stock_type:
        recalls[imp_type][s_type] = dict()
        fprs[imp_type][s_type] = dict()
        ksss[imp_type][s_type] = dict()
        precisions[imp_type][s_type] = dict()
        losses[imp_type][s_type] = dict()
        utilities[imp_type][s_type] = dict()
        best_indexes[imp_type][s_type] = dict()

        # for each dataset; the threshold used on the test set is the one chosen on
        # the VALIDATION set - this is important!
        for data_type in dataset_type:
            print(f"Impact: {imp_type}\tStock: {s_type}, dataset: {data_type}")

            probs = ann_probabilities[imp_type][s_type][data_type]
            # element [0] of the stored labels / predictions is used throughout
            # (presumably the first output of a multi-output model - TODO confirm)
            curr_y_true = y_true[imp_type][s_type][data_type][0]

            if data_type in {'train', 'validation'}:  # training and validation only
                # full metric curves over all thresholds, using column 1 of the predictions
                tmp_recalls, tmp_fprs, tmp_ksss, tmp_precisions, tmp_losses, tmp_utilities = \
                optimize_wt(w_t, theta, probs[0][:, 1], curr_y_true, verbose=False)

                recalls[imp_type][s_type][data_type] = tmp_recalls
                fprs[imp_type][s_type][data_type] = tmp_fprs
                ksss[imp_type][s_type][data_type] = tmp_ksss
                precisions[imp_type][s_type][data_type] = tmp_precisions
                losses[imp_type][s_type][data_type] = tmp_losses
                utilities[imp_type][s_type][data_type] = tmp_utilities

                # index into w_t of the threshold with the highest utility on this split
                best_indexes[imp_type][s_type][data_type] = np.argmax(utilities[imp_type][s_type][data_type])
            elif data_type == 'test':
                # single evaluation at the validation-optimal threshold: scalars, not curves
                best_threshold = w_t[best_indexes[imp_type][s_type]['validation']]
                y_pred = to_binary(probs[0][:, 1], best_threshold)
                tmp_recall, tmp_fpr, tmp_kss, tmp_precision = recall_fpr_kss_precision(curr_y_true, y_pred)
                tmp_loss = loss_function(theta, tmp_recall, tmp_fpr)
                tmp_utility = utility_function(theta, tmp_loss)

                recalls[imp_type][s_type][data_type] = tmp_recall
                fprs[imp_type][s_type][data_type] = tmp_fpr
                ksss[imp_type][s_type][data_type] = tmp_kss
                precisions[imp_type][s_type][data_type] = tmp_precision
                losses[imp_type][s_type][data_type] = tmp_loss
                utilities[imp_type][s_type][data_type] = tmp_utility

Per completezza calcolo anche il `classification_report` per ogni combinazione:

In [0]:
# Print a classification report for every (impact, stock, dataset) combination,
# always binarising with the threshold that maximised utility on the validation set.

# for each impact type (sentiment yes/no)
for imp_type in impact_type:

    # for each stock type (min_vol/max_vol)
    for s_type in stock_type:

        # for each dataset; the validation-optimal threshold is applied to all of them
        for data_type in dataset_type:
            print("-" * 60)
            print(f"\nImpact: {imp_type}\tStock: {s_type}, dataset: {data_type}\n")

            probs = ann_probabilities[imp_type][s_type][data_type]
            curr_y_true = y_true[imp_type][s_type][data_type][0]
            
            best_threshold = w_t[best_indexes[imp_type][s_type]['validation']]
            y_pred = to_binary(probs[0][:, 1], best_threshold)

            print(sm.classification_report(
                    curr_y_true,
                    y_pred,
                    labels=[0, 1]
                )
            )
            print("-" * 60)

Già che ci sono, calcolo anche l'AUROC:

In [0]:
def get_auroc(fpr, recall):
    """Area under the ROC curve, integrated with the trapezoidal rule.

    Parameters
    ----------
    fpr : false positive rates (x axis); the caller is expected to pass them sorted.
    recall : recalls (y axis), aligned element-wise with ``fpr``.

    Returns
    -------
    The integral of ``recall`` over ``fpr``.
    """
    # `scipy.integrate.trapz` is a deprecated alias that newer SciPy releases no longer
    # provide; prefer `trapezoid` and fall back to `trapz` only on old installations.
    integrate = getattr(scipy.integrate, "trapezoid", None)
    if integrate is None:
        integrate = scipy.integrate.trapz

    area = integrate(y=recall, x=fpr)
    return area

In [0]:
# ROC curves (recall vs FPR) on a 2x2 grid: rows follow impact_type, columns follow stock_type.
fig, ax = pl.subplots(nrows=2, ncols=2, figsize=(20, 16))

for j, s_type in enumerate(stock_type):
    for i, imp_type in enumerate(impact_type):
        # all datasets but the last one ('test'), which holds scalars rather than curves
        for data_type in dataset_type[:-1]:
            # order the points by increasing FPR before integrating and plotting
            i_sorted = np.argsort(fprs[imp_type][s_type][data_type])
            
            x = fprs[imp_type][s_type][data_type][i_sorted]
            y = recalls[imp_type][s_type][data_type][i_sorted]

            area = get_auroc(x, y)
            row_format = "{:>15}{:>15}{:>15}{:>15.3f}"
            print(row_format.format(s_type, imp_type, data_type, area))
            #print(f"{s_type} | {imp_type} | {data_type} | AUROC: {area:4.3f}")

            ax[i, j].plot(
                x,
                y,
                color=dataset_colors[data_type],
                label=str.title(data_type)
            )

            # mark, on each curve, the point at the threshold that maximises the VALIDATION utility
            i_sweet = np.argmax(utilities[imp_type][s_type]['validation'])
            best_x = fprs[imp_type][s_type][data_type][i_sweet]
            best_y = recalls[imp_type][s_type][data_type][i_sweet]

            ax[i, j].plot(
                best_x,
                best_y,
                marker='s',
                markersize=5,
                color=dataset_colors[data_type],
                label=str.title(f"{data_type} - best")
            )

        # diagonal reference line from (0, 0) to (1, 1)
        ax[i, j].plot([0, 1], [0, 1], color='black', linewidth=0.5, label='__None__')
        ax[i, j].legend(loc='lower right', fontsize=14)
        ax[i, j].set_xlim([0, 1.1])
        ax[i, j].set_ylim([0, 1.1])
        ax[i, j].set_xlabel('FPR', fontsize=16)
        ax[i, j].set_ylabel('Recall', fontsize=16)
        ax[i, j].set_title(f"{s_type} | {imp_type}", fontsize=16)

sns.despine()

In [0]:
# Utility as a function of the threshold, on the same 2x2 grid
# (rows follow impact_type, columns follow stock_type).
fig, ax = pl.subplots(nrows=2, ncols=2, figsize=(20, 16))

for j, s_type in enumerate(stock_type):
    for i, imp_type in enumerate(impact_type):
        # all datasets but the last one ('test'), which holds scalars rather than curves
        for data_type in dataset_type[:-1]:
            # NOTE(review): i_sorted is computed but not used anywhere in this cell
            i_sorted = np.argsort(fprs[imp_type][s_type][data_type])
            
            x = w_t
            y = utilities[imp_type][s_type][data_type]

            ax[i, j].plot(
                x,
                y,
                color=dataset_colors[data_type],
                label=str.title(data_type)
            )

            # mark, on each curve, the threshold that maximises the VALIDATION utility
            i_sweet = np.argmax(utilities[imp_type][s_type]['validation'])
            best_x = w_t[i_sweet]
            best_y = utilities[imp_type][s_type][data_type][i_sweet]

            ax[i, j].plot(
                best_x,
                best_y,
                marker='s',
                markersize=5,
                color=dataset_colors[data_type],
                label=str.title(f"{data_type} - best")
            )

        ax[i, j].legend(loc='lower right', fontsize=14)
        #ax[i, j].set_xlim([0, 1.1])
        #ax[i, j].set_ylim([0, 0.21])
        ax[i, j].set_xlabel(r'$p_t$', fontsize=16)
        ax[i, j].set_ylabel('Utility', fontsize=16)
        ax[i, j].set_title(f"{s_type} | {imp_type}", fontsize=16)

sns.despine()

### 4.1 Creazione tabelle performance

Ora creo le tabelle esattamente come nel caso delle probabilità:

In [0]:
def get_single_results_table(recalls, precisions, fprs, ksss, utilities, s_type):
    """Build the in-sample / out-of-sample performance table for one stock type.

    Every metric argument is a nested dict indexed as ``metric[imp_type][s_type][split]``
    holding one value for the 'train' and 'test' splits. The returned DataFrame has one
    column per entry of the notebook-level ``impact_type`` and, for each metric, an
    'in: <metric>' row (train) followed by an 'out: <metric>' row (test).
    """
    # (row label suffix, metric dict), in the row order of the table
    metric_rows = [
        ('FPR', fprs),
        ('Recall', recalls),
        ('Prec', precisions),
        ('U', utilities),
        ('KSS', ksss),
    ]
    split_prefixes = (('in', 'train'), ('out', 'test'))

    row_labels = pd.Index([
        f"{prefix}: {suffix}"
        for suffix, _ in metric_rows
        for prefix, _ in split_prefixes
    ])
    table = pd.DataFrame(
        data=np.zeros((len(row_labels), len(impact_type)), dtype=np.float64),
        columns=impact_type,
        index=row_labels
    )

    for imp_type in impact_type:
        for suffix, metric in metric_rows:
            per_split = metric[imp_type][s_type]
            for prefix, split in split_prefixes:
                table.loc[f"{prefix}: {suffix}", imp_type] = per_split[split]

    return table

In [0]:
# Collapse the per-threshold metric curves into one scalar per (impact, stock, dataset),
# which is the layout get_single_results_table reads.
all_recalls = {
    imp_type: {
        s_type: {
            data_type: 0
            for data_type in dataset_type
        }
        for s_type in stock_type
    }
    for imp_type in impact_type
}
# same nested layout, independent copies, for the other metrics
all_precisions = copy.deepcopy(all_recalls)
all_fprs = copy.deepcopy(all_recalls)
all_ksss = copy.deepcopy(all_recalls)
all_utilities = copy.deepcopy(all_recalls)

for imp_type in impact_type:
    for s_type in stock_type:
        for data_type in dataset_type:

            if data_type == 'test':
                # test metrics are already scalars (evaluated at the validation-optimal threshold)
                all_recalls[imp_type][s_type][data_type] = recalls[imp_type][s_type][data_type]
                all_precisions[imp_type][s_type][data_type] = precisions[imp_type][s_type][data_type]
                all_fprs[imp_type][s_type][data_type] = fprs[imp_type][s_type][data_type]
                all_ksss[imp_type][s_type][data_type] = ksss[imp_type][s_type][data_type]
                all_utilities[imp_type][s_type][data_type] = utilities[imp_type][s_type][data_type]
            else:
                # train / validation: take the curve value at that split's OWN best-utility index
                i_best = best_indexes[imp_type][s_type][data_type]

                all_recalls[imp_type][s_type][data_type] = recalls[imp_type][s_type][data_type][i_best]
                all_precisions[imp_type][s_type][data_type] = precisions[imp_type][s_type][data_type][i_best]
                all_fprs[imp_type][s_type][data_type] = fprs[imp_type][s_type][data_type][i_best]
                all_ksss[imp_type][s_type][data_type] = ksss[imp_type][s_type][data_type][i_best]
                all_utilities[imp_type][s_type][data_type] = utilities[imp_type][s_type][data_type][i_best]

In [0]:
# one performance table per stock type (columns: impact types; rows: in/out metrics)
min_vol = get_single_results_table(all_recalls, all_precisions, all_fprs, all_ksss, all_utilities, 'min_vol')
max_vol = get_single_results_table(all_recalls, all_precisions, all_fprs, all_ksss, all_utilities, 'max_vol')

In [0]:
# put the two tables side by side under a top column level naming the stock type
results = pd.concat([min_vol, max_vol], axis='columns', keys=['min_vol', 'max_vol'])

In [0]:
# persist the combined table as CSV under base_path
results.to_csv(os.path.join(base_path, 'confronto3.csv'))

In [0]:
# display the combined table as the cell output
results