In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.svm import SVR

In [2]:
data_path = r'data/processed'
data_filenames = os.listdir(data_path)

models = {}
feature_transformers = {}
feature_scalers = {}
target_scalers = {}
for data_file in data_filenames:
    pairs = data_file[:-4].split('-')
    pair = (pairs[0], pairs[1])
    print(pair)

    pair_df = pd.read_pickle(os.path.join(data_path, data_file))
    X_train = pair_df.loc["2000-01-01":"2014-12-31", :].drop(columns="Return Diff (t+1)")
    y_train = pair_df.loc["2000-01-01":"2014-12-31", ["Return Diff (t+1)"]]
    X_test = pair_df.loc["2015-01-01":, :].drop(columns="Return Diff (t+1)")
    y_test = pair_df.loc["2015-01-01":, ["Return Diff (t+1)"]]

    # fill invalid values with 1 (for ratios) or 0 (for differences)
    X_train[X_train.isin([-np.inf, np.inf, np.nan])] = 1
    y_train[y_train.isin([-np.inf, np.inf, np.nan])] = 0
    X_test[X_test.isin([-np.inf, np.inf, np.nan])] = 1
    y_test[y_test.isin([-np.inf, np.inf, np.nan])] = 0

    # add in quantiles as additional feature columns
    quantile_transformer = QuantileTransformer()
    quantile_transformer.fit(X_train); feature_transformers[pair] = quantile_transformer
    X_train.loc[:, [col + "_QUANTILE" for col in X_train.columns]] = pd.DataFrame(
        pd.DataFrame(quantile_transformer.transform(X_train), index=X_train.index,
                     columns=[col + "_QUANTILE" for col in X_train.columns]))
    X_test.loc[:, [col + "_QUANTILE" for col in X_test.columns]] = pd.DataFrame(
        pd.DataFrame(quantile_transformer.transform(X_test), index=X_test.index,
                     columns=[col + "_QUANTILE" for col in X_test.columns]))

    # fill invalid values with 1 (for ratios) or 0 (for differences)
    X_train[X_train.isin([-np.inf, np.inf, np.nan])] = 1
    y_train[y_train.isin([-np.inf, np.inf, np.nan])] = 0
    X_test[X_test.isin([-np.inf, np.inf, np.nan])] = 1
    y_test[y_test.isin([-np.inf, np.inf, np.nan])] = 0

    # scale features and target column
    X_scaler = StandardScaler()
    X_scaler.fit(X_train); feature_scalers[pair] = X_scaler
    X_train = pd.DataFrame(X_scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(X_scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    y_scaler = StandardScaler()
    y_scaler.fit(y_train); target_scalers[pair] = y_scaler
    y_train = pd.DataFrame(y_scaler.transform(y_train), index=y_train.index, columns=y_train.columns)
    y_test = pd.DataFrame(y_scaler.transform(y_test), index=y_test.index, columns=y_test.columns)
    assert not any([any(arr) for arr in np.array(np.isinf(X_train))])
    assert not any([any(arr) for arr in np.array(np.isnan(X_train))])
    assert not any([any(arr) for arr in np.array(np.isinf(X_test))])
    assert not any([any(arr) for arr in np.array(np.isnan(X_test))])
    assert not any([any(arr) for arr in np.array(np.isinf(y_train))])
    assert not any([any(arr) for arr in np.array(np.isnan(y_train))])
    assert not any([any(arr) for arr in np.array(np.isinf(y_test))])
    assert not any([any(arr) for arr in np.array(np.isnan(y_test))])

    split = PredefinedSplit(test_fold=[0 if v else -1 for v in X_train.index < '2012-01-01'])
    models[pair] = {
        'Linear Regression': GridSearchCV(estimator=ElasticNet(max_iter=100000, tol=0.004), param_grid={
            'l1_ratio': [0.4, 0.5, 0.6]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Support Vector Machine': GridSearchCV(estimator=SVR(cache_size=1000), param_grid={
            'C': [10**n for n in range(-3, 4)]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Random Forest': GridSearchCV(estimator=RandomForestRegressor(bootstrap=True), param_grid={
            "n_estimators": [n for n in range(50, 150, 25)]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Adaptive Boost': GridSearchCV(estimator=AdaBoostRegressor(), param_grid={
            "n_estimators": [n for n in range(50, 250, 50)],
            "loss": ["linear", "exponential"]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Gradient Boost': GridSearchCV(estimator=GradientBoostingRegressor(loss="huber"), param_grid={
            "n_estimators": [n for n in range(50, 250, 50)]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Neural Net': GridSearchCV(estimator=MLPRegressor(solver="lbfgs", max_iter=1000000), param_grid={
            "hidden_layer_sizes": [(h1, h2)
                                   for h1 in range(100, 200, 50)
                                   for h2 in range(25, h1//2, 25)]
        }, cv=split, scoring="explained_variance", n_jobs=-1)
    }

    print(f"==================================================\n"
          f"Results for pair {pair}:")
    pred_average = None
    for model_type, model in models[pair].items():
        model.fit(X_train, np.array(y_train).ravel())
        pred = model.predict(X_test)
        if model_type != "Linear Regression":
            if pred_average is None: pred_average = pred / 5
            else: pred_average = pred_average + np.array(pred / 5)
        mse = mean_squared_error(np.array(y_test), np.array(pred).ravel())
        print(f"- Model: {model_type}\n"
              f"  - best parameters: {model.best_params_}\n"
              f"  - MSE: {mse:.06f}\n"
              f"--------------------------------------------------")
    mse = mean_squared_error(np.array(y_test), np.array(pred_average).ravel())
    print(f"- Ensemble of all models\n"
          f"  - MSE: {mse:.06f}\n"
          f"--------------------------------------------------")
    print(f"==================================================\n")

('BAC', 'WFC')
Results for pair ('BAC', 'WFC'):
- Model: Linear Regression
  - best parameters: {'l1_ratio': 0.4}
  - MSE: 0.093297
--------------------------------------------------
- Model: Support Vector Machine
  - best parameters: {'C': 10}
  - MSE: 0.017372
--------------------------------------------------
- Model: Random Forest
  - best parameters: {'n_estimators': 125}
  - MSE: 0.000032
--------------------------------------------------
- Model: Adaptive Boost
  - best parameters: {'loss': 'linear', 'n_estimators': 150}
  - MSE: 0.027660
--------------------------------------------------
- Model: Gradient Boost
  - best parameters: {'n_estimators': 200}
  - MSE: 0.000164
--------------------------------------------------
- Model: Neural Net
  - best parameters: {'hidden_layer_sizes': (150, 25)}
  - MSE: 0.000282
--------------------------------------------------
- Ensemble of all models
  - MSE: 0.002173
--------------------------------------------------

('HP', 'PTEN')
Result

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

conf_mat_all = None
for data_file in data_filenames:
    pairs = data_file[:-4].split('-')
    pair = (pairs[0], pairs[1])
    print(pair)

    pair_df = pd.read_pickle(os.path.join(data_path, data_file))
    X_train = pair_df.loc["2000-01-01":"2014-12-31", :].drop(columns="Return Diff (t+1)")
    y_train = pair_df.loc["2000-01-01":"2014-12-31", ["Return Diff (t+1)"]]
    X_test = pair_df.loc["2015-01-01":, :].drop(columns="Return Diff (t+1)")
    y_test = pair_df.loc["2015-01-01":, ["Return Diff (t+1)"]]

    # fill invalid values with 1 (for ratios) or 0 (for differences)
    X_train[X_train.isin([-np.inf, np.inf, np.nan])] = 1
    y_train[y_train.isin([-np.inf, np.inf, np.nan])] = 0
    X_test[X_test.isin([-np.inf, np.inf, np.nan])] = 1
    y_test[y_test.isin([-np.inf, np.inf, np.nan])] = 0

    # add in quantiles as additional feature columns
    quantile_transformer = feature_transformers[pair]
    X_train.loc[:, [col + "_QUANTILE" for col in X_train.columns]] = pd.DataFrame(
        pd.DataFrame(quantile_transformer.transform(X_train), index=X_train.index,
                     columns=[col + "_QUANTILE" for col in X_train.columns]))
    X_test.loc[:, [col + "_QUANTILE" for col in X_test.columns]] = pd.DataFrame(
        pd.DataFrame(quantile_transformer.transform(X_test), index=X_test.index,
                     columns=[col + "_QUANTILE" for col in X_test.columns]))

    # fill invalid values with 1 (for ratios) or 0 (for differences)
    X_train[X_train.isin([-np.inf, np.inf, np.nan])] = 1
    y_train[y_train.isin([-np.inf, np.inf, np.nan])] = 0
    X_test[X_test.isin([-np.inf, np.inf, np.nan])] = 1
    y_test[y_test.isin([-np.inf, np.inf, np.nan])] = 0

    # scale features and target column
    X_scaler = feature_scalers[pair]
    X_train = pd.DataFrame(X_scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(X_scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    y_scaler = target_scalers[pair]
    y_train = pd.DataFrame(y_scaler.transform(y_train), index=y_train.index, columns=y_train.columns)
    y_test = pd.DataFrame(y_scaler.transform(y_test), index=y_test.index, columns=y_test.columns)
    assert not any([any(arr) for arr in np.array(np.isinf(X_train))])
    assert not any([any(arr) for arr in np.array(np.isnan(X_train))])
    assert not any([any(arr) for arr in np.array(np.isinf(X_test))])
    assert not any([any(arr) for arr in np.array(np.isnan(X_test))])
    assert not any([any(arr) for arr in np.array(np.isinf(y_train))])
    assert not any([any(arr) for arr in np.array(np.isnan(y_train))])
    assert not any([any(arr) for arr in np.array(np.isinf(y_test))])
    assert not any([any(arr) for arr in np.array(np.isnan(y_test))])

    print(f"==================================================\n"
          f"Results for pair {pair}:")
    pred_ensemble = None
    for model_type, model in models[pair].items():
        pred = [-1 if i < 0 else 1 for i in np.array(model.predict(X_test)).ravel()]
        if any(model_type == t for t in ["Random Forest", "Adaptive Boost", "Gradient Boost", "Neural Net"]):
            if pred_ensemble is None: pred_ensemble = np.array(pred) / 4
            else: pred_ensemble = pred_ensemble + (np.array(pred) / 4)

        conf_mat = confusion_matrix(
            [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
            [0 if i < 0 else 1 for i in np.array(pred).ravel()])
        accuracy = accuracy_score(
            [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
            [0 if i < 0 else 1 for i in np.array(pred).ravel()])
        precision = precision_score(
            [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
            [0 if i < 0 else 1 for i in np.array(pred).ravel()])
        recall = recall_score(
            [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
            [0 if i < 0 else 1 for i in np.array(pred).ravel()])
        f1 = f1_score(
            [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
            [0 if i < 0 else 1 for i in np.array(pred).ravel()])

        if any([pair == p for p in [('NNN', 'O'), ('BK', 'NTRS'), ('GFI', 'HMY')]]):
            print(f"- Model: {model_type}\n"
                  f"  - Confusion Matrix:\n{conf_mat}\n"
                  f"  - Accuracy: {accuracy:.06f}\n"
                  f"  - Recall: {recall:.06f}\n"
                  f"  - Precision: {precision:.06f}\n"
                  f"  - F1-Score: {f1:.06f}\n"
                  f"--------------------------------------------------")

    conf_mat = confusion_matrix(
        [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
        [0 if i < 0 else 1 for i in np.array(pred_ensemble).ravel()])
    accuracy = accuracy_score(
        [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
        [0 if i < 0 else 1 for i in np.array(pred_ensemble).ravel()])
    precision = precision_score(
        [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
        [0 if i < 0 else 1 for i in np.array(pred_ensemble).ravel()])
    recall = recall_score(
        [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
        [0 if i < 0 else 1 for i in np.array(pred_ensemble).ravel()])
    f1 = f1_score(
        [0 if i < 0 else 1 for i in y_test.to_numpy().ravel()],
        [0 if i < 0 else 1 for i in np.array(pred_ensemble).ravel()])

    if conf_mat_all is None: conf_mat_all = np.array(conf_mat)
    else: conf_mat_all = conf_mat_all + np.array(conf_mat)

    print(f"- Ensemble of Random Forest, Adaptive Boost, Gradient Boost, and Neural Net\n"
          f"  - Confusion Matrix:\n{conf_mat}\n"
          f"  - Accuracy: {accuracy:.06f}\n"
          f"  - Recall: {recall:.06f}\n"
          f"  - Precision: {precision:.06f}\n"
          f"  - F1-Score: {f1:.06f}\n"
          f"--------------------------------------------------")
    print(f"==================================================\n")


print(f"- Confusion Matrix of ensemble prediction across all pairs\n{conf_mat_all}\n"
      f"  - Accuracy: {(conf_mat_all[0][0]+conf_mat_all[1][1]) / np.sum(conf_mat_all):.06f}\n"
      f"--------------------------------------------------")
print(f"==================================================\n")


('BAC', 'WFC')
Results for pair ('BAC', 'WFC'):
- Ensemble of Random Forest, Adaptive Boost, Gradient Boost, and Neural Net
  - Confusion Matrix:
[[709   1]
 [  3 837]]
  - Accuracy: 0.997419
  - Recall: 0.996429
  - Precision: 0.998807
  - F1-Score: 0.997616
--------------------------------------------------

('HP', 'PTEN')
Results for pair ('HP', 'PTEN'):
- Ensemble of Random Forest, Adaptive Boost, Gradient Boost, and Neural Net
  - Confusion Matrix:
[[790   2]
 [  0 758]]
  - Accuracy: 0.998710
  - Recall: 1.000000
  - Precision: 0.997368
  - F1-Score: 0.998682
--------------------------------------------------

('AEG', 'ING')
Results for pair ('AEG', 'ING'):
- Ensemble of Random Forest, Adaptive Boost, Gradient Boost, and Neural Net
  - Confusion Matrix:
[[761   4]
 [  0 785]]
  - Accuracy: 0.997419
  - Recall: 1.000000
  - Precision: 0.994930
  - F1-Score: 0.997459
--------------------------------------------------

('TDS', 'USM')
Results for pair ('TDS', 'USM'):
- Ensemble of Ra

  _warn_prf(average, modifier, msg_start, len(result))
