In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import mean_squared_error
from utils.finance_metrics import annualized_return
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
data_path = r'data/processed'
data_filenames = os.listdir(data_path)

models = {}
feature_scalers = {}
target_scalers = {}
for data_file in data_filenames[5:]:
    pairs = data_file[:-4].split('-')
    pair = (pairs[0], pairs[1])

    pair_df = pd.read_pickle(os.path.join(data_path, data_file))
    X = pair_df.loc["2000-01-01":"2014-12-31", :].drop(columns="Return Diff (t+1)")
    y = pair_df.loc["2000-01-01":"2014-12-31", ["Return Diff (t+1)"]]
    X_test = pair_df.loc["2015-01-01":, :].drop(columns="Return Diff (t+1)")
    y_test = pair_df.loc["2015-01-01":, ["Return Diff (t+1)"]]

    # scale with QuantileTransformer and StandardScaler
    X_scaler = QuantileTransformer()
    X_scaler.fit(X); feature_scalers[pair] = X_scaler
    X_train = pd.DataFrame(X_scaler.transform(X), index=X.index, columns=X.columns)
    X_test = pd.DataFrame(X_scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    y_scaler = StandardScaler()
    y_scaler.fit(y); target_scalers[pair] = y_scaler
    y_train = pd.DataFrame(y_scaler.transform(y), index=y.index, columns=y.columns)
    y_test = pd.DataFrame(y_scaler.transform(y_test), index=y_test.index, columns=y_test.columns)
    assert False == any([any(arr) for arr in np.array(np.isinf(X_train))])
    assert False == any([any(arr) for arr in np.array(np.isnan(X_train))])
    assert False == any([any(arr) for arr in np.array(np.isinf(X_test))])
    assert False == any([any(arr) for arr in np.array(np.isnan(X_test))])
    assert False == any([any(arr) for arr in np.array(np.isinf(y_train))])
    assert False == any([any(arr) for arr in np.array(np.isnan(y_train))])
    assert False == any([any(arr) for arr in np.array(np.isinf(y_test))])
    assert False == any([any(arr) for arr in np.array(np.isnan(y_test))])

    split = PredefinedSplit(test_fold=[0 if v else -1 for v in X_train.index < '2012-01-01'])
    models[(pairs[0], pairs[1])] = {
        'Linear Regression': GridSearchCV(estimator=ElasticNet(max_iter=100000, tol=0.0004), param_grid={
            'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Support Vector Machine': GridSearchCV(estimator=SVR(cache_size=1000), param_grid={
            'C': [10**n for n in range(-3, 4)]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Random Forest': GridSearchCV(estimator=RandomForestRegressor(bootstrap=True), param_grid={
            "n_estimators": [n for n in range(50, 150, 25)]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Adaptive Boost': GridSearchCV(estimator=AdaBoostRegressor(), param_grid={
            "base_estimator": [DecisionTreeRegressor(max_depth=1), DecisionTreeRegressor(max_depth=3)],
            "n_estimators": [n for n in range(50, 250, 50)],
            "loss": ["linear", "exponential"]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Gradient Boost': GridSearchCV(estimator=GradientBoostingRegressor(loss="huber"), param_grid={
            "n_estimators": [n for n in range(50, 250, 50)]
        }, cv=split, scoring="explained_variance", n_jobs=-1),
        'Neural Net': GridSearchCV(estimator=MLPRegressor(solver="lbfgs", max_iter=1000000), param_grid={
            "hidden_layer_sizes": [(h1, h2)
                                   for h1 in range(100, 200, 50)
                                   for h2 in range(25, h1//2, 25)]
        }, cv=split, scoring="explained_variance", n_jobs=-1)
    }

    print(f"==================================================\n"
          f"Results for pair {pair}:")
    pred_average = None
    for model_type, model in models[pair].items():
        model.fit(X_train, np.array(y_train).ravel())
        pred = model.predict(X_test)
        if model_type != "Linear Regression":
            if pred_average is None: pred_average = pred / 5
            else: pred_average = pred_average + np.array(pred / 5)
        mse = mean_squared_error(np.array(y_test), np.array(pred).ravel())
        print(f"- Model: {model_type}\n"
              f"  - best parameters: {model.best_params_}\n"
              f"  - MSE: {mse:.06f}\n"
              f"--------------------------------------------------")
    mse = mean_squared_error(np.array(y_test), np.array(pred_average).ravel())
    print(f"- Ensemble of all models\n"
          f"  - MSE: {mse:.06f}\n"
          f"--------------------------------------------------")
    print(f"==================================================\n")



Results for pair ('ASA', 'AU'):
- Model: Linear Regression
  - best parameters: {'l1_ratio': 0.1}
  - MSE: 0.842752
--------------------------------------------------
- Model: Support Vector Machine
  - best parameters: {'C': 10}
  - MSE: 0.113275
--------------------------------------------------
- Model: Random Forest
  - best parameters: {'n_estimators': 50}
  - MSE: 0.001623
--------------------------------------------------
- Model: Adaptive Boost
  - best parameters: {'base_estimator': DecisionTreeRegressor(max_depth=3), 'loss': 'linear', 'n_estimators': 200}
  - MSE: 0.021092
--------------------------------------------------
- Model: Gradient Boost
  - best parameters: {'n_estimators': 150}
  - MSE: 0.004493
--------------------------------------------------
- Model: Neural Net
  - best parameters: {'hidden_layer_sizes': (150, 25)}
  - MSE: 0.132448
--------------------------------------------------
- Ensemble of all models
  - MSE: 0.008264
------------------------------------

- Model: Random Forest
  - best parameters: {'n_estimators': 50}
  - MSE: 0.013485
--------------------------------------------------
- Model: Adaptive Boost
  - best parameters: {'base_estimator': DecisionTreeRegressor(max_depth=3), 'loss': 'linear', 'n_estimators': 100}
  - MSE: 0.030009
--------------------------------------------------
- Model: Gradient Boost
  - best parameters: {'n_estimators': 200}
  - MSE: 0.003183
--------------------------------------------------
- Model: Neural Net
  - best parameters: {'hidden_layer_sizes': (100, 25)}
  - MSE: 0.188847
--------------------------------------------------
- Ensemble of all models
  - MSE: 0.050406
--------------------------------------------------

Results for pair ('CCL', 'RCL'):
- Model: Linear Regression
  - best parameters: {'l1_ratio': 0.1}
  - MSE: 0.682403
--------------------------------------------------
- Model: Support Vector Machine
  - best parameters: {'C': 100}
  - MSE: 0.396505
---------------------------------

- Model: Gradient Boost
  - best parameters: {'n_estimators': 200}
  - MSE: 0.000485
--------------------------------------------------
- Model: Neural Net
  - best parameters: {'hidden_layer_sizes': (150, 25)}
  - MSE: 0.009727
--------------------------------------------------
- Ensemble of all models
  - MSE: 0.007922
--------------------------------------------------

Results for pair ('CVX', 'XOM'):
- Model: Linear Regression
  - best parameters: {'l1_ratio': 0.1}
  - MSE: 1.455035
--------------------------------------------------
- Model: Support Vector Machine
  - best parameters: {'C': 10}
  - MSE: 0.660190
--------------------------------------------------
- Model: Random Forest
  - best parameters: {'n_estimators': 125}
  - MSE: 0.145256
--------------------------------------------------
- Model: Adaptive Boost
  - best parameters: {'base_estimator': DecisionTreeRegressor(max_depth=3), 'loss': 'exponential', 'n_estimators': 100}
  - MSE: 0.152538
----------------------------

Results for pair ('FRT', 'REG'):
- Model: Linear Regression
  - best parameters: {'l1_ratio': 0.1}
  - MSE: 0.360837
--------------------------------------------------
- Model: Support Vector Machine
  - best parameters: {'C': 10}
  - MSE: 0.046742
--------------------------------------------------
- Model: Random Forest
  - best parameters: {'n_estimators': 50}
  - MSE: 0.000039
--------------------------------------------------
- Model: Adaptive Boost
  - best parameters: {'base_estimator': DecisionTreeRegressor(max_depth=3), 'loss': 'linear', 'n_estimators': 100}
  - MSE: 0.014450
--------------------------------------------------
- Model: Gradient Boost
  - best parameters: {'n_estimators': 200}
  - MSE: 0.000141
--------------------------------------------------
- Model: Neural Net
  - best parameters: {'hidden_layer_sizes': (150, 50)}
  - MSE: 0.010664
--------------------------------------------------
- Ensemble of all models
  - MSE: 0.003051
-----------------------------------

- Model: Random Forest
  - best parameters: {'n_estimators': 100}
  - MSE: 0.035008
--------------------------------------------------
- Model: Adaptive Boost
  - best parameters: {'base_estimator': DecisionTreeRegressor(max_depth=3), 'loss': 'exponential', 'n_estimators': 50}
  - MSE: 0.046106
--------------------------------------------------
- Model: Gradient Boost
  - best parameters: {'n_estimators': 200}
  - MSE: 0.024195
--------------------------------------------------
- Model: Neural Net
  - best parameters: {'hidden_layer_sizes': (100, 25)}
  - MSE: 0.063698
--------------------------------------------------
- Ensemble of all models
  - MSE: 0.050533
--------------------------------------------------

Results for pair ('HP', 'PTEN'):
- Model: Linear Regression
  - best parameters: {'l1_ratio': 0.1}
  - MSE: 1.515577
--------------------------------------------------
- Model: Support Vector Machine
  - best parameters: {'C': 10}
  - MSE: 0.634391
-----------------------------

- Model: Gradient Boost
  - best parameters: {'n_estimators': 200}
  - MSE: 0.001031
--------------------------------------------------
- Model: Neural Net
  - best parameters: {'hidden_layer_sizes': (150, 50)}
  - MSE: 0.011398
--------------------------------------------------
- Ensemble of all models
  - MSE: 0.010567
--------------------------------------------------

Results for pair ('MAC', 'SPG'):
- Model: Linear Regression
  - best parameters: {'l1_ratio': 0.1}
  - MSE: 1.169207
--------------------------------------------------
- Model: Support Vector Machine
  - best parameters: {'C': 10}
  - MSE: 0.561769
--------------------------------------------------
- Model: Random Forest
  - best parameters: {'n_estimators': 50}
  - MSE: 0.011460
--------------------------------------------------
- Model: Adaptive Boost
  - best parameters: {'base_estimator': DecisionTreeRegressor(max_depth=3), 'loss': 'linear', 'n_estimators': 200}
  - MSE: 0.045851
----------------------------------

In [5]:
data_filenames

['ADX-TY.zip',
 'AEG-ING.zip',
 'AMAT-KLAC.zip',
 'APA-DVN.zip',
 'ARW-AVT.zip',
 'ASA-AU.zip',
 'AVB-EQR.zip',
 'BAC-WFC.zip',
 'BBVA-SAN.zip',
 'BEN-TROW.zip',
 'BK-NTRS.zip',
 'BMO-RY.zip',
 'BXP-VNO.zip',
 'CCL-RCL.zip',
 'CM-TD.zip',
 'CMA-TFC.zip',
 'COP-PEO.zip',
 'CPT-UDR.zip',
 'CSX-NSC.zip',
 'CUZ-WRE.zip',
 'CVX-XOM.zip',
 'DHI-LEN.zip',
 'DRE-PLD.zip',
 'E-TOT.zip',
 'ED-SO.zip',
 'ELS-MAA.zip',
 'FITB-RF.zip',
 'FRT-REG.zip',
 'FULT-VLY.zip',
 'GAM-USA.zip',
 'GFI-HMY.zip',
 'GOLD-NEM.zip',
 'HAL-SLB.zip',
 'HES-OXY.zip',
 'HIW-KRC.zip',
 'HP-PTEN.zip',
 'HQH-HQL.zip',
 'IAC-MTCH.zip',
 'IFN-IIF.zip',
 'KBH-PHM.zip',
 'KIM-WRI.zip',
 'LSI-PSA.zip',
 'MAC-SPG.zip',
 'MRO-MUR.zip',
 'NNN-O.zip',
 'PEAK-WELL.zip',
 'PNC-USB.zip',
 'RMT-RVT.zip',
 'TDS-USM.zip']