In [346]:
import helper, consts
import importlib
importlib.reload(consts)
importlib.reload(helper)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn, scipy, requests

In [347]:
# Per-user configuration: USER selects the entry of consts.PATH_MAP that is
# used as the path prefix when the CSV files are read further down.
USER = "HOANG"
FILE_PATH = consts.PATH_MAP[USER]

# Local aliases for constants defined in the project's consts module.
ROW = consts.ROW
COL = consts.COL
CLEANED_DATA_PATH = consts.CLEANED_DATA_PATH
DATA_PATH = consts.RAW_DATA_PATH
RESPONSE_NAME = consts.RESPONSE_NAME

# Date windows, presumably YYYYMMDD strings.
# NOTE(review): TRAIN_START_DATE / TRAIN_END_DATE are not referenced by any
# cell visible here; only the TEST_* window is used to pick input files.
TRAIN_START_DATE = "20150101"
TRAIN_END_DATE = "20150601" # Up to but not including
TEST_START_DATE = "20150701"
TEST_END_DATE = "20150801"

# Ask the project helper for the supported regression types; the recorded
# output of this cell lists OLS, LASSO and XGBOOST.
REGRESSION_TYPES = helper.Regression('OLS').list_all_regression_types()
REGRESSION_TYPES

1: OLS
2: LASSO
3: XGBOOST


### Read the data

In [348]:
# Presumably the training window implied by the first test day (project helper).
# NOTE(review): train_start / train_end are not used by any cell visible here.
train_start, train_end  = helper.get_train_from_testday(TEST_START_DATE)
# Column pairs passed to helper.get_df_with_interaction_terms when files are
# loaded; each pair names a relvol_* column and the rrirpnxm_* column that
# shares its suffix.
list_of_interacting_terms = [["relvol_nt_0","rrirpnxm_nt_0"], 
                            ["relvol_lst15_0", "rrirpnxm_lst15_0"],
                            ["relvol_lsthrx15_0", "rrirpnxm_lsthrx15_0"],
                            ["relvol_toxhr_0", "rrirpnxm_toxhr_0"]]

# Base feature columns kept from every input file, alongside the interaction
# columns and the response.
FEATURE_COL_NAMES = ["rrirpnxm_nt_0", "rrirpnxm_lst15_0", "rrirpnxm_lsthrx15_0", "rrirpnxm_toxhr_0"]

In [349]:
# Input file names for the TEST_START_DATE..TEST_END_DATE window under
# FILE_PATH, as resolved by the project helper (boundary handling is defined
# in helper.get_file_names, which is not visible here).
file_names = helper.get_file_names(TEST_START_DATE, TEST_END_DATE, FILE_PATH)

In [350]:
# Build one trimmed DataFrame per input file, holding the base features, the
# interaction columns produced by the helper, and the response column.
dfs = []
for file_name in file_names:
    df, new_col_names = helper.get_df_with_interaction_terms(
        pd.read_csv(FILE_PATH + file_name), list_of_interacting_terms
    )
    # Column order: base features, then interaction columns, then the response.
    stayed_cols = np.append(
        np.append(FEATURE_COL_NAMES, new_col_names), consts.RESPONSE_NAME
    )
    dfs.append(df[stayed_cols])

In [351]:
# Parallel accumulators filled by the modelling loop: one entry per frame in
# dfs, holding that frame's test-split predictions and actual responses.
predicted_y_list = []
actual_y_list = []

In [352]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Rebuild the accumulators here so the cell is idempotent: re-running it no
# longer appends a second copy of every file's results to the existing lists.
predicted_y_list = []
actual_y_list = []

for df in dfs:
    # Project helper; judging by its name it adds a constant column — TODO confirm.
    has_intercept_df = helper.append_columnOf_ones(df)

    y = has_intercept_df[consts.RESPONSE_NAME]
    X = has_intercept_df.drop(consts.RESPONSE_NAME, axis=consts.COL)
    # Random 80/20 split within each file; the fixed seed keeps it repeatable.
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, train_size=.8, random_state=42
    )

    # Alternative baseline: linear_model.LinearRegression(fit_intercept=False)
    model = XGBRegressor()
    model.fit(X=train_X, y=train_y)
    predicted_y = model.predict(test_X)

    # Validate before recording, and record both together, so the two lists
    # always stay index-aligned even if an iteration fails part-way.
    assert len(test_y) == len(predicted_y), (
        f"len(test_y) = {len(test_y)}, len(predicted_y) = {len(predicted_y)}"
    )
    actual_y_list.append(test_y)
    predicted_y_list.append(predicted_y)

In [353]:
def get_response_corrs(predicted_y, actual_y):
    """Return the correlation coefficient between predictions and actuals.

    Both arguments are 1-D sequences of equal length. The value is the
    off-diagonal entry of the 2x2 matrix produced by ``np.corrcoef``.
    """
    corr_matrix = np.corrcoef(predicted_y, actual_y)
    return corr_matrix[0, 1]

def get_mean_returns(predicted_y, actual_y):
    """Return the average signed magnitude of the actual responses.

    Each observation contributes ``|actual| / len(predicted_y)``, counted
    positively when the predicted and actual signs agree, negatively when
    they disagree, and as zero when either sign is zero.
    """
    per_obs_magnitude = np.abs(actual_y) / len(predicted_y)
    sign_agreement = np.sign(actual_y) * np.sign(predicted_y)
    return np.sum(per_obs_magnitude * sign_agreement)

def get_scale_factors(predicted_y, actual_y):
    """Return the coefficient from regressing actuals on predictions.

    Fits ``actual_y ~ coef * predicted_y`` with no intercept using
    scikit-learn's ``LinearRegression`` and returns the fitted ``coef_``
    array unchanged.
    """
    from sklearn import linear_model

    design = pd.DataFrame({"predicted_y": predicted_y})
    through_origin_fit = linear_model.LinearRegression(fit_intercept=False).fit(
        X=design, y=actual_y
    )
    return through_origin_fit.coef_

def get_metric(predicted_y_list, actual_y_list):
    """Print and return the metrics defined by Scott, averaged over all files.

    For every (predicted, actual) pair the correlation, mean return and scale
    factor are computed; each metric is then averaged across pairs with a
    plain ``np.mean``. The printed labels say "Weighted", but every pair
    carries equal weight.

    Args:
        predicted_y_list: sequence of per-file prediction arrays.
        actual_y_list: sequence of per-file actual responses, index-aligned
            with ``predicted_y_list``.

    Returns:
        [weighted_corr, mean_return, scale_factor]

    Raises:
        AssertionError: if the two lists have different lengths.
    """
    # The message must be the string itself; passing print(...) would use
    # print's return value (None) and leave the AssertionError without text.
    assert len(predicted_y_list) == len(actual_y_list), (
        f"length(predicted_y_list) = {len(predicted_y_list)}, "
        f"length(actual_y_list) = {len(actual_y_list)}"
    )

    weighted_corrs, weighted_mean_returns, weighted_scale_factors = [], [], []
    for predicted_y, actual_y in zip(predicted_y_list, actual_y_list):
        weighted_corrs.append(get_response_corrs(predicted_y, actual_y))
        weighted_mean_returns.append(get_mean_returns(predicted_y, actual_y))
        weighted_scale_factors.append(get_scale_factors(predicted_y, actual_y))

    weighted_corr = np.mean(weighted_corrs)
    mean_return = np.mean(weighted_mean_returns)
    scale_factor = np.mean(weighted_scale_factors)

    print(f"1. Weighted Correlation: {weighted_corr}")
    print(f"2. Weighted Mean Return: {mean_return}")
    print(f"3. Weighted Scale Factor: {scale_factor}")

    return [weighted_corr, mean_return, scale_factor]

In [354]:
# Report the three summary metrics over every file's held-out split.
get_metric(predicted_y_list, actual_y_list)

1. Weighted Correlation: 0.04051417565014369
2. Weighted Mean Return: 0.0002521948540185602
3. Weighted Scale Factor: 0.06893561780452728


In [355]:
# Inspect the column layout of the first loaded frame: base features,
# interaction columns, then the response.
dfs[0].columns

Index(['rrirpnxm_nt_0', 'rrirpnxm_lst15_0', 'rrirpnxm_lsthrx15_0',
       'rrirpnxm_toxhr_0', '('relvol_nt_0', 'rrirpnxm_nt_0')',
       '('relvol_lst15_0', 'rrirpnxm_lst15_0')',
       '('relvol_lsthrx15_0', 'rrirpnxm_lsthrx15_0')',
       '('relvol_toxhr_0', 'rrirpnxm_toxhr_0')', 'tonight'],
      dtype='object')