### Imports

In [1]:
%pip install -r dependencies.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import requests
import scipy
import importlib
import helper, consts
importlib.reload(consts)
importlib.reload(helper)

<module 'helper' from '/Users/rosy/Clinic/Clinic/code/helper.py'>

### CONSTANTS

In [3]:
ROW = consts.ROW
COL = consts.COL
CLEANED_DATA_PATH = consts.CLEANED_DATA_PATH
DATA_PATH = consts.RAW_DATA_PATH
RESPONSE_NAME = consts.RESPONSE_NAME

# TRAIN_START_DATE = "20150101"
# TRAIN_END_DATE = "20150601" # Up to but not including
TEST_DATE = '20170103'

In [4]:
trainRange = helper.get_train_from_testday(TEST_DATE)
training_range = f"{trainRange[0]}-{trainRange[1]}"
train_start_date = trainRange[0]
train_end_date = trainRange[1]

In [5]:
REGRESSION_TYPES = helper.Regression('OLS').list_all_regression_types()
REGRESSION_TYPES

1: OLS
2: LASSO
3: XGBOOST


### Read data

In [6]:
importlib.reload(helper)
x_cols = ["rrirpnxm_nt_0", "rrirpnxm_lst15_0","rrirpnxm_lsthrx15_0", "rrirpnxm_toxhr_0"]
relvol_cols = ["relvol_nt_0", "relvol_lst15_0", "relvol_lsthrx15_0", "relvol_toxhr_0"]
x_cols += relvol_cols
train_df, test_df = helper.get_train_test_df(train_start_date, train_end_date, TEST_DATE, x_cols)
train_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 758878 entries, 0 to 2978
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   rrirpnxm_nt_0        758878 non-null  float64
 1   rrirpnxm_lst15_0     758878 non-null  float64
 2   rrirpnxm_lsthrx15_0  758878 non-null  float64
 3   rrirpnxm_toxhr_0     758878 non-null  float64
 4   relvol_nt_0          758878 non-null  float64
 5   relvol_lst15_0       758878 non-null  float64
 6   relvol_lsthrx15_0    758878 non-null  float64
 7   relvol_toxhr_0       758878 non-null  float64
 8   tonight              758878 non-null  float64
dtypes: float64(9)
memory usage: 57.9 MB


### Interaction Terms

In [7]:
importlib.reload(helper)
interactingTerms = [[relvol, col] for relvol,col in zip(relvol_cols,x_cols)]
interactingTerms

[['relvol_nt_0', 'rrirpnxm_nt_0'],
 ['relvol_lst15_0', 'rrirpnxm_lst15_0'],
 ['relvol_lsthrx15_0', 'rrirpnxm_lsthrx15_0'],
 ['relvol_toxhr_0', 'rrirpnxm_toxhr_0']]

In [8]:
train_df.columns

Index(['rrirpnxm_nt_0', 'rrirpnxm_lst15_0', 'rrirpnxm_lsthrx15_0',
       'rrirpnxm_toxhr_0', 'relvol_nt_0', 'relvol_lst15_0',
       'relvol_lsthrx15_0', 'relvol_toxhr_0', 'tonight'],
      dtype='object')

In [9]:
importlib.reload(helper)
interactingTerms_df = helper.get_df_with_interaction_terms(train_df, interactingTerms)

In [10]:
interactingTerms_df.columns

Index(['rrirpnxm_nt_0', 'rrirpnxm_lst15_0', 'rrirpnxm_lsthrx15_0',
       'rrirpnxm_toxhr_0', 'relvol_nt_0', 'relvol_lst15_0',
       'relvol_lsthrx15_0', 'relvol_toxhr_0', 'tonight',
       '('relvol_nt_0', 'rrirpnxm_nt_0')',
       '('relvol_lst15_0', 'rrirpnxm_lst15_0')',
       '('relvol_lsthrx15_0', 'rrirpnxm_lsthrx15_0')',
       '('relvol_toxhr_0', 'rrirpnxm_toxhr_0')'],
      dtype='object')

In [11]:
interactingTerms_df.head()

Unnamed: 0,rrirpnxm_nt_0,rrirpnxm_lst15_0,rrirpnxm_lsthrx15_0,rrirpnxm_toxhr_0,relvol_nt_0,relvol_lst15_0,relvol_lsthrx15_0,relvol_toxhr_0,tonight,"('relvol_nt_0', 'rrirpnxm_nt_0')","('relvol_lst15_0', 'rrirpnxm_lst15_0')","('relvol_lsthrx15_0', 'rrirpnxm_lsthrx15_0')","('relvol_toxhr_0', 'rrirpnxm_toxhr_0')"
0,0.011807,0.000872,-0.003254,0.003411,-0.146064,0.364969,-0.687345,-0.480458,0.018082,-0.001725,0.000318,0.002237,-0.001639
1,0.002462,0.001497,-0.002905,0.007703,0.371322,0.071521,-0.217405,0.254765,0.005392,0.000914,0.000107,0.000632,0.001962
2,-0.007588,-0.001964,0.008323,-0.001865,-0.825926,-0.808777,-1.799015,-2.003263,-0.003484,0.006267,0.001588,-0.014973,0.003736
3,0.001151,-0.001417,-0.002717,0.00205,-0.997409,-1.053669,-1.104886,-0.665714,0.002358,-0.001148,0.001493,0.003002,-0.001365
4,-0.024748,0.00729,0.008241,-0.007426,1.238193,0.597488,0.688099,1.075536,-0.005949,-0.030643,0.004356,0.005671,-0.007987


In [12]:
test_df.head()

Unnamed: 0,rrirpnxm_nt_0,rrirpnxm_lst15_0,rrirpnxm_lsthrx15_0,rrirpnxm_toxhr_0,relvol_nt_0,relvol_lst15_0,relvol_lsthrx15_0,relvol_toxhr_0,tonight
0,-0.008679,0.001378,-0.000874,-0.002152,1.545634,-0.12525,-0.166612,0.213301,-0.002419
1,0.001467,0.001382,-0.002988,0.01443,-1.695397,-2.074962,-0.545008,-0.614649,-0.007316
2,-0.006366,-0.005396,-0.004639,-0.00404,-2.200373,-0.73413,-0.002181,-1.813272,0.001317
3,0.018897,0.005662,0.007848,0.005312,0.031325,0.565102,1.972013,1.339065,-0.001082
4,-0.007411,0.000137,0.007318,0.012983,-0.37156,-0.427308,-0.199421,-0.577842,-0.004875


In [13]:
# rosy: making my own interacting terms

col_pairs = [
    ['relvol_nt_0', 'rrirpnxm_nt_0'],
    ["relvol_lst15_0", "rrirpnxm_lst15_0"],
    ["relvol_toxhr_0", "rrirpnxm_toxhr_0"],
    ["relvol_lsthrx15_0", "rrirpnxm_lsthrx15_0"],
]
interaction_terms_train_df = helper.get_df_with_interaction_terms(train_df, col_pairs)
interaction_terms_test_df = helper.get_df_with_interaction_terms(test_df, col_pairs)

### Transform the data if needed

#### Ordinary Least Squares

In [14]:
importlib.reload(helper)
ols_regression_model = helper.Regression('OLS')
model_attributes = ols_regression_model.execute(train_df, RESPONSE_NAME, test_df)
model_attributes

array([ 2.75945836e-02, -1.05238435e-02, -9.46327080e-03, -7.40189264e-03,
        3.25078674e-06,  4.38396523e-06, -1.22252426e-05, -4.84805211e-06,
        4.53573580e-05])

In [15]:
ols_regression_model.get_metric()

1. Weighted Correlation:
[[1.        0.0169153]
 [0.0169153 1.       ]]

2. Weighted Mean Return:
0.00016425082726671087

3. Weighted Scale Factor:
[0.45400396]



#### OLS w/ Interacting Terms

In [16]:
weights = helper.get_weights(train_df)
ols_interacting_model = helper.Regression('OLS')
ols_interacting_model.execute(interactingTerms_df, RESPONSE_NAME, interactingTerms_df, weights)
ols_interacting_model.get_metric()

1. Weighted Correlation:
[[1.         0.03062293]
 [0.03062293 1.        ]]

2. Weighted Mean Return:
0.00024943332656896097

3. Weighted Scale Factor:
[0.96172654]



#### LASSO

In [17]:
cv = 10

weights = helper.get_weights(train_df)
lasso_model = helper.Regression('LASSO')
lasso_model.execute(train_df, RESPONSE_NAME, test_df, cv, weights)
lasso_model.get_metric()

1. Weighted Correlation:
[[1.         0.01006693]
 [0.01006693 1.        ]]

2. Weighted Mean Return:
0.0001718312375909994

3. Weighted Scale Factor:
[0.34676277]



### XGBoost



In [18]:
importlib.reload(helper)
weights = helper.get_weights(train_df)
xgb_model = helper.Regression('XGBOOST')

In [19]:
model_attributes = xgb_model.execute(train_df, RESPONSE_NAME, test_df, weights)
xgb_model.get_metric()



1. Weighted Correlation:
[[1.         0.01196483]
 [0.01196483 1.        ]]

2. Weighted Mean Return:
0.0001346201191264064

3. Weighted Scale Factor:
[0.30265543]



get feature importance df

In [20]:
feature_importance = xgb_model.model.feature_importances_
feature_importance

array([ 4.2700906e+00, -1.0467501e+00, -1.4197532e+00, -8.0576015e-01,
        1.6635506e-03,  2.1685448e-03, -2.1877391e-03,  3.2627885e-04,
        2.0233319e-04], dtype=float32)

In [21]:
# Create a DataFrame to display feature importance
importance_df = pd.DataFrame({'Feature': train_df.columns, 'Importance': feature_importance})

# Sort the DataFrame by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
0,rrirpnxm_nt_0,4.270091
5,relvol_lst15_0,0.002169
4,relvol_nt_0,0.001664
7,relvol_toxhr_0,0.000326
8,tonight,0.000202
6,relvol_lsthrx15_0,-0.002188
3,rrirpnxm_toxhr_0,-0.80576
1,rrirpnxm_lst15_0,-1.04675
2,rrirpnxm_lsthrx15_0,-1.419753
