In [1]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import math
from sklearn.svm import SVR, LinearSVR, SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

In [2]:
# Put ../../src (relative to the working directory) at the front of the module
# search path so that function_cache can be imported below.
sys.path.insert(0, r'../../src')
# NOTE(review): the wildcard import hides which helpers this notebook actually
# uses and can silently shadow existing names; prefer importing the required
# names explicitly once they are known.
from function_cache import *

In [3]:
# Directory holding the price_before_close CSV files, relative to the notebook's
# working directory.
PRICE_BEFORE_CLOSE_DIR = '../../data/processed/price_before_close'


def _read_price_before_close(name):
    '''
    Load one CSV from PRICE_BEFORE_CLOSE_DIR.

    name is the file stem (e.g. 'T_0min'). The 'Trading_Day' column becomes the
    index and is parsed as dates, exactly as every table in this cell requires.
    '''
    path = '%s/%s.csv' % (PRICE_BEFORE_CLOSE_DIR, name)
    return pd.read_csv(path, index_col='Trading_Day', parse_dates=True)


# One table per product prefix (T / TF) and per minute offset (0, 5, 10, 15).
# NOTE(review): the offset presumably means "minutes before the close" — confirm
# against the script that produced these files.
T_0min = _read_price_before_close('T_0min')
TF_0min = _read_price_before_close('TF_0min')
T_5min = _read_price_before_close('T_5min')
TF_5min = _read_price_before_close('TF_5min')
T_10min = _read_price_before_close('T_10min')
TF_10min = _read_price_before_close('TF_10min')
T_15min = _read_price_before_close('T_15min')
TF_15min = _read_price_before_close('TF_15min')

In [4]:
# Candidate feature tables, one per product prefix; the first CSV column is used
# as the row index.
T_features = pd.read_csv(
    '../../data/processed/candidate_features/T_features.csv',
    index_col=0,
)
TF_features = pd.read_csv(
    '../../data/processed/candidate_features/TF_features.csv',
    index_col=0,
)

# Matching spread-change tables from which the regression target is taken later.
T_spread_change = pd.read_csv(
    '../../data/processed/different_ys/T_spread_change.csv',
    index_col=0,
)
TF_spread_change = pd.read_csv(
    '../../data/processed/different_ys/TF_spread_change.csv',
    index_col=0,
)

In [5]:
# Stack the T rows on top of the TF rows so both products are modelled together.
features = pd.concat([T_features, TF_features], axis=0)
spread_change = pd.concat([T_spread_change, TF_spread_change], axis=0)

In [6]:
# Regression target: the '0min_0min' column of the stacked spread-change table.
# NOTE(review): X and y are paired purely by row position — confirm that
# `features` and `spread_change` share the same row order.
y = spread_change['0min_0min'].values

# Standardise every feature column before fitting, so the Lasso penalty treats
# all features on a comparable scale.
scaler = StandardScaler()
X = scaler.fit_transform(features.values)

In [7]:
# Lasso regularisation strengths to sweep, from weakest to strongest.
alphas = [
    0.001, 0.002, 0.005,
    0.01, 0.015, 0.02,
    0.05, 0.1,
]

In [8]:
def lasso(X, y, feature_names, alphas, sample_index=None):
    '''
    Fit one Lasso regression per alpha and collect coefficients and fitted values.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Design matrix, passed unchanged to Lasso.fit and Lasso.predict.
    y : array-like, shape (n_samples,)
        Regression target.
    feature_names : list-like
        Labels for the columns of X; they become the index of the coefficient table.
    alphas : iterable of float
        Regularisation strengths; one model is fitted for each value.
    sample_index : index-like, optional
        Labels for the rows of X, used as the index of the prediction table.
        When omitted, the notebook-level ``features.index`` is used so existing
        calls keep working; pass it explicitly to avoid relying on that global.

    Returns
    -------
    coef_df : pandas.DataFrame
        Indexed by feature, one 'Alpha = ...' column per alpha. Coefficients that
        are exactly 0 are replaced by NaN so dropped features stand out.
    pred_df : pandas.DataFrame
        Indexed by sample, one 'Alpha = ...' column per alpha, holding the
        in-sample predictions on X.
    '''
    if sample_index is None:
        # Legacy fallback: the original implementation always read the global.
        sample_index = features.index

    # Build fresh Index objects carrying their names. Assigning `.name` on a
    # frame's index afterwards can rename the caller's own Index object in
    # pandas versions where the constructor reuses the object it was given.
    coef_df = pd.DataFrame(index=pd.Index(feature_names, name='feature'))
    # Rows of the prediction table are samples, not features.
    pred_df = pd.DataFrame(index=pd.Index(sample_index, name='sample'))

    for alpha in alphas:
        # `model`, not `lasso`, so the local does not shadow this function's name.
        model = Lasso(alpha=alpha)
        model.fit(X, y)

        column_name = 'Alpha = %f' % alpha

        # One column of coefficients and one of fitted values per alpha.
        coef_df[column_name] = model.coef_
        pred_df[column_name] = model.predict(X)

    # Blank out exact zeros so the features Lasso eliminated are easy to spot.
    coef_df = coef_df.replace(0, np.nan)

    return coef_df, pred_df

In [9]:
# Sweep all alphas on the standardised features; keep coefficients and fitted values.
coef_df, pred_df = lasso(
    X,
    y,
    feature_names=features.columns.tolist(),
    alphas=alphas,
)

In [10]:
# Show the coefficient table (rows: features, columns: alphas). NaN marks a
# coefficient that was exactly 0, as replaced inside lasso().
coef_df

Unnamed: 0_level_0,Alpha = 0.001000,Alpha = 0.002000,Alpha = 0.005000,Alpha = 0.010000,Alpha = 0.015000,Alpha = 0.020000,Alpha = 0.050000,Alpha = 0.100000
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5d_spread_change,-0.03958,-0.044372,-0.040157,-0.000778,,,,
10d_spread_change,-0.189397,-0.180176,-0.119424,,,,,
15d_spread_change,0.244216,0.22815,0.140678,,,,,
20d_spread_change,0.01837,0.021606,0.033092,,0.002011,0.005613,0.000631,
10d_act_price_exp_mva,0.009944,0.031015,0.050021,0.013561,0.010306,0.003146,,
-1d_corr_act_price_def_OI,,,,,,,,
-1d_corr_def_price_def_OI,-0.177268,-0.158584,-0.100379,-0.081949,-0.066705,-0.052994,,
10d_spread_std_divided_by_std,0.13402,0.108698,0.055459,0.035052,0.022921,0.011471,,
10d_current_price,-0.001669,,,,,,,
1d_z_diff_act_price_def_price,-0.094975,-0.099472,-0.096595,-0.082802,-0.069358,-0.057075,-0.010371,


In [11]:
# Show the fitted values per alpha. These are in-sample predictions: each model
# predicts on the same X it was fitted on.
pred_df

Unnamed: 0_level_0,Alpha = 0.001000,Alpha = 0.002000,Alpha = 0.005000,Alpha = 0.010000,Alpha = 0.015000,Alpha = 0.020000,Alpha = 0.050000,Alpha = 0.100000
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
T1509,0.380031,0.368227,0.346133,0.318713,0.310265,0.297212,0.156473,0.08302
T1512,0.619809,0.580046,0.481844,0.369319,0.34601,0.324021,0.202134,0.102206
T1603,0.361122,0.36875,0.373991,0.361281,0.363001,0.362924,0.24837,0.128672
T1606,0.06337,0.03049,0.03928,0.10802,0.134641,0.161307,0.163272,0.12548
T1609,-0.075527,-0.050917,-0.007346,-0.028891,-0.035547,-0.035818,-0.019834,0.031608
T1612,0.323959,0.331051,0.348546,0.299486,0.257504,0.224942,0.139829,0.10352
T1703,0.495404,0.483449,0.432675,0.376693,0.353227,0.333315,0.241788,0.112105
T1706,-0.448235,-0.411899,-0.316521,-0.231127,-0.197272,-0.159246,0.011914,0.044898
T1709,0.00807,-0.029425,-0.120441,-0.152246,-0.149604,-0.140268,-0.023796,0.054202
T1712,0.110075,0.102102,0.085524,0.059588,0.040232,0.036255,0.026573,0.07084


In [12]:
# Persist the coefficient table (one row per feature, one column per alpha) as
# 'Lasso_Reg.csv'; the path is relative, so it lands in the current working directory.
coef_df.to_csv('Lasso_Reg.csv')