# Load Data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel

import pickle

# load dataset
data = pd.read_csv('./BTC_sum_plus_nn_features.csv', nrows=30000)


# In practice, feature selection should be done after data pre-processing,
# so ideally, all the categorical variables are encoded into numbers,
# and then you can assess how deterministic they are of the target

# here for simplicity I will use only numerical variables
# select numerical columns:

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]

# Remove Row Data to Selection

In [2]:
print("Before remove dv1 threshold : " , data.shape)

# 'dv1_realized_volatility' 열의 25%와 75% 백분위수 계산
percentile_25 = data['dv1_realized_volatility'].quantile(0.25)
percentile_75 = data['dv1_realized_volatility'].quantile(0.75)

# 0~25%와 75~100% 범위 밖의 행 제거
data = data[(data['dv1_realized_volatility'] <= percentile_25) | (data['dv1_realized_volatility'] >= percentile_75)]

print("After remove dv1 threshold : " , data.shape)

Before remove dv1 threshold :  (30000, 14686)
After remove dv1 threshold :  (15000, 14686)


In [3]:
y = data['dv1_realized_volatility']
X = data.drop('dv1_realized_volatility', axis=1)

# Feature Selection (0)
## Remove Corrlative Features

In [5]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    
    for i in range(len(corr_matrix.columns)):
        if i % 10 == 0:
            print("progressed : ", (i/len(corr_matrix.columns)*100))        
        for j in range(i) :
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

# note that we reduce the correlation threshold
# to remove more features
corr_features = correlation(X.sample(n=2000, random_state=42), 0.9)
print('correlated features: ', len(set(corr_features)) )

progressed :  0.0
progressed :  0.06809669731018045
progressed :  0.1361933946203609
progressed :  0.20429009193054137
progressed :  0.2723867892407218
progressed :  0.34048348655090227
progressed :  0.40858018386108275
progressed :  0.4766768811712632
progressed :  0.5447735784814436
progressed :  0.6128702757916241
progressed :  0.6809669731018045
progressed :  0.7490636704119851
progressed :  0.8171603677221655
progressed :  0.8852570650323459
progressed :  0.9533537623425264
progressed :  1.0214504596527068
progressed :  1.0895471569628872
progressed :  1.1576438542730678
progressed :  1.2257405515832482
progressed :  1.2938372488934287
progressed :  1.361933946203609
progressed :  1.4300306435137897
progressed :  1.4981273408239701
progressed :  1.5662240381341503
progressed :  1.634320735444331
progressed :  1.7024174327545116
progressed :  1.7705141300646918
progressed :  1.8386108273748722
progressed :  1.906707524685053
progressed :  1.974804221995233
progressed :  2.042900919

progressed :  17.228464419475657
progressed :  17.296561116785835
progressed :  17.364657814096017
progressed :  17.432754511406195
progressed :  17.500851208716377
progressed :  17.56894790602656
progressed :  17.637044603336737
progressed :  17.70514130064692
progressed :  17.7732379979571
progressed :  17.84133469526728
progressed :  17.90943139257746
progressed :  17.97752808988764
progressed :  18.04562478719782
progressed :  18.113721484508
progressed :  18.181818181818183
progressed :  18.24991487912836
progressed :  18.318011576438543
progressed :  18.38610827374872
progressed :  18.454204971058903
progressed :  18.522301668369085
progressed :  18.590398365679263
progressed :  18.658495062989445
progressed :  18.726591760299627
progressed :  18.794688457609805
progressed :  18.862785154919987
progressed :  18.930881852230165
progressed :  18.998978549540347
progressed :  19.06707524685053
progressed :  19.13517194416071
progressed :  19.20326864147089
progressed :  19.271365338

progressed :  34.38883214164113
progressed :  34.45692883895131
progressed :  34.52502553626149
progressed :  34.59312223357167
progressed :  34.66121893088185
progressed :  34.72931562819203
progressed :  34.797412325502215
progressed :  34.86550902281239
progressed :  34.93360572012257
progressed :  35.00170241743275
progressed :  35.069799114742935
progressed :  35.13789581205312
progressed :  35.2059925093633
progressed :  35.27408920667347
progressed :  35.34218590398366
progressed :  35.41028260129384
progressed :  35.47837929860402
progressed :  35.5464759959142
progressed :  35.614572693224375
progressed :  35.68266939053456
progressed :  35.75076608784474
progressed :  35.81886278515492
progressed :  35.8869594824651
progressed :  35.95505617977528
progressed :  36.023152877085465
progressed :  36.09124957439564
progressed :  36.15934627170582
progressed :  36.227442969016
progressed :  36.295539666326185
progressed :  36.36363636363637
progressed :  36.43173306094654
progress

progressed :  51.753489955737145
progressed :  51.821586653047326
progressed :  51.88968335035751
progressed :  51.95778004766769
progressed :  52.02587674497787
progressed :  52.093973442288046
progressed :  52.16207013959823
progressed :  52.23016683690841
progressed :  52.29826353421859
progressed :  52.36636023152877
progressed :  52.43445692883895
progressed :  52.50255362614913
progressed :  52.57065032345931
progressed :  52.63874702076949
progressed :  52.706843718079675
progressed :  52.77494041538985
progressed :  52.84303711270003
progressed :  52.91113381001021
progressed :  52.979230507320395
progressed :  53.04732720463058
progressed :  53.11542390194075
progressed :  53.18352059925093
progressed :  53.251617296561115
progressed :  53.3197139938713
progressed :  53.38781069118148
progressed :  53.45590738849167
progressed :  53.524004085801835
progressed :  53.592100783112016
progressed :  53.6601974804222
progressed :  53.72829417773238
progressed :  53.79639087504257
pr

progressed :  69.18624446714334
progressed :  69.25434116445352
progressed :  69.3224378617637
progressed :  69.39053455907388
progressed :  69.45863125638407
progressed :  69.52672795369425
progressed :  69.59482465100443
progressed :  69.66292134831461
progressed :  69.73101804562478
progressed :  69.79911474293496
progressed :  69.86721144024514
progressed :  69.93530813755532
progressed :  70.0034048348655
progressed :  70.07150153217569
progressed :  70.13959822948587
progressed :  70.20769492679605
progressed :  70.27579162410623
progressed :  70.34388832141642
progressed :  70.4119850187266
progressed :  70.48008171603676
progressed :  70.54817841334695
progressed :  70.61627511065713
progressed :  70.68437180796732
progressed :  70.7524685052775
progressed :  70.82056520258767
progressed :  70.88866189989785
progressed :  70.95675859720804
progressed :  71.02485529451822
progressed :  71.0929519918284
progressed :  71.16104868913857
progressed :  71.22914538644875
progressed : 

progressed :  86.7551923731699
progressed :  86.82328907048009
progressed :  86.89138576779027
progressed :  86.95948246510045
progressed :  87.02757916241062
progressed :  87.0956758597208
progressed :  87.16377255703098
progressed :  87.23186925434116
progressed :  87.29996595165134
progressed :  87.36806264896153
progressed :  87.43615934627171
progressed :  87.50425604358189
progressed :  87.57235274089207
progressed :  87.64044943820225
progressed :  87.70854613551244
progressed :  87.7766428328226
progressed :  87.84473953013278
progressed :  87.91283622744297
progressed :  87.98093292475315
progressed :  88.04902962206333
progressed :  88.11712631937351
progressed :  88.1852230166837
progressed :  88.25331971399387
progressed :  88.32141641130406
progressed :  88.38951310861424
progressed :  88.4576098059244
progressed :  88.52570650323459
progressed :  88.59380320054477
progressed :  88.66189989785495
progressed :  88.72999659516513
progressed :  88.79809329247531
progressed : 

In [8]:
# Set을 List로 변환
corr_features = list(corr_features)

# List를 Pickle로 저장
with open('corr_list_2.pkl', 'wb') as f:
    pickle.dump(corr_features, f)

# 저장된 Pickle 파일을 로드하는 방법
with open('corr_list_2.pkl', 'rb') as f:
    corr_features = pickle.load(f)

print(len(corr_features))

10725


In [9]:
X.drop(labels=corr_features, axis=1, inplace=True)

In [10]:
X.shape

(15000, 3960)

# Feature Selction A

## Exhaustive Feature Selection

This exhaustive feature selection algorithm is a wrapper approach for brute-force evaluation of feature subsets; the best subset is selected by optimizing a specified performance metric given an arbitrary regressor or classifier.

In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [None]:
# exhaustive search

# in order to shorter search time for the demonstration
# i will ask the algorithm to try all possible 10 and 11
# feature combinations

# if you have access to a multicore or distributed computer
# system you can try more greedy searches

efs = EFS(RandomForestRegressor(n_estimators=5,
                                n_jobs=4,
                                random_state=0,
                                max_depth=2),
          min_features=1,
          max_features=2,
          scoring='r2',
          print_progress=True,
          cv=2)

efs = efs.fit(X, y)

In [None]:
data.columns[list(efs.best_idx_)]

# Feature Selction B

## Embedded Methods: Linear Model Coefficients

train a Linear regression model and select features with higher coefficients.

In [None]:
# the features in the house dataset are in different scales
# so we train a scaler to scale them

scaler = StandardScaler()
scaler.fit(X)

# the LinearRegression object from sklearn is a non-regularised
# linear method. It fits by matrix multiplication and not 
# gradient descent.
# therefore we don't need to specify penalty and other parameters

sel_ = SelectFromModel(LinearRegression())

sel_.fit(scaler.transform(X), y)

# let's count the number of variables selected
selected_feat = X.columns[(sel_.get_support())]

# 저장할 파일 이름
file_name = "selected_feat_linear_coeff.pkl"

# selected_feat 변수를 pkl 파일로 저장
with open(file_name, 'wb') as file:
    pickle.dump(selected_feat, file)

print(f"{file_name} 파일이 저장되었습니다.")

print(len(selected_feat))


# and now, let's compare the  amount of selected features
# with the amount of features which coefficient is above the
# mean coefficient, to make sure we understand the output of
# sklearn

print('total features: {}'.format((X.shape[1])))

print('selected features: {}'.format(len(selected_feat)))

print(
    'features with coefficients greater than the mean coefficient: {}'.format(
        np.sum(
            np.abs(sel_.estimator_.coef_) > np.abs(
                sel_.estimator_.coef_).mean())))

In [None]:
selected_feat

# Feature Selction C

## Embedded Methods: Tree Importance

train a Linear regression model and select features with higher coefficients.

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# RandomForestRegressor 모델 학습
model = RandomForestRegressor(n_estimators=128, random_state=42, verbose=True)
model.fit(X, y)

# 특성 중요도 출력
feature_importances = model.feature_importances_
print("Feature Importances:", feature_importances)

# 특성 선택: 중요도가 높은 순서대로 특성 선택 (임계값 설정 가능)
selected_features_indices = feature_importances.argsort()[::-1]  # 중요도 내림차순 정렬
selected_features = X.columns[selected_features_indices]
print("Selected Features:", selected_features)

In [None]:
# 저장할 파일 이름
file_name = "selected_feat_ranfo.pkl"

# selected_feat 변수를 pkl 파일로 저장
with open(file_name, 'wb') as file:
    pickle.dump(selected_features, file)

print(f"{file_name} 파일이 저장되었습니다.")

print(len(selected_features))

In [None]:
# 리스트를 DataFrame으로 변환
data = {'Name': list(selected_features), 'Importance': sorted(list(feature_importances), reverse=True)}
df = pd.DataFrame(data)

# DataFrame 출력
df.head(50)