# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

In [2]:
data = pd.read_csv("./data/ETFs_main.csv")
data.head()

Unnamed: 0,Dates,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO
0,2007-02-20,146.04,145.56,146.2,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055
1,2007-02-21,145.98,145.61,146.07,145.0,63971500.0,67.28,82.9,2.3653,0.32,49.86,25.12,10.2,39.975
2,2007-02-22,145.87,146.05,146.42,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.22
3,2007-02-23,145.3,145.74,145.79,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035
4,2007-02-26,145.17,145.83,145.95,145.0,69320062.0,68.1,83.08,2.3795,0.31,50.9,25.04,11.15,39.96


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2771 entries, 0 to 2770
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Dates         2771 non-null   object 
 1   CLOSE_SPY     2771 non-null   float64
 2   OPEN          2771 non-null   float64
 3   HIGH          2771 non-null   float64
 4   LOW           2771 non-null   float64
 5   VOLUME        2771 non-null   float64
 6   CLOSE_GLD     2771 non-null   float64
 7   CLOSE_FXY     2771 non-null   float64
 8   CLOSE_T10Y2Y  2771 non-null   float64
 9   CLOSE_TED     2771 non-null   float64
 10  CLOSE_USO     2771 non-null   float64
 11  CLOSE_UUP     2771 non-null   float64
 12  CLOSE_VIX     2771 non-null   float64
 13  CLOSE_VWO     2771 non-null   float64
dtypes: float64(13), object(1)
memory usage: 303.2+ KB


In [4]:
data['Dates'] = pd.to_datetime(data['Dates'])
data = data.set_index('Dates')
data.head()

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2007-02-20,146.04,145.56,146.2,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055
2007-02-21,145.98,145.61,146.07,145.0,63971500.0,67.28,82.9,2.3653,0.32,49.86,25.12,10.2,39.975
2007-02-22,145.87,146.05,146.42,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.22
2007-02-23,145.3,145.74,145.79,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035
2007-02-26,145.17,145.83,145.95,145.0,69320062.0,68.1,83.08,2.3795,0.31,50.9,25.04,11.15,39.96


# Technical Indicator
* MA_45: 45일 단순 이동평균(Simple Moving Average): 특정 자산의 45일간 평균 종가
* VMA_45: 45일 거래량 (Volume Moving Average): 지난 45일간의 평균 거래량
* RSI_14: 14일 상대강도지수(Relative Strength Index): 14일 동안의 자산 가격 변동을 바탕으로 과매수 또는 과매도 상태를 평가하는 지표

In [5]:
days = 45

In [6]:
data.columns

Index(['CLOSE_SPY', 'OPEN', 'HIGH', 'LOW', 'VOLUME', 'CLOSE_GLD', 'CLOSE_FXY',
       'CLOSE_T10Y2Y', 'CLOSE_TED', 'CLOSE_USO', 'CLOSE_UUP', 'CLOSE_VIX',
       'CLOSE_VWO'],
      dtype='object')

In [7]:
ma = pd.Series(data['CLOSE_SPY'].rolling(window=days).mean(), name='MA_' + str(days))
ma

Dates
2007-02-20           NaN
2007-02-21           NaN
2007-02-22           NaN
2007-02-23           NaN
2007-02-26           NaN
                 ...    
2018-12-20    269.767778
2018-12-21    269.018889
2018-12-24    267.995333
2018-12-27    267.275778
2018-12-28    266.639111
Name: MA_45, Length: 2771, dtype: float64

In [8]:
data = data.join(ma)
data.head()

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2007-02-20,146.04,145.56,146.2,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,
2007-02-21,145.98,145.61,146.07,145.0,63971500.0,67.28,82.9,2.3653,0.32,49.86,25.12,10.2,39.975,
2007-02-22,145.87,146.05,146.42,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.22,
2007-02-23,145.3,145.74,145.79,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,
2007-02-26,145.17,145.83,145.95,145.0,69320062.0,68.1,83.08,2.3795,0.31,50.9,25.04,11.15,39.96,


In [9]:
# 거래량 이동평균 vma
vma = pd.Series(data['VOLUME'].rolling(days).mean(), name="VMA_" + str(days))
vma

Dates
2007-02-20             NaN
2007-02-21             NaN
2007-02-22             NaN
2007-02-23             NaN
2007-02-26             NaN
                  ...     
2018-12-20    1.240592e+08
2018-12-21    1.274610e+08
2018-12-24    1.281067e+08
2018-12-27    1.297876e+08
2018-12-28    1.301996e+08
Name: VMA_45, Length: 2771, dtype: float64

In [10]:
data = data.join(vma)
data.head()

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2007-02-20,146.04,145.56,146.2,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,,
2007-02-21,145.98,145.61,146.07,145.0,63971500.0,67.28,82.9,2.3653,0.32,49.86,25.12,10.2,39.975,,
2007-02-22,145.87,146.05,146.42,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.22,,
2007-02-23,145.3,145.74,145.79,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,,
2007-02-26,145.17,145.83,145.95,145.0,69320062.0,68.1,83.08,2.3795,0.31,50.9,25.04,11.15,39.96,,


In [11]:
# diff 를 통해 변화량 구하기
delta = data['CLOSE_SPY'].diff()
delta

Dates
2007-02-20      NaN
2007-02-21    -0.06
2007-02-22    -0.11
2007-02-23    -0.57
2007-02-26    -0.13
              ...  
2018-12-20    -4.09
2018-12-21    -6.47
2018-12-24    -6.36
2018-12-27    13.73
2018-12-28    -0.32
Name: CLOSE_SPY, Length: 2771, dtype: float64

In [12]:
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
gain

Dates
2007-02-20         NaN
2007-02-21         NaN
2007-02-22         NaN
2007-02-23         NaN
2007-02-26         NaN
                ...   
2018-12-20    0.515000
2018-12-21    0.515000
2018-12-24    0.395714
2018-12-27    1.115714
2018-12-28    1.115714
Name: CLOSE_SPY, Length: 2771, dtype: float64

In [13]:
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
loss

Dates
2007-02-20         NaN
2007-02-21         NaN
2007-02-22         NaN
2007-02-23         NaN
2007-02-26         NaN
                ...   
2018-12-20    2.472857
2018-12-21    2.892143
2018-12-24    3.346429
2018-12-27    3.346429
2018-12-28    2.722857
Name: CLOSE_SPY, Length: 2771, dtype: float64

In [14]:
RS = gain / loss
RS

Dates
2007-02-20         NaN
2007-02-21         NaN
2007-02-22         NaN
2007-02-23         NaN
2007-02-26         NaN
                ...   
2018-12-20    0.208261
2018-12-21    0.178069
2018-12-24    0.118250
2018-12-27    0.333404
2018-12-28    0.409759
Name: CLOSE_SPY, Length: 2771, dtype: float64

In [15]:
RSI = 100 - ( 100 / (1 + RS))
RSI

Dates
2007-02-20          NaN
2007-02-21          NaN
2007-02-22          NaN
2007-02-23          NaN
2007-02-26          NaN
                ...    
2018-12-20    17.236433
2018-12-21    15.115304
2018-12-24    10.574537
2018-12-27    25.004002
2018-12-28    29.065873
Name: CLOSE_SPY, Length: 2771, dtype: float64

In [16]:
RSI.name = 'RSI_14' 
RSI

Dates
2007-02-20          NaN
2007-02-21          NaN
2007-02-22          NaN
2007-02-23          NaN
2007-02-26          NaN
                ...    
2018-12-20    17.236433
2018-12-21    15.115304
2018-12-24    10.574537
2018-12-27    25.004002
2018-12-28    29.065873
Name: RSI_14, Length: 2771, dtype: float64

In [17]:
data = data.join(RSI)
data.head()

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2007-02-20,146.04,145.56,146.2,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,,,
2007-02-21,145.98,145.61,146.07,145.0,63971500.0,67.28,82.9,2.3653,0.32,49.86,25.12,10.2,39.975,,,
2007-02-22,145.87,146.05,146.42,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.22,,,
2007-02-23,145.3,145.74,145.79,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,,,
2007-02-26,145.17,145.83,145.95,145.0,69320062.0,68.1,83.08,2.3795,0.31,50.9,25.04,11.15,39.96,,,


In [18]:
data.loc[:, 'pct_change'] = data['CLOSE_SPY'].pct_change()
data.head()

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2007-02-20,146.04,145.56,146.2,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,,,,
2007-02-21,145.98,145.61,146.07,145.0,63971500.0,67.28,82.9,2.3653,0.32,49.86,25.12,10.2,39.975,,,,-0.000411
2007-02-22,145.87,146.05,146.42,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.22,,,,-0.000754
2007-02-23,145.3,145.74,145.79,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,,,,-0.003908
2007-02-26,145.17,145.83,145.95,145.0,69320062.0,68.1,83.08,2.3795,0.31,50.9,25.04,11.15,39.96,,,,-0.000895


In [19]:
data.loc[:,'target'] = np.where(data['pct_change'] > 0, 1, 0)
data.head(20)

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-02-20,146.04,145.56,146.2,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,,,,,0
2007-02-21,145.98,145.61,146.07,145.0,63971500.0,67.28,82.9,2.3653,0.32,49.86,25.12,10.2,39.975,,,,-0.000411,0
2007-02-22,145.87,146.05,146.42,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.22,,,,-0.000754,0
2007-02-23,145.3,145.74,145.79,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,,,,-0.003908,0
2007-02-26,145.17,145.83,145.95,145.0,69320062.0,68.1,83.08,2.3795,0.31,50.9,25.04,11.15,39.96,,,,-0.000895,0
2007-02-27,139.5,143.88,144.2,139.0,271580281.0,65.41,84.8,2.3463,0.37,49.95,24.9,18.31,37.045,,,,-0.039058,0
2007-02-28,140.93,140.39,141.98,140.0,177535297.0,66.48,84.62,2.3784,0.34,51.18,24.9401,15.42,37.78,,,,0.010251,1
2007-03-02,138.67,140.05,140.66,139.0,162572406.0,63.71,85.85,2.3624,0.37,51.01,24.96,18.61,36.6,,,,-0.016036,0
2007-03-05,137.35,137.93,139.58,137.0,143583406.0,62.93,86.45,2.343,0.36,49.62,25.12,19.63,35.55,,,,-0.009519,0
2007-03-06,139.7,138.78,140.12,138.0,143328406.0,64.15,85.81,2.35,0.35,50.32,25.1,15.96,37.25,,,,0.01711,1


In [20]:
# 다음날 예측을 위한 타겟 변수 shift
data['target'] = data['target'].shift(-1)
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-02-20,146.04,145.56,146.200,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055,,,,,0.0
2007-02-21,145.98,145.61,146.070,145.0,63971500.0,67.28,82.90,2.3653,0.32,49.86,25.12,10.20,39.975,,,,-0.000411,0.0
2007-02-22,145.87,146.05,146.420,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.220,,,,-0.000754,0.0
2007-02-23,145.30,145.74,145.790,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035,,,,-0.003908,0.0
2007-02-26,145.17,145.83,145.950,145.0,69320062.0,68.10,83.08,2.3795,0.31,50.90,25.04,11.15,39.960,,,,-0.000895,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.87,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278,0.0
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.87,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176,0.0
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.55,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537,-0.026423,1.0
2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.00,1.7581,0.44,9.62,25.57,29.96,37.900,267.275778,1.297876e+08,25.004002,0.058590,0.0


In [21]:
data = data.dropna()
data

Unnamed: 0_level_0,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14,pct_change,target
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2007-04-27,149.53,149.09,149.740,149.0,106984094.0,67.56,83.7300,2.4474,0.55,51.84,24.54,12.45,41.750,143.551556,1.106696e+08,83.438685,-0.000802,0.0
2007-04-30,148.29,149.64,149.740,148.0,100874203.0,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720,-0.008293,1.0
2007-05-02,149.54,148.90,149.950,149.0,87129805.0,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288,0.008429,1.0
2007-05-03,150.35,149.97,150.400,149.0,87204945.0,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579,0.005417,1.0
2007-05-04,150.92,150.75,151.120,150.0,96408930.0,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765,0.003791,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-19,251.26,255.17,259.400,249.0,214992797.0,117.43,84.8300,1.7824,0.44,10.02,25.97,25.58,37.890,270.407333,1.225288e+08,30.487250,-0.016056,0.0
2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433,-0.016278,0.0
2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304,-0.026176,0.0
2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537,-0.026423,1.0


In [22]:
data.columns

Index(['CLOSE_SPY', 'OPEN', 'HIGH', 'LOW', 'VOLUME', 'CLOSE_GLD', 'CLOSE_FXY',
       'CLOSE_T10Y2Y', 'CLOSE_TED', 'CLOSE_USO', 'CLOSE_UUP', 'CLOSE_VIX',
       'CLOSE_VWO', 'MA_45', 'VMA_45', 'RSI_14', 'pct_change', 'target'],
      dtype='object')

In [23]:
X = data.drop(['CLOSE_SPY', 'OPEN', 'HIGH', 'LOW', 'VOLUME','pct_change','target'], axis=1)
y = data['target']

In [24]:
X

Unnamed: 0_level_0,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-04-27,67.56,83.7300,2.4474,0.55,51.84,24.54,12.45,41.750,143.551556,1.106696e+08,83.438685
2007-04-30,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720
2007-05-02,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288
2007-05-03,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579
2007-05-04,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765
...,...,...,...,...,...,...,...,...,...,...,...
2018-12-19,117.43,84.8300,1.7824,0.44,10.02,25.97,25.58,37.890,270.407333,1.225288e+08,30.487250
2018-12-20,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.180,269.767778,1.240592e+08,17.236433
2018-12-21,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.870,269.018889,1.274610e+08,15.115304
2018-12-24,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.320,267.995333,1.281067e+08,10.574537


In [25]:
y

Dates
2007-04-27    0.0
2007-04-30    1.0
2007-05-02    1.0
2007-05-03    1.0
2007-05-04    0.0
             ... 
2018-12-19    0.0
2018-12-20    0.0
2018-12-21    0.0
2018-12-24    1.0
2018-12-27    0.0
Name: target, Length: 2726, dtype: float64

In [26]:
y.value_counts()

target
1.0    1471
0.0    1255
Name: count, dtype: int64

# Holdout
* 시계열 데이터이기 때문에 홀드아웃시 날짜가 섞이면 안됨
* train_test_split 옵션에서 shuffle 옵션을 반드시 False

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=3)

In [29]:
X_train

Unnamed: 0_level_0,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2007-04-27,67.56,83.7300,2.4474,0.55,51.84,24.54,12.45,41.750,143.551556,1.106696e+08,83.438685
2007-04-30,67.09,83.7166,2.4361,0.57,51.24,24.49,14.22,40.935,143.601556,1.116466e+08,70.956720
2007-05-02,66.66,83.3800,2.4366,0.59,49.59,24.66,13.08,42.020,143.680667,1.121613e+08,79.237288
2007-05-03,67.49,83.1100,2.4346,0.60,49.28,24.69,13.09,42.435,143.780222,1.123421e+08,79.604579
2007-05-04,68.19,83.2300,2.4006,0.60,48.30,24.60,12.91,42.595,143.905111,1.128853e+08,79.411765
...,...,...,...,...,...,...,...,...,...,...,...
2015-06-15,113.73,78.7000,1.8717,0.26,20.15,24.87,15.39,41.130,210.623089,9.845141e+07,44.292893
2015-06-16,113.32,78.7000,1.9052,0.28,20.26,24.92,14.81,41.220,210.673644,9.836155e+07,40.627391
2015-06-17,113.85,78.6800,1.9232,0.28,20.21,24.70,14.50,41.480,210.711200,9.927621e+07,42.900532
2015-06-18,115.32,79.0300,1.9083,0.27,20.36,24.62,13.19,41.850,210.772089,1.013461e+08,55.840456


In [30]:
X_test

Unnamed: 0_level_0,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO,MA_45,VMA_45,RSI_14
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-06-22,113.64,78.7100,1.9294,0.27,20.16,24.73,12.74,42.01,210.863644,1.024936e+08,51.610942
2015-06-23,112.89,78.3900,1.9415,0.27,20.47,25.02,12.11,42.39,210.899422,1.018036e+08,50.374065
2015-06-24,112.59,78.4199,1.9277,0.27,20.21,24.97,13.26,42.17,210.902311,1.023230e+08,51.174869
2015-06-25,112.44,78.5800,1.9315,0.27,19.98,24.93,14.01,42.02,210.944756,1.002339e+08,50.280025
2015-06-26,112.56,78.4200,1.9364,0.27,19.98,25.00,14.02,41.25,210.944089,1.005003e+08,54.520918
...,...,...,...,...,...,...,...,...,...,...,...
2018-12-19,117.43,84.8300,1.7824,0.44,10.02,25.97,25.58,37.89,270.407333,1.225288e+08,30.487250
2018-12-20,119.24,85.8700,1.7807,0.48,9.72,25.77,28.38,38.18,269.767778,1.240592e+08,17.236433
2018-12-21,118.72,85.8700,1.7651,0.48,9.57,25.94,30.11,37.87,269.018889,1.274610e+08,15.115304
2018-12-24,120.02,86.5500,1.7505,0.40,9.29,25.55,36.07,37.32,267.995333,1.281067e+08,10.574537


# Boosting Model

In [31]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [32]:
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, random_state=3)
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.48      0.75      0.59       384
         1.0       0.57      0.29      0.39       434

    accuracy                           0.51       818
   macro avg       0.53      0.52      0.49       818
weighted avg       0.53      0.51      0.48       818



# Bagging Model

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rfc = RandomForestClassifier(n_estimators=1000, max_depth=3, n_jobs=-1, bootstrap=False)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.49      0.44      0.46       384
         1.0       0.54      0.59      0.56       434

    accuracy                           0.52       818
   macro avg       0.51      0.51      0.51       818
weighted avg       0.52      0.52      0.52       818



# Hyperparameter Tuning

In [45]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
import os
import json

In [68]:
# RandomForestRegressor에 맞는 하이퍼파라미터 설정
ts_splited = TimeSeriesSplit(n_splits=5).split(X_train)
params = {
    'bootstrap': [True, False],  # True 또는 False 모두 사용
    'n_estimators': np.arange(50, 1000, 50),  # 50에서 1000까지 50 단위로 범위 확장
    'max_depth': np.arange(2, 20, 2),  # 2에서 20까지 2 단위로 범위 확장
    'min_samples_leaf': np.arange(1, 10),  # 리프 노드의 최소 샘플 수 1에서 10까지
    'min_samples_split': np.arange(2, 20, 2),  # 노드 분할 최소 샘플 수 2에서 20까지 2 단위
    'max_features': ['sqrt', 'log2', None],  # 특성 수: sqrt, log2, 모든 특성(None)
    'criterion': ['squared_error', 'absolute_error', 'poisson', 'friedman_mse'],  # 회귀 문제에 맞는 손실 함수
    'random_state': [42]  # 재현성을 위한 랜덤 시드 설정
}

In [69]:
# 폴더와 파일 경로 설정
params_folder = "best_params"
params_file = os.path.join(params_folder, "best_params.json")

# 폴더가 없으면 생성
if not os.path.exists(params_folder):
    os.makedirs(params_folder)

# 이미 저장된 최적 파라미터가 있는지 확인
if os.path.exists(params_file):
    # 파일이 있으면 불러오기
    with open(params_file, 'r') as file:
        best_params = json.load(file)
    print("Previously saved best parameters loaded: ", best_params)

else:
    # 최적 파라미터가 없으면 RandomizedSearchCV 수행
    random_cv = RandomizedSearchCV(RandomForestRegressor(), param_distributions=params, n_iter=100, cv=ts_splited, n_jobs=-1, random_state=42)
    random_cv.fit(X_train, y_train)
    
    # 최적의 파라미터와 성능 출력
    best_params = random_cv.best_params_
    best_score = random_cv.best_score_
    
    print("Best parameters found: ", best_params)
    print("Best score: ", best_score)
    
    # NumPy 타입을 Python 기본 타입으로 변환
    best_params = {key: int(value) if isinstance(value, np.int64) else value for key, value in best_params.items()}
    
    # 최적의 파라미터를 파일로 저장
    with open(params_file, 'w') as file:
        json.dump(best_params, file)
    print(f"Best parameters saved to {params_file}")

Best parameters found:  {'random_state': 42, 'n_estimators': np.int64(600), 'min_samples_split': np.int64(2), 'min_samples_leaf': np.int64(8), 'max_features': 'log2', 'max_depth': np.int64(2), 'criterion': 'squared_error', 'bootstrap': True}
Best score:  -0.004229583145444504
Best parameters saved to best_params/best_params.json


# Prediction

In [75]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
import json

In [71]:
# 1. 데이터 전처리 (정규화)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [77]:
# 2. 최적 파라미터 불러오기
params_folder = "best_params"
params_file = os.path.join(params_folder, "best_params.json")

if os.path.exists(params_file):
    with open(params_file, 'r') as file:
        best_params = json.load(file)
else:
    raise FileNotFoundError("최적 파라미터가 없습니다. RandomizedSearchCV를 실행하세요.")

# 3. 모델 생성 (최적 파라미터 적용)
model = RandomForestRegressor(**best_params)

# 4. 교차 검증 (TimeSeriesSplit 사용)
tscv = TimeSeriesSplit(n_splits=5)

cv_scores_rmse = []  # RMSE 저장용 리스트
cv_scores_r2 = []    # R² 저장용 리스트

for train_idx, val_idx in tscv.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    model.fit(X_train_fold, y_train_fold)
    y_pred_fold = model.predict(X_val_fold)

    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred_fold))
    cv_scores_rmse.append(rmse)
    
    # R² 계산
    r2 = r2_score(y_val_fold, y_pred_fold)
    cv_scores_r2.append(r2)

# Cross-validation에서 계산한 RMSE와 R² 값 출력
print("All Cross-Validation RMSE Scores:", cv_scores_rmse)
print("All Cross-Validation R² Scores:", cv_scores_r2)

# 평균도 함께 출력
print(f"Mean Cross-Validation RMSE: {np.mean(cv_scores_rmse):.4f}")
print(f"Mean Cross-Validation R²: {np.mean(cv_scores_r2):.4f}")

  y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
  y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
  y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
  y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
  y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]


All Cross-Validation RMSE Scores: [np.float64(0.500334418382045), np.float64(0.4987088084937337), np.float64(0.49930254025729287), np.float64(0.4964025325518916), np.float64(0.49682090485166214)]
All Cross-Validation R² Scores: [-0.011581579769614603, -0.007757268164289677, -0.0011722835772076667, -0.003160967211484733, 0.002575855876692845]
Mean Cross-Validation RMSE: 0.4983
Mean Cross-Validation R²: -0.0042


In [79]:
# 모델 학습 및 예측
model.fit(X_train_scaled, y_train)
y_pred_test = model.predict(X_test_scaled)

# RMSE 계산
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
# R² 계산
test_r2 = r2_score(y_test, y_pred_test)

# 결과 출력
print(f"Test set RMSE: {test_rmse:.4f}")
print(f"Test set R²: {test_r2:.4f}")

Test set RMSE: 0.4999
Test set R²: -0.0033
