# 데이터 전처리

#### 머신러닝 코드 구조 <br>

![이미지](https://github.com/DA4BAM/dataset/blob/master/new_code.png?raw=true "code step1")

변수 정리: 필요없는 데이터 지움

## 00.환경준비

### 01.Import

In [1]:
# 라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 02.data loading


* 주가 데이터 가져오기

In [2]:
# Stock prices: read the SK price CSV and discard the 'AdjClose' column.
# drop(columns=...) returns a new frame, so the result is assigned back to `stock`.
stock = pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/SK.csv')
stock = stock.drop(columns='AdjClose')


# Exchange rate: read the USD/KRW CSV, drop several columns at once by passing
# a list, then rename the remaining columns.
exch_rate = pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/USD_KRW.csv')
exch_rate = (
    exch_rate
    .drop(columns=['open', 'high', 'low'])
    .rename(columns={'date': 'Date', 'close': 'exch_Close', 'diff': 'exch_Diff'})
)

In [3]:
stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0


In [4]:
exch_rate.head()
#exch_Diff는 전날 과의 환율 등락

Unnamed: 0,Date,exch_Close,exch_Diff
0,2019-12-31,1155.1,-0.0025
1,2019-12-30,1158.0,-0.0015
2,2019-12-27,1159.7,-0.0023
3,2019-12-26,1162.3,0.0013
4,2019-12-25,1160.8,-0.0024


* 하나의 데이터로 결합

In [5]:
# Left-join the exchange-rate columns onto the stock rows.
# 'Date' is the only column the two frames share, so it is used as the join key.
data = stock.merge(exch_rate, on='Date', how='left')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089


## 10.데이터 이해

### 11.둘러보기

In [6]:
# 상/하위 몇개 행을 살펴 봅시다.

data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089


In [7]:
# 각 칼럼의 타입을 살펴 봅시다.

data.dtypes

#Date만 object로 되어있음 => 판다스에서 문자형태로 되어있는 경우에는 object 자료형
#Date를 무슨 데이터타입으로 바꿔야할까? datetime, 그래야 시간을 다룰 수 있게 된다.

Date           object
Open          float64
High          float64
Low           float64
Close         float64
Volume        float64
exch_Close    float64
exch_Diff     float64
dtype: object

In [8]:
# 행, 열의 개수를 살펴 봅시다.

data.shape #(행, 열)

(977, 8)

### 12.기초통계량

In [9]:
# 숫자형 변수들의 통계량을 살펴 봅니다.

data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,exch_Close,exch_Diff
count,972.0,972.0,972.0,972.0,972.0,977.0,977.0
mean,252080.761317,255116.255144,248993.312757,252241.769547,117131.1,1139.488025,3e-05
std,29305.098277,29550.396185,28885.988224,29268.540457,85008.66,39.095668,0.005343
min,193000.0,194500.0,189000.0,192500.0,0.0,1054.9,-0.0204
25%,227500.0,230375.0,224875.0,227875.0,76687.25,1117.9,-0.0032
50%,256250.0,259500.0,253500.0,256750.0,96966.0,1133.9,0.0001
75%,274500.0,277125.0,270625.0,274000.0,134895.0,1169.1,0.0034
max,331000.0,331000.0,321500.0,328500.0,1473645.0,1243.1,0.0265


### 13.탐색하기

## 20.데이터 준비


### 21.변수 정리

### 22.NaN 처리

In [10]:
# NaN 확인
data.isnull().sum()
# data.isna()  #이것도 동일한 코드

#5개씩 비어있음


Date          0
Open          5
High          5
Low           5
Close         5
Volume        5
exch_Close    0
exch_Diff     0
dtype: int64

In [11]:
data['Open'].isnull()
#Open 컬럼을 isnull 값을 찾아볼건데,
# 이렇게 보면 생략되잖아요.
# 근데 이 값이 True여야 NaN값이라는 건데, 한꺼번에 다 보면 찾기 어려움

0      False
1      False
2      False
3      False
4      False
       ...  
972    False
973    False
974    False
975    False
976    False
Name: Open, Length: 977, dtype: bool

In [12]:
# NaN 행들을 확인해 봅시다.
data.loc[data['Open'].isnull()]
#그래서 이렇게 조건을 걸어서 보는 거임
# 458, 463, 487, 700, 945가 비어있는 것을 확인할 수 있음.

# 참고: how='left' 병합이므로 모든 행은 stock에서 온 것이다.
# 즉 환율 데이터에만 있는 날짜가 추가된 것이 아니라, stock 원본에 해당 날짜 행은 있지만 주가 값(Open~Volume)이 비어 있는 것이다.

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
458,2017-11-16,,,,,,1097.2,-0.0077
463,2017-11-23,,,,,,1084.0,-0.0019
487,2018-01-02,,,,,,1063.2,-0.0032
700,2018-11-15,,,,,,1127.9,-0.0042
945,2019-11-14,,,,,,1169.1,-0.0021


In [13]:
# 위에서 NaN인 값들 중에서 19년 11월 14일것만 떼어서 보는 중
data.loc[(data['Date']>='2019-11-12')&(data['Date']<='2019-11-20')]
# 2019-11-14 는 목요일. 원인은 모르나 NaN으로 채워져 있음.

#아직 날짜로 안바꾼 상황 -> 아직 문자열 데이터
# 근데 판다스에서는 이상/이하가 적용 됨

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,,,,,,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


In [14]:
# 어떻게 조치하는게 좋을까요?
# 방법1 : 행을 제거한다.

data1 = data.dropna(axis=0) #1로하면 곤란 (1로 하면 NaN이 있는 컬럼들이 다 삭제)
# 그래서 axis=1로 하면 exch_rate와 같아짐
data1.isnull().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [15]:
data1.loc[(data1['Date']>='2019-11-12')&(data1['Date']<='2019-11-20')]
#삭제하고 확인해보니, 945인덱스(19년 11월 14일)가 사라짐
# 아쉬운 점 : 환율 데이터 값의 나머지 값이 날라간 점

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


In [16]:
# Option 2: fill each NaN with the value from the previous row (forward fill).
# DataFrame.ffill() is used instead of fillna(method='ffill'), which is
# deprecated in pandas 2.1+; both produce the same result.

data2 = data.ffill()
data2.isnull().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [17]:
data2.loc[(data2['Date']>='2019-11-12')&(data2['Date']<='2019-11-20')]
# 이번에는 14일이 살아 있음.
# 그런데, 944인덱스와 비교해보면, 동일하다는 것을 확인할 수 있다.

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,268000.0,269500.0,261500.0,263000.0,257602.0,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


In [18]:
# 방법3 : 앞뒤값의 중간값으로 채우기

data3 = data.interpolate(method='linear')
# linear 말고도 여러가지가 있기는 한데, 디폴트가 linear -> 그만큼 사람들이 많이 사용
data3.isnull().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [19]:
data3.loc[(data3['Date']>='2019-11-12')&(data3['Date']<='2019-11-20')]
# NaN값이 없어졌음.
# 945 인덱스의 Open, High, Low, Close, Volume 값이 위 아래 값들의 중간임
# exch_Close와 exch_Diff 는 그대로

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,266500.0,269750.0,263250.0,266500.0,228016.0,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


#### 실습 9 : NaN 조치

* Q1. 주가 데이터에서 NaN이 변수별로 몇 건씩 있는지 조사하시오.

In [20]:
data.isna().sum()
#data.isnull().sum() #동일

Date          0
Open          5
High          5
Low           5
Close         5
Volume        5
exch_Close    0
exch_Diff     0
dtype: int64

In [32]:
data.info()
# 977개 row중에서
# 972개가 non-null이라는 뜻 -> 이렇게도 nan값 확인 가능

<class 'pandas.core.frame.DataFrame'>
Int64Index: 977 entries, 0 to 976
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        977 non-null    object 
 1   Open        972 non-null    float64
 2   High        972 non-null    float64
 3   Low         972 non-null    float64
 4   Close       972 non-null    float64
 5   Volume      972 non-null    float64
 6   exch_Close  977 non-null    float64
 7   exch_Diff   977 non-null    float64
dtypes: float64(7), object(1)
memory usage: 68.7+ KB


* Q2. NaN 행을 제거합니다. 제거해서 별도 데이터프레임으로 저장.

In [21]:
tmp1 = data.dropna(axis=0)
tmp1.isna().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [25]:
tmp1.loc[(tmp1['Date']>='2019-11-12')&(tmp1['Date']<='2019-11-20')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


* Q3. NaN 행에 이후 행의 값으로 채우고, 별도 데이터프레임으로 저장.
    * `.bfill()` (구버전 표기: `.fillna(method='bfill')`, pandas 2.1부터 deprecated)

In [22]:
# Backward fill: each NaN takes the value from the next row.
# DataFrame.bfill() is used instead of fillna(method='bfill'), which is
# deprecated in pandas 2.1+; both produce the same result.
tmp2 = data.bfill()
tmp2.isna().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [27]:
tmp2.loc[(tmp2['Date']>='2019-11-12') & (tmp2['Date']<='2019-11-20')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,265000.0,270000.0,265000.0,270000.0,198430.0,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


* Q4. NaN 행에 값을 0으로 채우고, 별도 데이터프레임으로 저장.
    * .fillna(0)

In [23]:
tmp3 = data.fillna(0)
tmp3.isna().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [28]:
tmp3.loc[(tmp3['Date']>='2019-11-12')&(tmp3['Date']<='2019-11-20')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,0.0,0.0,0.0,0.0,0.0,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


* Q5. interpolate를 사용해보시오.
    * .interpolate(method='linear')

In [24]:
tmp4 = data.interpolate(method='linear')
tmp4.isna().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
exch_Close    0
exch_Diff     0
dtype: int64

In [31]:
tmp4.loc[(tmp4['Date']>='2019-11-12')&(tmp4['Date']<='2019-11-20')]

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
943,2019-11-12,271000.0,271000.0,268500.0,269000.0,168608.0,1166.5,0.0009
944,2019-11-13,268000.0,269500.0,261500.0,263000.0,257602.0,1171.5,0.0043
945,2019-11-14,266500.0,269750.0,263250.0,266500.0,228016.0,1169.1,-0.0021
946,2019-11-15,265000.0,270000.0,265000.0,270000.0,198430.0,1163.7,-0.0046
947,2019-11-18,270000.0,275000.0,269500.0,275000.0,185370.0,1167.2,0.0031
948,2019-11-19,274500.0,276500.0,272500.0,272500.0,222226.0,1168.4,0.001
949,2019-11-20,271500.0,274500.0,268500.0,272000.0,210122.0,1170.8,0.0021


### 23.Feature Engineering

* 내일의 주가에 영향을 주는 요인은 무엇일까요?

In [33]:
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089


#### 1)날짜 데이터 다루기

* 날짜 변수 만들기  
    - 날짜변수를 만드는 이유!  
        - 날짜로부터 추가 변수를 도출해내기 위해서 날짜 변수를 만듭니다.  
        - 만약 추가변수 도출이 끝나면, 날짜변수는 제거합니다.

In [34]:
data2.dtypes

Date           object
Open          float64
High          float64
Low           float64
Close         float64
Volume        float64
exch_Close    float64
exch_Diff     float64
dtype: object

In [35]:
# 문자열 형식을 날짜 형식으로 변환
data2['Date'] = pd.to_datetime(data2['Date']) #날짜 변수 만들기 위한 메소드
print(data2.dtypes)

Date          datetime64[ns]
Open                 float64
High                 float64
Low                  float64
Close                float64
Volume               float64
exch_Close           float64
exch_Diff            float64
dtype: object


* data2.Date.dt.dayofweek  : The day of the week with Monday=0, Sunday=6.
* data2.Date.dt.day_name() : 요일 이름
* data2.Date.dt.isocalendar().week : 1~53주 (`.dt.week`는 pandas 1.1부터 deprecated, 2.0에서 제거됨)
* data2.Date.dt.year
* data2.Date.dt.month

* 날짜로 부터 날짜요소 추출하기

In [36]:
# 요일을 추가해 봅시다. 

#.dt를 꼭 쳐줘야 함
# dayofweek : 월요일 0 ~ 일요일 6
data2['WeekDay'] = data2['Date'].dt.dayofweek
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,0
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,1
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,2
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,3
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,4


In [37]:
# 요일 이름으로 다시 추가.
data2['WeekDay'] = data2.Date.dt.day_name()
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday


#### 2)이전 데이터 붙이기.  
- .shift( )   : 예) 전날 주가, 전날 환율
- .rolling( ) : 예)7일이동평균 주가

In [38]:
# 전날 주가를 추가합니다.
#기존에 있는 값을 하나 아래로 내려서 전날 주가 컬럼 생성
data2['Close_lag1'] = data2['Close'].shift() # default = 1
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0


In [39]:
# 7일 이동평균 주가
# min_periods=1 이므로 맨 위에서부터 1개, 2개, 3개, ..., 최대 7개 데이터의 평균을 구하고, shift()로 한 행 내려 '전날까지'의 이동평균으로 만든다.
# NaN 값때문에 float로 나옴
data2['Close_MA7_lag1'] = data2['Close'].rolling(7, min_periods=1).mean().shift()
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0


#### 실습 10 : feature engineering

* data2 변수를 이용하여 문제를 푸시오

* Q1. 월 데이터를 추가해보시오

In [41]:
data2['Month'] = data2['Date'].dt.month

In [42]:
data2

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.000000,1
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.000000,1
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
972,2019-12-23,261500.0,263500.0,256500.0,261500.0,92109.0,1164.0,0.0040,Monday,258500.0,258071.428571,12
973,2019-12-24,260500.0,262500.0,260500.0,262000.0,70410.0,1163.5,-0.0004,Tuesday,261500.0,258357.142857,12
974,2019-12-26,264000.0,264000.0,261000.0,262500.0,412790.0,1162.3,0.0013,Thursday,262000.0,258857.142857,12
975,2019-12-27,259000.0,264500.0,256500.0,262500.0,122918.0,1159.7,-0.0023,Friday,262500.0,259214.285714,12


In [47]:
data2['Month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)

* Q2. 전날 거래량 컬럼, 전날 환율 증감 컬럼을 만드시오.

In [44]:
data2['y_Volume'] = data2['Volume'].shift()
data2['y_exchDiff'] = data2['exch_Diff'].shift()
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027


* Q3. "전날 주가 - 전전날 주가" 컬럼을 만드시오.

In [48]:
# Previous day's close minus the close from two days earlier.
# NOTE: the column name 'y_exch_Diff' is misleading, since the values are
# close-price differences, not exchange-rate differences; a later cell renames
# this column to 'y_close_Diff'.
data2['y_exch_Diff']=data2['Close'].shift() - data2['Close'].shift(2)
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_exch_Diff,y_clopDiff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127,,-8500.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0


In [50]:
data2.rename(columns={"y_exch_Diff":"y_close_Diff"}, inplace=True)

* Q4. "전날 종가 - 전날 시가" 컬럼을 만드시오.

In [46]:
data2['y_clopDiff'] = data2['Close'].shift() - data2['Open'].shift()
data2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_exch_Diff,y_clopDiff
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,,,
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127,,-8500.0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004,-0.0123,5000.0
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082,0.0078,-2000.0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027,-0.0109,3500.0


* Q5. 그 외 추가하고 싶은 변수를 1 ~ 3개 정도 더 추가해보세요.

### 24.Dummy Variable
* 범주형 변수를 숫자로 만드는 방법
* pd.get_dummies, pd.concat, (Pandas Dataframe).drop
* 불필요한 칼럼들 제거


In [55]:
# dummy variable

dumm_weekday = pd.get_dummies(data2['WeekDay'], drop_first=True, prefix="day")
# drop_first=False이면 맨 앞에 day_Friday 컬럼이 추가로 생김 (알파벳 순으로 첫 번째 범주).
# 금요일 데이터는 어떡함 그럼?
# 월화수목이 모두 0이면 그 행은 금요일로 봐도 됨

In [56]:
dumm_weekday.head()
# 주가데이터이기 때문에 SAT, SUN 없음

Unnamed: 0,day_Monday,day_Thursday,day_Tuesday,day_Wednesday
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,0,1,0,0
4,0,0,0,0


In [57]:
data3 = pd.concat([data2, dumm_weekday], axis=1)
data3.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,exch_Close,exch_Diff,WeekDay,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_close_Diff,y_clopDiff,day_Monday,day_Thursday,day_Tuesday,day_Wednesday
0,2016-01-04,243000.0,245000.0,234500.0,234500.0,173905.0,1190.4,0.0127,Monday,,,1,,,,,1,0,0,0
1,2016-01-05,236000.0,244000.0,234000.0,241000.0,182985.0,1190.8,0.0004,Tuesday,234500.0,234500.0,1,173905.0,0.0127,,-8500.0,0,0,1,0
2,2016-01-06,241000.0,243000.0,237500.0,239000.0,108574.0,1200.5,0.0082,Wednesday,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0,0,0,0,1
3,2016-01-07,237000.0,243000.0,236000.0,240500.0,113376.0,1197.3,-0.0027,Thursday,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0,0,1,0,0
4,2016-01-08,240500.0,242500.0,235000.0,241500.0,81557.0,1207.9,0.0089,Friday,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0,0,0,0,0


* 불필요한 변수를 제거합시다.

* 어떤 변수가 불필요한가요?  
    - 날짜
    - 전날로 뺀 변수의 원본
    - 가변수화 한 변수의 원본
    - 의미 없는 변수

In [58]:
# 칼럼삭제
drop_x = ['Date','Open','High','Low','Volume','exch_Close','exch_Diff','WeekDay']
data3.drop(drop_x, axis = 1, inplace = True) #범위화해서 없앴음
data3.head()

Unnamed: 0,Close,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_close_Diff,y_clopDiff,day_Monday,day_Thursday,day_Tuesday,day_Wednesday
0,234500.0,,,1,,,,,1,0,0,0
1,241000.0,234500.0,234500.0,1,173905.0,0.0127,,-8500.0,0,0,1,0
2,239000.0,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0,0,0,0,1
3,240500.0,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0,0,1,0,0
4,241500.0,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0,0,0,0,0


In [59]:
# shift를 하다 보니 행에 NaN이 다시 들어갔습니다. dropna로 제거합시다.
data3 = data3.dropna(axis = 0)
data3.head()

Unnamed: 0,Close,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_close_Diff,y_clopDiff,day_Monday,day_Thursday,day_Tuesday,day_Wednesday
2,239000.0,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0,0,0,0,1
3,240500.0,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0,0,1,0,0
4,241500.0,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0,0,0,0,0
5,239000.0,241500.0,239300.0,1,81557.0,0.0089,1000.0,1000.0,1,0,0,0
6,237500.0,239000.0,239250.0,1,84152.0,-0.0026,-2500.0,1000.0,0,0,1,0


#### 실습 11 : dummy variable

* Q1. 주가 데이터에서 월에 대하여 가변수화 합니다.

In [69]:
# Dummy-encode the 'Month' column on its own. The result holds only the
# month_* indicator columns and is what the next step concatenates onto data3.
dumm_month = pd.get_dummies(data3['Month'], drop_first=True, prefix="month")

# Alternative: pass the whole frame with columns=['Month']. The original
# 'Month' column is then replaced by its dummy columns automatically.
# This must be called on data3 (which has a 'Month' column); calling it on
# `data` raises KeyError because `data` has no 'Month' column. The result is
# kept in a separate variable so that dumm_month above is not overwritten.
data3_month_dummies = pd.get_dummies(data3, columns=['Month'], drop_first=True)
dumm_month

* Q2. 가변수한 데이터를 기존 데이터와 결합합니다. ( concat )

In [63]:
data4 = pd.concat([data3, dumm_month], axis=1)
#axis=0으로 하면 아래에 붙어버림

In [64]:
data4


Unnamed: 0,Close,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_close_Diff,y_clopDiff,day_Monday,day_Thursday,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
2,239000.0,241000.0,237750.000000,1,182985.0,0.0004,6500.0,5000.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,240500.0,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,241500.0,240500.0,238750.000000,1,113376.0,-0.0027,1500.0,3500.0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,239000.0,241500.0,239300.000000,1,81557.0,0.0089,1000.0,1000.0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,237500.0,239000.0,239250.000000,1,84152.0,-0.0026,-2500.0,1000.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972,261500.0,258500.0,258071.428571,12,95196.0,-0.0033,1500.0,500.0,1,0,...,0,0,0,0,0,0,0,0,0,1
973,262000.0,261500.0,258357.142857,12,92109.0,0.0040,3000.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
974,262500.0,262000.0,258857.142857,12,70410.0,-0.0004,500.0,1500.0,0,1,...,0,0,0,0,0,0,0,0,0,1
975,262500.0,262500.0,259214.285714,12,412790.0,0.0013,500.0,-1500.0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [68]:
data4.columns #우리가 가지고있는 컬럼들을 모두 확인 가능
#month_1은 빠지고 나머지는 다 들어가 있음

Index(['Close', 'Close_lag1', 'Close_MA7_lag1', 'y_Volume', 'y_exchDiff',
       'y_close_Diff', 'y_clopDiff', 'day_Monday', 'day_Thursday',
       'day_Tuesday', 'day_Wednesday', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10',
       'month_11', 'month_12'],
      dtype='object')

* Q3. 가변수화 하기 이전의 월 컬럼을 제거하세요.

In [66]:
data4.drop(columns=["Month"], inplace=True)

In [67]:
data4.head()

Unnamed: 0,Close,Close_lag1,Close_MA7_lag1,y_Volume,y_exchDiff,y_close_Diff,y_clopDiff,day_Monday,day_Thursday,day_Tuesday,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
2,239000.0,241000.0,237750.0,182985.0,0.0004,6500.0,5000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,240500.0,239000.0,238166.666667,108574.0,0.0082,-2000.0,-2000.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,241500.0,240500.0,238750.0,113376.0,-0.0027,1500.0,3500.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,239000.0,241500.0,239300.0,81557.0,0.0089,1000.0,1000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,237500.0,239000.0,239250.0,84152.0,-0.0026,-2500.0,1000.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


##### 간결한 방식

In [None]:
data3 = pd.get_dummies(data3, columns=['Month'], drop_first=True)

In [None]:
data3.columns

In [None]:
data3.head()

--------------------------------
여기서 부터는 그냥 실행 해 볼 겁니다.  
데이터를 얼추(!) 준비했으니, 모델링까지 수행해 봅니다.

### 25.Data Split

sklearn의 데이터 분할 함수를 사용해 봅시다.
- sklearn은 머신러닝의 대표적인 라이브러리

* 요인, x, feature, 조작변수, 통제변수, 리스크백터, Input, (독립변수)
* 결과, y, target, label, Output, (종속변수)

In [85]:
from sklearn.model_selection import train_test_split

In [86]:
# Separate the features (X) from the target (y).
X = data3.drop('Close', axis=1)  # every column except the closing price
# Select the target by name rather than by position, so it stays correct even
# if the column order of data3 changes (X already drops 'Close' by name).
y = data3['Close']

In [87]:
X.head() #Close 컬럼이 사라짐

Unnamed: 0,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_close_Diff,y_clopDiff,day_Monday,day_Thursday,day_Tuesday,day_Wednesday
2,241000.0,237750.0,1,182985.0,0.0004,6500.0,5000.0,0,0,0,1
3,239000.0,238166.666667,1,108574.0,0.0082,-2000.0,-2000.0,0,1,0,0
4,240500.0,238750.0,1,113376.0,-0.0027,1500.0,3500.0,0,0,0,0
5,241500.0,239300.0,1,81557.0,0.0089,1000.0,1000.0,1,0,0,0
6,239000.0,239250.0,1,84152.0,-0.0026,-2500.0,1000.0,0,0,1,0


In [88]:
y.head() #시리즈형태로 Close 컬럼만 가져옴

2    239000.0
3    240500.0
4    241500.0
5    239000.0
6    237500.0
Name: Close, dtype: float64

In [89]:
# 전체에서 train : test = 7 : 3

# 위에서 만든 X와 y를 넣어줌
# 통상적으로 training set을 70~80% 정도
# test set은 20~30% 정도 줌.
# test set은 완전히 새로운 데이터(우리가 학습할 때 쓰지않은 데이터) => 모델 검증
# random_state는 난수 시드 고정: 같은 값을 주면 실행할 때마다 동일하게 분할되어 결과를 재현할 수 있음
# train_test_split은 기본적으로 데이터를 섞은(shuffle) 뒤 나누므로, 앞에서부터 70%를 순서대로 자르는 것이 아님
# random_state를 주지 않으면 실행할 때마다 train/test에 들어가는 행이 달라짐
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=1)

In [90]:
train_x.shape, train_y.shape

((682, 11), (682,))

In [91]:
X.shape, y.shape #975-682 나머지 데이터는 테스트 시 사용

((975, 11), (975,))

### 26.Scaling features




In [92]:
max_n, min_n = train_x.max(), train_x.min()

In [93]:
train_x.describe()

Unnamed: 0,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_close_Diff,y_clopDiff,day_Monday,day_Thursday,day_Tuesday,day_Wednesday
count,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0
mean,252123.900293,251888.807429,6.453079,118118.1,-4.4e-05,218.475073,331.378299,0.195015,0.196481,0.186217,0.217009
std,29147.663969,28767.189984,3.419517,79473.27,0.005437,4158.734985,3854.381054,0.396503,0.397628,0.389567,0.412511
min,192500.0,195642.857143,1.0,0.0,-0.0204,-18000.0,-17000.0,0.0,0.0,0.0,0.0
25%,227625.0,226875.0,3.0,77799.5,-0.003275,-2000.0,-2000.0,0.0,0.0,0.0,0.0
50%,257000.0,256214.285714,6.0,98143.0,0.0001,0.0,0.0,0.0,0.0,0.0,0.0
75%,273500.0,273321.428571,9.0,135021.2,0.0034,2500.0,2500.0,0.0,0.0,0.0,0.0
max,328500.0,319214.285714,12.0,1137778.0,0.0265,20000.0,18500.0,1.0,1.0,1.0,1.0


In [94]:
train_x_scale = (train_x - min_n) / (max_n - min_n)

train_x_scale.describe()
#min, max 값
# 각 feature 별 행의 값을 0과 1사이로 만들어줌.
# 이걸 간단하게 패키지화 한 게 sklearn.preprocessing의 MinMaxScaler

Unnamed: 0,Close_lag1,Close_MA7_lag1,Month,y_Volume,y_exchDiff,y_close_Diff,y_clopDiff,day_Monday,day_Thursday,day_Tuesday,day_Wednesday
count,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0
mean,0.438411,0.45517,0.495734,0.103815,0.43403,0.479434,0.488208,0.195015,0.196481,0.186217,0.217009
std,0.214321,0.232798,0.310865,0.06985,0.115929,0.10944,0.108574,0.396503,0.397628,0.389567,0.412511
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.258272,0.252746,0.181818,0.068378,0.365139,0.421053,0.422535,0.0,0.0,0.0,0.0
50%,0.474265,0.490173,0.454545,0.086258,0.4371,0.473684,0.478873,0.0,0.0,0.0,0.0
75%,0.595588,0.628613,0.727273,0.118671,0.507463,0.539474,0.549296,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [78]:
from sklearn.preprocessing import MinMaxScaler

In [79]:
# 사용할 함수 선언
scaler = MinMaxScaler()

In [95]:
# Learn the per-feature min/max from the training set and scale it.
# fit_transform(train_x) is shorthand for scaler.fit(train_x) followed by
# scaler.transform(train_x).
train_x = scaler.fit_transform(train_x)

# The test set is only transformed with the scaler fitted on the training set;
# it is never used for fitting. Scaling is applied exactly once per set, so the
# scaler is not refitted on data that has already been scaled.
test_x = scaler.transform(test_x)

# After scaling, train_x and test_x are NumPy arrays rather than DataFrames,
# so wrap them in pd.DataFrame(...) to call describe().


In [96]:
pd.DataFrame(train_x).describe()
#min, max가 다 0과 1이다.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0
mean,0.438411,0.45517,0.495734,0.103815,0.43403,0.479434,0.488208,0.195015,0.196481,0.186217,0.217009
std,0.214321,0.232798,0.310865,0.06985,0.115929,0.10944,0.108574,0.396503,0.397628,0.389567,0.412511
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.258272,0.252746,0.181818,0.068378,0.365139,0.421053,0.422535,0.0,0.0,0.0,0.0
50%,0.474265,0.490173,0.454545,0.086258,0.4371,0.473684,0.478873,0.0,0.0,0.0,0.0
75%,0.595588,0.628613,0.727273,0.118671,0.507463,0.539474,0.549296,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### 27.Dataframe to Numpy array

## 30.모델링

### 31.import

In [97]:
# Linear Regression
from sklearn.linear_model import LinearRegression        # Linear Model
from sklearn.metrics   import mean_squared_error         # Metric  MSE
#회귀문제에서는 MSE를 지표로 많이 쓴다

### 32.모델선언

In [98]:
# 모델 선언
# 불러온 걸 모델로 선언해야함.
model = LinearRegression()

### 33.모델링(학습)

In [99]:
model.fit( train_x , train_y ) #여기에는 학습할 데이터를 x와 y 순서대로 넣음

LinearRegression()

### 34.예측

In [100]:
# 예측값을 뽑자.
test_pred = model.predict(test_x)
# 여기 안에 들어가는 거는 test_x
# test_y는? test_x를 통해서 만든 데이터와 비교할 대상.

In [102]:
test_pred

array([250869.30755201, 271884.25538105, 283795.46251348, 242763.51874178,
       233471.6214433 , 246524.52946258, 290279.66976395, 223527.13980618,
       261538.15200886, 212771.22910752, 279054.06242436, 270427.09366514,
       248453.15532396, 269825.02708287, 279528.07133084, 224259.05149666,
       275233.20970501, 224163.54972583, 223900.16537811, 216806.44995529,
       291401.88002455, 285680.79453869, 270875.1492215 , 226753.87245521,
       204549.50146001, 223759.69365693, 196039.99336285, 257620.1319335 ,
       256303.0108709 , 222646.8451545 , 309679.23251015, 241976.05698083,
       239875.89226223, 282257.53149666, 266642.89163825, 226732.09196786,
       299305.49197203, 272320.45699817, 266381.04006779, 231778.50777419,
       301744.08568661, 270260.60254409, 270091.59768867, 291852.2922613 ,
       263560.07293644, 238598.49821428, 231209.16061032, 257307.97128193,
       229552.37596945, 210365.57667367, 209429.76496471, 228110.46624295,
       207394.10930828, 2

In [103]:
test_pred.shape #293개의 row 데이터

(293,)

In [104]:
test_y.shape #293개의 실제 데이터

(293,)

### 35.평가

In [101]:
# Evaluate on the test set: pass the actual values first, then the predictions.
# mean_squared_error returns the MSE (squared units); taking the square root
# gives the RMSE, which is in the same units as the target.
# The root is taken explicitly because the squared=False argument is deprecated
# in recent scikit-learn releases.
mean_squared_error(test_y, test_pred) ** 0.5

4033.586882584132

나온 결과의 의미는?
squared = False => Root Mean Squared Error
값을 해석하자면?

우리가 지금 주가 Close 데이터를 예측하고 있었음.
근데 주가 데이터는 시계열 데이터였어 (8/9, 8/10, .. 이런식의 날짜데이터)
아까 전처리에서 Date 없애고 이끌어냈음.
저 4033.58688... 의미는
내 모델의 예측한 주식값과 실제 주식값이
대충 4033 정도 차이가 난다는 것.