In [1]:
import time
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# 1. 데이터 로드

In [2]:
import pickle

# Read the pickled sector data from the working directory.
# pickle.load can execute arbitrary code while unpickling, so only load files you trust.
with open('df_sector_UTD.pickle', mode='rb') as sector_file:
    df_sector = pickle.load(sector_file)
    

# 2. 업종별 데이터프레임 합치기

In [3]:
# For every sector, stack its collection of frames row-wise into one frame
# and replace the resulting index with a fresh 0..n-1 RangeIndex.
df_concat_sector = {
    name: pd.concat(frames, axis=0).reset_index(drop=True)
    for name, frames in df_sector.items()
}

In [4]:
# Summarise the concatenated 'Food' frame: index range, column dtypes, non-null counts, memory usage.
df_concat_sector['Food'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13239 entries, 0 to 13238
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   일자      13239 non-null  datetime64[ns]
 1   종가      13239 non-null  int32         
 2   거래량     13239 non-null  int32         
 3   시가      13239 non-null  int32         
 4   고가      13239 non-null  int32         
 5   저가      13239 non-null  int32         
dtypes: datetime64[ns](1), int32(5)
memory usage: 362.1 KB


In [5]:
# Collapse each sector to one row per date by averaging all rows that share the same '일자'.
# as_index=False keeps '일자' as a regular column in the result.
df_mean_sector = {
    name: frame.groupby(['일자'], as_index=False).mean()
    for name, frame in df_concat_sector.items()
}

In [6]:
# Display the 'Food' sector frame after averaging rows by date.
df_mean_sector['Food']

Unnamed: 0,일자,종가,거래량,시가,고가,저가
0,2010-05-26,137725.0,59619.50,135875.0,138775.0,133187.5
1,2010-05-27,137512.5,83845.25,136000.0,138975.0,134537.5
2,2010-05-28,137187.5,52621.50,137375.0,138475.0,135525.0
3,2010-05-31,138437.5,41458.75,139500.0,140500.0,136825.0
4,2010-06-01,138275.0,35789.25,138062.5,139637.5,137012.5
...,...,...,...,...,...,...
2995,2022-07-15,196960.0,77016.40,198000.0,199400.0,191290.0
2996,2022-07-18,194420.0,77103.80,196750.0,198050.0,193440.0
2997,2022-07-19,199260.0,66513.00,194930.0,199730.0,193480.0
2998,2022-07-20,195960.0,83343.60,199110.0,200050.0,195140.0


# 7. 타입 변환

In [7]:
# Index every sector frame by date ('일자') and cast the averaged values to integers.
#
# set_index is applied without inplace=True and only while '일자' is still a column,
# so the frames produced by the earlier cell are not mutated and re-running this
# cell no longer raises KeyError once '일자' has already become the index.
#
# Note: astype('int') truncates the fractional part (e.g. 137512.5 -> 137512); it does not round.
for sector_name, df in df_mean_sector.items():
    if '일자' in df.columns:
        df = df.set_index('일자')
    # Re-assigning an existing key while iterating is safe: the dict's size does not change.
    df_mean_sector[sector_name] = df.astype('int')

In [8]:
# Display the 'Food' frame after '일자' became the index and the values were cast to int.
df_mean_sector['Food']

Unnamed: 0_level_0,종가,거래량,시가,고가,저가
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-05-26,137725,59619,135875,138775,133187
2010-05-27,137512,83845,136000,138975,134537
2010-05-28,137187,52621,137375,138475,135525
2010-05-31,138437,41458,139500,140500,136825
2010-06-01,138275,35789,138062,139637,137012
...,...,...,...,...,...
2022-07-15,196960,77016,198000,199400,191290
2022-07-18,194420,77103,196750,198050,193440
2022-07-19,199260,66513,194930,199730,193480
2022-07-20,195960,83343,199110,200050,195140


# 8. 정규화

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Columns to be scaled; this list also fixes the column order of every scaled frame.
scale_cols = ['거래량','시가','고가', '저가', '종가']

# A separate MinMaxScaler is fitted for each sector, so each sector's columns are
# scaled with that sector's own minimum and maximum. The date index is preserved.
df_scaled = {}
for name, frame in df_mean_sector.items():
    scaler = MinMaxScaler()
    values = scaler.fit_transform(frame[scale_cols])
    df_scaled[name] = pd.DataFrame(values, index=frame.index, columns=scale_cols)
    

In [10]:
# Display the min-max scaled 'Food' frame.
df_scaled['Food']

Unnamed: 0_level_0,거래량,시가,고가,저가,종가
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-05-26,0.042257,0.056702,0.043803,0.081957,0.085369
2010-05-27,0.063952,0.057396,0.044945,0.089734,0.084190
2010-05-28,0.035991,0.065028,0.042088,0.095426,0.082391
2010-05-31,0.025994,0.076824,0.053660,0.102915,0.089309
2010-06-01,0.020917,0.068842,0.048728,0.103992,0.088413
...,...,...,...,...,...
2022-07-15,0.057837,0.401550,0.390251,0.416671,0.413201
2022-07-18,0.057914,0.394611,0.382536,0.429057,0.399143
2022-07-19,0.048431,0.384509,0.392137,0.429287,0.425930
2022-07-20,0.063502,0.407711,0.393965,0.438850,0.407666


In [11]:
# Write the dict of scaled sector frames to disk as a pickle file.
with open('df_scaled_UTD.pickle', mode='wb') as scaled_file:
    pickle.dump(df_scaled, scaled_file)