In [1]:
import time
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, which can hide deprecation or dtype warnings raised by the
# libraries used below -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# 1. 데이터 로드

In [2]:
import pickle
# Load the pickled `df_sector` object from the working directory. It is later
# iterated with `.items()`, yielding (sector name, collection of DataFrames).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
with open('df_sector_UTD.pickle','rb') as f:
    df_sector = pickle.load(f)

# 2. 업종별 데이터프레임 합치기

In [3]:
# Stack every DataFrame belonging to a sector into one long frame per sector.
# The index produced by the concatenation is dropped and replaced with a
# fresh 0..n-1 RangeIndex.
df_concat_sector = {
    name: pd.concat(frames, axis=0).reset_index(drop=True)
    for name, frames in df_sector.items()
}

In [4]:
# Inspect column dtypes, non-null counts and memory usage of the concatenated 'Food' frame.
df_concat_sector['Food'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13241 entries, 0 to 13240
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   일자      13241 non-null  datetime64[ns]
 1   종가      13241 non-null  int32         
 2   거래량     13241 non-null  int32         
 3   시가      13241 non-null  int32         
 4   고가      13241 non-null  int32         
 5   저가      13241 non-null  int32         
dtypes: datetime64[ns](1), int32(5)
memory usage: 362.2 KB


In [5]:
# For each sector, collapse all rows sharing the same date ('일자') into a
# single row holding the unweighted mean of every other column.
# as_index=False keeps '일자' as a regular column in the result.
df_mean_sector = {
    name: frame.groupby(['일자'], as_index=False).mean()
    for name, frame in df_concat_sector.items()
}

In [6]:
# Display the per-date mean frame for the 'Food' sector.
df_mean_sector['Food']

Unnamed: 0,일자,종가,거래량,시가,고가,저가
0,2010-05-20,138462.5,39044.75,137987.5,140600.0,136912.5
1,2010-05-24,137662.5,33874.25,137900.0,140187.5,136800.0
2,2010-05-25,136062.5,56176.25,137025.0,138875.0,132250.0
3,2010-05-26,137725.0,59619.50,135875.0,138775.0,133187.5
4,2010-05-27,137512.5,83845.25,136000.0,138975.0,134537.5
...,...,...,...,...,...,...
2996,2022-07-13,200380.0,95481.40,198380.0,203840.0,196450.0
2997,2022-07-14,198270.0,84045.20,198500.0,201570.0,195910.0
2998,2022-07-15,196960.0,77016.40,198000.0,199400.0,191290.0
2999,2022-07-18,194420.0,77103.80,196750.0,198050.0,193440.0


# 7. 타입 변환

In [7]:
# Move the date column ('일자') into the index and cast the averaged values to
# integers, replacing each entry of `df_mean_sector` with the converted frame.
#
# The previous version used `set_index(..., inplace=True)`, which made the
# cell non-idempotent: on a second run '일자' is already the index, so
# `set_index` raised KeyError. Here the frame is only re-indexed when the
# column is still present, and nothing is mutated in place.
for sector_name, df in df_mean_sector.items():
    if '일자' in df.columns:
        df = df.set_index('일자')
    # astype('int') drops the fractional part of the means
    # (e.g. 59619.50 becomes 59619); it does not round.
    df_mean_sector[sector_name] = df.astype('int')

In [8]:
# Display the 'Food' frame after the date column became the index and values were cast to int.
df_mean_sector['Food']

Unnamed: 0_level_0,종가,거래량,시가,고가,저가
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-05-20,138462,39044,137987,140600,136912
2010-05-24,137662,33874,137900,140187,136800
2010-05-25,136062,56176,137025,138875,132250
2010-05-26,137725,59619,135875,138775,133187
2010-05-27,137512,83845,136000,138975,134537
...,...,...,...,...,...
2022-07-13,200380,95481,198380,203840,196450
2022-07-14,198270,84045,198500,201570,195910
2022-07-15,196960,77016,198000,199400,191290
2022-07-18,194420,77103,196750,198050,193440


# 8. 정규화

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Min-max scale the numeric columns of every sector frame.
#   df_scaled[sector]      -> scaled DataFrame (same index, columns in `scale_cols` order)
#   sector_scalers[sector] -> the fitted MinMaxScaler for that sector, kept so that
#                             scaled values can be mapped back to original units via
#                             `sector_scalers[sector].inverse_transform(...)`.
#                             (Previously the scaler was discarded after each iteration.)
df_scaled = {}
sector_scalers = {}
# Columns to which scaling is applied.
scale_cols = ['거래량','시가','고가', '저가', '종가']
for sector_name, df in df_mean_sector.items():
    # A separate scaler per sector: each column is scaled with that sector's
    # own minimum and maximum.
    # NOTE(review): the scaler is fit on the sector's full date range. If these
    # frames are later split into train/test periods, the min/max will include
    # test-period values (look-ahead) -- confirm this is intended.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[scale_cols])
    df_scaled[sector_name] = pd.DataFrame(scaled, index=df.index, columns=scale_cols)
    sector_scalers[sector_name] = scaler
    

In [10]:
# Display the min-max scaled frame for the 'Food' sector.
df_scaled['Food']

Unnamed: 0_level_0,거래량,시가,고가,저가,종가
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-05-20,0.023832,0.068426,0.054232,0.103416,0.089447
2010-05-24,0.019202,0.067943,0.051872,0.102771,0.085020
2010-05-25,0.039174,0.063086,0.044374,0.076560,0.076165
2010-05-26,0.042257,0.056702,0.043803,0.081957,0.085369
2010-05-27,0.063952,0.057396,0.044945,0.089734,0.084190
...,...,...,...,...,...
2022-07-13,0.074372,0.403659,0.415624,0.446397,0.432128
2022-07-14,0.064131,0.404325,0.402652,0.443286,0.420451
2022-07-15,0.057837,0.401550,0.390251,0.416671,0.413201
2022-07-18,0.057914,0.394611,0.382536,0.429057,0.399143


In [11]:
# Serialize the `df_scaled` dict (sector name -> scaled DataFrame) to
# 'df_scaled_UTD.pickle' in the working directory. Mode 'wb' overwrites any
# existing file of that name.
with open('df_scaled_UTD.pickle','wb') as f:
    pickle.dump(df_scaled,f)