In [1]:
import time
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
import warnings
# Suppress every warning for the rest of the session (no category/module filter is given,
# so library deprecation and runtime warnings are hidden as well).
warnings.filterwarnings('ignore')

# 데이터 로드

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
    file, so this must only be used on pickle files produced by a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# `pickle` is imported once in the first cell; it is not re-imported here.
# df_sector is used below as {sector name: {stock code: DataFrame}};
# kospi is merged on the '일자' column later in the notebook.
df_sector = _load_pickle('df_sector.pickle')
kospi = _load_pickle('kospi.pickle')

# 1. 등락률 계산
# 2. 거래변동량 계산

In [3]:
# Daily change rate (%) = (today's close - previous close) / previous close * 100
# Volume change        = |today's volume - previous volume|
# The sector-level average of these values is taken in a later cell.
#
# Each row is compared with the row that follows it (the frames displayed in this
# notebook are ordered newest-first, so the following row is the previous trading
# day). shift(-1) aligns every row with its successor positionally, so the result
# does not depend on the frame having a default 0..n-1 index.
for sector_name, df_dict in df_sector.items():
    for stockcode, df in df_dict.items():
        prev_close = df['종가'].shift(-1)
        prev_volume = df['거래량'].shift(-1)

        change_rate = (df['종가'] - prev_close) / prev_close * 100
        volume_change = (df['거래량'] - prev_volume).abs()

        # The last row has no successor to compare with, so it is recorded as 0.
        # A one-element tail slice is used (rather than iloc[-1]) so that an
        # empty frame is left empty instead of raising.
        change_rate.iloc[-1:] = 0
        volume_change.iloc[-1:] = 0

        df['등락률'] = change_rate
        # shift() promotes the integer volumes to float to hold the trailing NaN;
        # cast back so the column stays an integer column.
        # NOTE(review): assumes '거래량' contains no missing values — confirm.
        df['거래변동량'] = volume_change.astype('int64')

# 3. 종가 데이터 정규화

In [4]:
# Min-max scale the closing price of every stock independently (one scaler is
# fitted per stock, using MinMaxScaler's default settings).
for sector_name, df_dict in df_sector.items():
    for stockcode, df in df_dict.items():
        # fit_transform expects a 2-D input, hence the single-column frame.
        scaled_close = MinMaxScaler().fit_transform(df[['종가']])
        # Assign the flattened array so values are placed by position; wrapping
        # the result in a new DataFrame would align on index labels and yield
        # NaN for any frame whose index is not the default 0..n-1.
        df['정규화_종가'] = scaled_close.ravel()

# 4. 데이터가 부족한 항목 조회

In [5]:
# Stocks whose frame has fewer rows than this are reported as lacking data.
MIN_ROWS = 3000

# For every sector, collect the stock codes whose frame is shorter than MIN_ROWS.
lack_sector_list = {
    sector_name: [code for code, frame in df_dict.items() if len(frame) < MIN_ROWS]
    for sector_name, df_dict in df_sector.items()
}
print(lack_sector_list)
            
            
            

{'Food': ['271560'], 'Clothing': ['383220'], 'Chemical': [], 'Medicine': ['207940', '302440', '128940'], 'Non_Metal': ['300720'], 'Metal': [], 'Machine': ['241560', '112610'], 'Electronic': ['373220'], 'Construction': [], 'Transport': ['180640'], 'Distribution': ['028260', '282330', '139480'], 'Power': [], 'Tele': [], 'Finance': ['323410', '377300'], 'Brokerage': [], 'Insurer': [], 'Service': ['259960', '018260'], 'Manufacturer': ['329180']}


In [6]:
# Stack all stock frames of a sector into one frame per sector. The index is
# rebuilt from scratch, discarding the stock-code keys and original row labels
# that concat would otherwise keep.
df_concat_sector = {
    sector_name: pd.concat(df_dict, axis=0).reset_index(drop=True)
    for sector_name, df_dict in df_sector.items()
}
    

In [7]:
# Print a summary (row count, columns, non-null counts, dtypes, memory) of the
# concatenated 'Food' sector frame.
df_concat_sector['Food'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13230 entries, 0 to 13229
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   일자      13230 non-null  datetime64[ns]
 1   거래량     13230 non-null  int32         
 2   종가      13230 non-null  int32         
 3   등락률     13230 non-null  float64       
 4   거래변동량   13230 non-null  int64         
 5   정규화_종가  13230 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int32(2), int64(1)
memory usage: 516.9 KB


In [8]:
# Display the 'Food' entry of df_sector: a dict of per-stock DataFrames keyed by stock code.
# NOTE(review): this renders every frame in the sector; consider showing a single
# frame's .head() to keep the output short.
df_sector['Food']

{'097950':              일자     거래량      종가       등락률  거래변동량    정규화_종가
 0    2022-07-08   26223  389000 -0.765306   4713  0.693333
 1    2022-07-07   21510  392000 -0.254453  11143  0.702222
 2    2022-07-06   32653  393000 -1.132075   2907  0.705185
 3    2022-07-05   35560  397500 -0.749064  24183  0.718519
 4    2022-07-04   59743  400500  2.168367   2056  0.727407
 ...         ...     ...     ...       ...    ...       ...
 2995 2010-05-18   93284  220000  5.263158  21209  0.192593
 2996 2010-05-17   72075  209000  0.480769  63428  0.160000
 2997 2010-05-14  135503  208000 -4.367816  76719  0.157037
 2998 2010-05-13   58784  217500  0.230415  16317  0.185185
 2999 2010-05-12   75101  217000  0.000000      0  0.183704
 
 [3000 rows x 6 columns],
 '271560':              일자      거래량      종가       등락률   거래변동량    정규화_종가
 0    2022-07-08   118773  106000 -1.395349   10625  0.345912
 1    2022-07-07   129398  107500  2.380952   36373  0.364780
 2    2022-07-06   165771  105000  0.000000   

In [9]:
# Average every column across the stocks of a sector for each date ('일자'),
# keeping the date as a regular column rather than the index.
df_mean_sector = {
    sector_name: sector_df.groupby(['일자'], as_index=False).mean()
    for sector_name, sector_df in df_concat_sector.items()
}
    

In [10]:
# Display the per-date mean frame of the 'Food' sector.
df_mean_sector['Food']

Unnamed: 0,일자,거래량,종가,등락률,거래변동량,정규화_종가
0,2010-05-12,37377.50,139200.0,0.000000,0.00,0.224956
1,2010-05-13,39789.50,139800.0,0.352270,11989.00,0.225829
2,2010-05-14,56946.75,135837.5,-2.345684,23239.25,0.210127
3,2010-05-17,51829.75,136225.0,-0.473023,26597.00,0.203685
4,2010-05-18,93281.00,140650.0,1.299365,43176.25,0.207569
...,...,...,...,...,...,...
2995,2022-07-04,119710.20,199030.0,0.332322,29208.60,0.464094
2996,2022-07-05,119137.60,198250.0,-0.642934,47426.20,0.455375
2997,2022-07-06,104889.40,197660.0,0.165308,24833.00,0.455939
2998,2022-07-07,63088.40,197790.0,0.501700,41801.00,0.462412


# 섹터별 데이터와 코스피 지수 합치기

In [12]:
# Attach the KOSPI data to each sector's per-date means. A left join on the date
# column ('일자') keeps every sector row even when kospi has no matching date.
result = {
    sector_name: sector_mean.merge(kospi, on='일자', how='left')
    for sector_name, sector_mean in df_mean_sector.items()
}
    

In [17]:
# Persist the merged per-sector frames (the `result` dict) to disk as a pickle.
with open('result.pickle', 'wb') as result_file:
    pickle.dump(result, result_file)