In [1]:
import time
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# 데이터 로드

In [2]:
import pickle
with open('df_sector.pickle','rb') as f:
    df_sector = pickle.load(f)
with open('kospi.pickle','rb') as f:
    kospi = pickle.load(f)

# 1. 등락률 계산
# 2. 거래변동량 계산

In [3]:
#등락률 계산 : 일 등락률 = (오늘종가 – 어제종가) / 어제종가 * 100 => 평균
#등락률
for sector_name, df_dict in df_sector.items():
    for stockcode, df in df_dict.items():
        등락률 = []
        거래변동량 = []
        for i in range(len(df)): #현재가 수
            if i == len(df)-1: #각 항목의 마지막 인덱스인경우
                break
            else:    
                등락률.append((df['종가'][i]-df['종가'][i+1])/df['종가'][i+1]*100)
                거래변동량.append(abs((df['거래량'][i]-df['거래량'][i+1])))
        등락률.append(0)
        거래변동량.append(0)
        df['등락률']=등락률
        df['거래변동량']=거래변동량
        del df['거래량']

# 3. 종가data 정규화

In [4]:
for sector_name, df_dict in df_sector.items():
    for stockcode, df in df_dict.items():
        scaler = MinMaxScaler()
        scaler.fit(pd.DataFrame(data = df['종가']))
        df_scaled = scaler.transform(pd.DataFrame(data = df['종가']))
        df['정규화_종가']=pd.DataFrame(data = df_scaled)
        del df['종가']

# 4. 데이터가 부족한 항목 조회

In [5]:
lack_sector_list={}
for sector_name, df_dict in df_sector.items():
    lack_stock_list = []
    for stockcode, df in df_dict.items():
        if len(df)<3000:
            lack_stock_list.append(stockcode)
    lack_sector_list[sector_name] = lack_stock_list
print(lack_sector_list)
            
            
            

{'Food': ['271560'], 'Clothing': ['383220'], 'Chemical': [], 'Medicine': ['207940', '302440', '128940'], 'Non_Metal': ['300720'], 'Metal': [], 'Machine': ['241560', '112610'], 'Electronic': ['373220'], 'Construction': [], 'Transport': ['180640'], 'Distribution': ['028260', '282330', '139480'], 'Power': [], 'Tele': [], 'Finance': ['323410', '377300'], 'Brokerage': [], 'Insurer': [], 'Service': ['259960', '018260'], 'Manufacturer': ['329180']}


In [6]:
df_concat_sector = {}
for sector_name, df_dict in df_sector.items():
    sector_df = pd.concat(df_dict,axis=0)
    sector_df.reset_index(drop=True, inplace=True)
    df_concat_sector[sector_name] = sector_df
    

In [7]:
df_concat_sector['Food'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13230 entries, 0 to 13229
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   일자      13230 non-null  datetime64[ns]
 1   등락률     13230 non-null  float64       
 2   거래변동량   13230 non-null  int64         
 3   정규화_종가  13230 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 413.5 KB


In [8]:
df_mean_sector = {}
for sector_name, sector_df in df_concat_sector.items():
    df_mean = sector_df.groupby(['일자'],as_index=False).mean()
    df_mean_sector[sector_name] = df_mean
    

In [9]:
df_mean_sector['Food']

Unnamed: 0,일자,등락률,거래변동량,정규화_종가
0,2010-05-12,0.000000,0.00,0.224956
1,2010-05-13,0.352270,11989.00,0.225829
2,2010-05-14,-2.345684,23239.25,0.210127
3,2010-05-17,-0.473023,26597.00,0.203685
4,2010-05-18,1.299365,43176.25,0.207569
...,...,...,...,...
2995,2022-07-04,0.332322,29208.60,0.464094
2996,2022-07-05,-0.642934,47426.20,0.455375
2997,2022-07-06,0.165308,24833.00,0.455939
2998,2022-07-07,0.501700,41801.00,0.462412


# 5. 코스피 증감량 계산

In [10]:
코스피지수_증감량 = []
for i in range(len(kospi)):
    if i == len(kospi)-1:
        break
    else:    
        코스피지수_증감량.append((kospi['코스피지수'][i]-kospi['코스피지수'][i+1]))
코스피지수_증감량.append(0)
kospi['코스피지수_증감량']=코스피지수_증감량
del(kospi['코스피지수'])
kospi

Unnamed: 0,일자,코스피지수_증감량
0,2022-07-08,16.34
1,2022-07-07,42.26
2,2022-07-06,-49.77
3,2022-07-05,41.44
4,2022-07-04,-5.08
...,...,...
2995,2010-05-18,-8.27
2996,2010-05-17,-44.12
2997,2010-05-14,1.05
2998,2010-05-13,31.55


# 6. 섹션별 데이터와 코스피 지수 합치기

In [11]:
result = {}
for sector_name, df in df_mean_sector.items():
    result[sector_name] =pd.merge(df,kospi,on='일자',how='left')
    

In [12]:
result['Food']

Unnamed: 0,일자,등락률,거래변동량,정규화_종가,코스피지수_증감량
0,2010-05-12,0.000000,0.00,0.224956,0.00
1,2010-05-13,0.352270,11989.00,0.225829,31.55
2,2010-05-14,-2.345684,23239.25,0.210127,1.05
3,2010-05-17,-0.473023,26597.00,0.203685,-44.12
4,2010-05-18,1.299365,43176.25,0.207569,-8.27
...,...,...,...,...,...
2995,2022-07-04,0.332322,29208.60,0.464094,-5.08
2996,2022-07-05,-0.642934,47426.20,0.455375,41.44
2997,2022-07-06,0.165308,24833.00,0.455939,-49.77
2998,2022-07-07,0.501700,41801.00,0.462412,42.26


In [13]:
# 7. 데이터프레임 반올림

In [14]:
for sector_name, df in result.items():
    df['정규화_종가'] = df['정규화_종가'].round(4)
    df['등락률'] = df['등락률'].round(4)
    df['거래변동량'] =df['거래변동량'].astype(int)

    

In [15]:
result['Food']

Unnamed: 0,일자,등락률,거래변동량,정규화_종가,코스피지수_증감량
0,2010-05-12,0.0000,0,0.2250,0.00
1,2010-05-13,0.3523,11989,0.2258,31.55
2,2010-05-14,-2.3457,23239,0.2101,1.05
3,2010-05-17,-0.4730,26597,0.2037,-44.12
4,2010-05-18,1.2994,43176,0.2076,-8.27
...,...,...,...,...,...
2995,2022-07-04,0.3323,29208,0.4641,-5.08
2996,2022-07-05,-0.6429,47426,0.4554,41.44
2997,2022-07-06,0.1653,24833,0.4559,-49.77
2998,2022-07-07,0.5017,41801,0.4624,42.26


In [16]:
with open('result.pickle','wb') as f:
    pickle.dump(result,f)