# 1. 환경설정

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings('ignore')
import time
import pickle
from tqdm import tqdm

In [2]:
import pickle
# Load the previously pickled `sector` object. Later cells iterate it with
# `.items()` as a mapping of sector name -> list of stock codes.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that were produced by a trusted run of this project.
with open('sector.pickle','rb') as f:
    sector = pickle.load(f)

# 2. 일별 종가 데이터 수집 - 10개
키움 API는 구름IDE에서 실행되지 않아, 네이버 금융의 일별 시세 페이지를 웹 크롤링하여 데이터를 수집한다.

In [3]:
def get_stock_price(code, num_of_pages, headers=None):
    """Download daily price rows for one stock from Naver Finance.

    The first request reads the pagination block to learn the last available
    page; then up to ``num_of_pages`` pages of the daily-price table are
    downloaded and combined.

    Args:
        code: Stock code inserted into the ``code=`` query parameter.
        num_of_pages: Maximum number of table pages to download.
        headers: Optional HTTP headers for every request. Defaults to a
            browser-like ``User-agent`` so that the function no longer
            depends on a module-level ``headers`` variable.

    Returns:
        A DataFrame with columns ``일자, 거래량, 시가, 고가, 저가, 종가``,
        sorted by ``일자`` from newest to oldest with a fresh 0..n-1 index.
        An empty DataFrame with those columns is returned when no page is
        requested (``num_of_pages`` < 1).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If a request does not complete within 10 seconds.
    """
    # Local import keeps this cell self-contained; StringIO is needed because
    # newer pandas versions deprecate passing literal HTML to read_html.
    from io import StringIO

    if headers is None:
        headers = {'User-agent': 'Mozilla/5.0'}
    numeric_columns = ['종가', '시가', '고가', '저가', '거래량']
    output_columns = ['일자', '거래량', '시가', '고가', '저가', '종가']

    url = f"http://finance.naver.com/item/sise_day.nhn?code={code}"
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    bs = BeautifulSoup(response.text, 'html.parser')

    # The "pgRR" cell holds the link to the last page; its href ends in
    # "...page=<N>". If the cell or its link is missing, fall back to a single
    # page instead of failing on None (assumes such listings fit on one page
    # -- TODO confirm against the live markup).
    pgrr = bs.find("td", class_="pgRR")
    if pgrr is not None and pgrr.a is not None:
        last_page = int(pgrr.a["href"].split('=')[-1])
    else:
        last_page = 1
    # Never request more pages than exist or than the caller asked for.
    pages = min(last_page, num_of_pages)

    frames = []
    for page in range(1, pages + 1):
        page_url = f'{url}&page={page}'
        page_response = requests.get(page_url, headers=headers, timeout=10)
        page_response.raise_for_status()
        frames.append(pd.read_html(StringIO(page_response.text))[0])

    if not frames:
        return pd.DataFrame(columns=output_columns)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df = pd.concat(frames)
    df['일자'] = pd.to_datetime(df['날짜'])
    # Drop every row with a missing value (presumably the blank spacer rows
    # of the HTML table) so the integer cast below cannot hit NaN.
    df = df.dropna()
    df[numeric_columns] = df[numeric_columns].astype(int)
    df = df[output_columns]
    # Newest first, which is the order the original sort-then-reverse produced.
    df = df.sort_values(by='일자', ascending=False).reset_index(drop=True)
    return df

In [4]:
# Crawl settings: a browser-like User-agent header and the number of
# daily-price pages to request per stock.
headers = {'User-agent': 'Mozilla/5.0'}
pages = 1

# Result container: one empty per-sector dict, filled with
# stock code -> DataFrame by the crawl loop.
df_sector_UTD = {sector_name: {} for sector_name in sector}


In [5]:
# Crawl every stock of every sector and store the resulting frame under
# df_sector_UTD[sector][code]; tqdm shows one progress bar per sector.
for sector_name in sector:
    sector_prices = df_sector_UTD[sector_name]
    for stock_code in tqdm(sector[sector_name]):
        sector_prices[stock_code] = get_stock_price(stock_code, pages)
        # Short pause between consecutive requests.
        time.sleep(0.01)

100%|██████████| 5/5 [00:02<00:00,  2.45it/s]
100%|██████████| 5/5 [00:02<00:00,  2.50it/s]
100%|██████████| 5/5 [00:01<00:00,  2.60it/s]
100%|██████████| 5/5 [00:01<00:00,  2.52it/s]
100%|██████████| 4/4 [00:01<00:00,  2.48it/s]
100%|██████████| 5/5 [00:02<00:00,  2.38it/s]
100%|██████████| 4/4 [00:01<00:00,  2.52it/s]
100%|██████████| 5/5 [00:01<00:00,  2.61it/s]
100%|██████████| 4/4 [00:01<00:00,  2.52it/s]
100%|██████████| 5/5 [00:02<00:00,  2.49it/s]
100%|██████████| 5/5 [00:01<00:00,  2.51it/s]
100%|██████████| 3/3 [00:01<00:00,  2.52it/s]
100%|██████████| 3/3 [00:01<00:00,  2.50it/s]
100%|██████████| 5/5 [00:01<00:00,  2.52it/s]
100%|██████████| 4/4 [00:01<00:00,  2.60it/s]
100%|██████████| 4/4 [00:01<00:00,  2.63it/s]
100%|██████████| 5/5 [00:02<00:00,  2.39it/s]
100%|██████████| 6/6 [00:02<00:00,  2.38it/s]


# 이전 data와 최신 data 병합(최신화)

In [6]:
# Load the previously saved price history (sector name -> stock code ->
# DataFrame, as the merge loop below indexes it). Although the file is named
# df_sector_UTD.pickle, it is bound to `df_sector` here, separate from the
# in-memory `df_sector_UTD` that holds the freshly crawled rows.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files produced by a trusted run of this project.
with open('df_sector_UTD.pickle','rb') as f:
    df_sector = pickle.load(f)

In [7]:
# Fold the freshly crawled rows into the saved history, sector by sector.
# Iterating the fresh data (instead of the saved data) means a newly crawled
# sector/code is added to the history, and a saved code that was not crawled
# in this run is left untouched rather than raising KeyError.
for sector_name, fresh_by_code in df_sector_UTD.items():
    saved_by_code = df_sector.setdefault(sector_name, {})
    for stock_code, fresh_df in fresh_by_code.items():
        # Fresh rows go first so that drop_duplicates (which keeps the first
        # occurrence) prefers the newly crawled values for a repeated date.
        frames = [fresh_df]
        if stock_code in saved_by_code:
            frames.append(saved_by_code[stock_code])
        df = pd.concat(frames, axis=0)
        df = df.drop_duplicates(subset=['일자']).reset_index(drop=True)
        saved_by_code[stock_code] = df

In [8]:
# Spot-check one merged frame: sector 'Food', stock code '097950'.
df_sector['Food']['097950']

Unnamed: 0,일자,거래량,시가,고가,저가,종가
0,2022-08-11,66360,425000,425000,417000,421500
1,2022-08-10,66726,426500,428500,414500,423500
2,2022-08-09,265521,402000,437000,402000,428500
3,2022-08-08,28337,389000,394500,386000,394500
4,2022-08-05,26817,386500,391000,385000,389000
...,...,...,...,...,...,...
3010,2010-06-01,31617,214000,215000,210000,211000
3011,2010-05-31,30990,215000,215000,211500,214000
3012,2010-05-28,64458,211500,213000,208000,213000
3013,2010-05-27,46323,208500,212000,205000,211500


In [9]:
# Persist the merged history. This overwrites the same df_sector_UTD.pickle
# file that the saved history was loaded from earlier in the notebook.
with open('df_sector_UTD.pickle','wb') as f:
    pickle.dump(df_sector,f)