# 1. 환경설정

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings('ignore')
import time
import pickle
from tqdm import tqdm

In [3]:
import pickle

# Restore the sector -> list-of-stock-codes mapping saved by an earlier step.
# NOTE: pickle executes arbitrary code on load; only open files you created.
with open('certificated_stock_dic.pickle', mode='rb') as pickle_file:
    certificated_stock_dic = pickle.load(pickle_file)

In [4]:
# Display the loaded mapping (sector name -> list of stock codes).
certificated_stock_dic

{'Food': ['097950'],
 'Clothing': [],
 'Chemical': ['090430'],
 'Medicine': [],
 'Non_Metal': ['003410'],
 'Metal': ['010130'],
 'Machine': [],
 'Electronic': [],
 'Construction': ['047040'],
 'Transport': ['028670'],
 'Distribution': ['028260'],
 'Power': [],
 'Tele': ['032640'],
 'Finance': ['006800'],
 'Brokerage': ['005940', '008560'],
 'Insurer': [],
 'Service': [],
 'Manufacturer': ['012330']}

# 2. 일별 종가 데이터 수집 - 10개


In [4]:
def get_stock_price(code, num_of_pages):
    """Scrape the daily price table of one stock from Naver Finance.

    Args:
        code: Stock code placed in the ``code=`` query parameter
            (e.g. ``'097950'``).
        num_of_pages: Maximum number of listing pages to download. The value
            is capped at the last page number advertised by the site's pager.

    Returns:
        pandas.DataFrame with the columns ``일자, 거래량, 시가, 고가, 저가, 종가``
        sorted by date, newest first, with a fresh ``0..n-1`` index. The
        price and volume columns are converted to ``int``.

    Raises:
        ValueError: If ``num_of_pages`` is smaller than 1.
        requests.HTTPError: If the site answers with an HTTP error status.
    """
    # Function-scope import so the cell works without touching the import cell.
    from io import StringIO

    if num_of_pages < 1:
        raise ValueError("num_of_pages must be at least 1")

    # Keep the request settings local: the function must not depend on a
    # module-level `headers` variable that is only defined in a later cell.
    request_headers = {'User-agent': 'Mozilla/5.0'}
    timeout_seconds = 10  # never hang forever on a stalled connection

    def _fetch_html(target_url):
        """Download one page and return its HTML, failing loudly on HTTP errors."""
        response = requests.get(target_url, headers=request_headers,
                                timeout=timeout_seconds)
        response.raise_for_status()
        return response.text

    url = f"http://finance.naver.com/item/sise_day.nhn?code={code}"
    soup = BeautifulSoup(_fetch_html(url), 'html.parser')

    # The "go to last page" pager cell carries the highest page number in its
    # link. It may be missing (presumably when the listing fits on a single
    # page), in which case only page 1 is assumed to exist.
    pgrr = soup.find("td", class_="pgRR")
    if pgrr is None or pgrr.a is None:
        last_page = 1
    else:
        last_page = int(pgrr.a["href"].split('=')[-1])
    pages = min(last_page, num_of_pages)  # never request more pages than exist

    # Collect every page first and concatenate once; DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on each iteration.
    frames = []
    for page in range(1, pages + 1):
        page_url = '{}&page={}'.format(url, page)
        # Wrap the HTML in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas versions.
        frames.append(pd.read_html(StringIO(_fetch_html(page_url)))[0])
    df = pd.concat(frames)

    df['일자'] = pd.to_datetime(df['날짜'])
    df = df.dropna()  # drops rows with missing values (e.g. spacer rows)
    numeric_columns = ['종가', '시가', '고가', '저가', '거래량']
    df[numeric_columns] = df[numeric_columns].astype(int)
    df = df[['일자', '거래량', '시가', '고가', '저가', '종가']]  # column order

    # Newest date first. A single descending sort replaces the original
    # "sort ascending, then reverse with a label-based slice" sequence.
    return df.sort_values(by='일자', ascending=False).reset_index(drop=True)

In [6]:
# Scrape configuration: browser-like User-agent header and the number of
# listing pages to pull per stock.
headers = {'User-agent': 'Mozilla/5.0'}
pages = 1

# One empty per-sector dict (stock code -> DataFrame), filled by the
# download loop.
df_sector_UTD = {sector_name: {} for sector_name in certificated_stock_dic}


In [8]:
# Download the most recent `pages` page(s) of daily prices for every stock,
# showing one progress bar per sector.
for sector_name, code_list in certificated_stock_dic.items():
    progress = tqdm(code_list)
    for stock_code in progress:
        latest_prices = get_stock_price(stock_code, pages)
        df_sector_UTD[sector_name][stock_code] = latest_prices
        time.sleep(0.01)  # short pause between consecutive requests

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.90it/s]
0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.17it/s]
0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.46it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.78it/s]
0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████

In [9]:
# Display the freshly scraped frames (sector -> stock code -> DataFrame).
df_sector_UTD

{'Food': {'097950':           일자    거래량      시가      고가      저가      종가
  0 2022-08-05   2722  386500  389000  385500  386500
  1 2022-08-04  24225  388000  389500  383500  384500
  2 2022-08-03  30267  385000  387500  383500  387500
  3 2022-08-02  29317  387000  388000  383000  385500
  4 2022-08-01  37653  391500  392000  383500  387000
  5 2022-07-29  27618  397500  397500  389000  391500
  6 2022-07-28  25944  395500  398000  391000  395000
  7 2022-07-27  16778  396500  400000  394000  395000
  8 2022-07-26  19443  395000  400500  395000  397000
  9 2022-07-25  15932  394500  398000  393500  395000},
 'Clothing': {},
 'Chemical': {'090430':           일자     거래량      시가      고가      저가      종가
  0 2022-08-05   21005  127500  128500  127000  128000
  1 2022-08-04  196672  128000  129000  126500  127500
  2 2022-08-03  228973  126500  129000  126500  127500
  3 2022-08-02  142065  127500  128000  126000  127000
  4 2022-08-01  168532  129000  129000  127500  128000
  5 2022-07-29  5

# 3. 이전 data와 최신 data 병합(최신화)

In [10]:
# Load the previously stored price history (sector -> stock code -> DataFrame).
# NOTE: pickle executes arbitrary code on load; only open files you created.
with open('df_sector_UTD.pickle', mode='rb') as history_file:
    df_sector = pickle.load(history_file)

In [12]:
# Merge the freshly scraped rows into the stored history, one stock at a time.
for sector_name, df_dict in df_sector_UTD.items():
    # A sector that is new since the last save has no stored entry yet;
    # create it instead of failing with KeyError.
    stored_sector = df_sector.setdefault(sector_name, {})
    for stock_code, fresh_df in df_dict.items():
        previous_df = stored_sector.get(stock_code)
        # Fresh rows go first so drop_duplicates (keep='first') prefers the
        # newly scraped values when a date exists in both frames.
        frames = [fresh_df] if previous_df is None else [fresh_df, previous_df]
        merged = pd.concat(frames, axis=0)
        merged = merged.drop_duplicates(subset=['일자'])
        # Sort by the same date column used for de-duplication rather than by
        # column position, so the result does not depend on column order.
        merged = merged.sort_values(by='일자', ascending=True)
        stored_sector[stock_code] = merged.reset_index(drop=True)

In [13]:
# Spot-check one merged series: sector 'Food', stock code '097950'.
df_sector['Food']['097950']

Unnamed: 0,일자,거래량,시가,고가,저가,종가
0,2010-06-08,50727,211000,213500,209500,211000
1,2010-06-09,52375,212000,218000,210500,214000
2,2010-06-10,61195,214500,219500,213500,219000
3,2010-06-11,89684,217000,220000,211500,214500
4,2010-06-14,41941,216000,220000,215500,219000
...,...,...,...,...,...,...
2998,2022-08-01,37653,391500,392000,383500,387000
2999,2022-08-02,29317,387000,388000,383000,385500
3000,2022-08-03,30267,385000,387500,383500,387500
3001,2022-08-04,24225,388000,389500,383500,384500


In [14]:
# Persist the merged history, overwriting the file it was loaded from.
with open('df_sector_UTD.pickle', mode='wb') as history_file:
    pickle.dump(df_sector, history_file)