In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='Malgun Gothic')
rc('axes', unicode_minus=False)

In [2]:
import datetime
import time
import os
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests
import json
from tqdm import tqdm

In [3]:
# Today's date and its compact YYMMDD rendering, used as a suffix in output file names.
date_today = datetime.date.today()
date_key = format(date_today, '%y%m%d')

## 종목코드

In [4]:
# KOSPI only

In [5]:
# Market segment code for the KRX download (used as the `searchType` query value).
market_code = 'stockMkt'
# Known market -> code pairs, kept here for reference:
# 'kospi': 'stockMkt',
# 'kosdaq': 'kosdaqMkt',
# 'konex': 'konexMkt'

In [6]:
# KRX KIND listed-company download URL for the selected market segment.
sto_code_url = f'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType={market_code}'

In [7]:
# Parse the first HTML table returned by the download URL into a DataFrame.
# NOTE(review): the preview shows 종목코드 parsed as a number (e.g. 5560), so any
# leading zeros are dropped -- confirm whether consumers need zero-padded codes.
sto_code_df = pd.read_html(sto_code_url)[0]
sto_code_df.head()

Unnamed: 0,회사명,종목코드,업종,주요제품,상장일,결산월,대표자명,홈페이지,지역
0,JS전선,5560,절연선 및 케이블 제조업,"선박선,고무선,전력선,통신선 제조",2007-11-12,12월,이익희,http://www.jscable.co.kr,충청남도
1,거북선2호,101380,,운송장비(선박) 임대,2008-04-25,12월,신주선,,부산광역시
2,거북선6호,114140,,,2009-10-01,12월,김연신,,제주특별자치도
3,교보메리츠,64900,,"부동산 투자,운용",2002-01-30,12월,김 상 진,,서울특별시
4,국제관광공사,28780,,,1966-03-18,12월,,,


In [8]:
# save path, file_name

In [9]:
# Destination CSV path: data/code_list/code_list_<market_code>_<YYMMDD>.csv
save_n_code_df = f'data/code_list/code_list_{market_code}_{date_key}.csv'.strip()
save_n_code_df

'data/code_list/code_list_stockMkt_190816.csv'

In [10]:
# save

In [11]:
# Create the target folder first so the save also works on a fresh checkout
# where data/code_list does not exist yet.
os.makedirs(os.path.dirname(save_n_code_df) or '.', exist_ok=True)
# Written as EUC-KR; cp949 was noted as an alternative encoding.
sto_code_df.to_csv(save_n_code_df, encoding='euc-kr', index=False) #cp949
print('save >> ' + save_n_code_df)

save >> data/code_list/code_list_stockMkt_190816.csv


In [12]:
# check

In [13]:
# List the output folder to confirm that today's code-list file was written.
os.listdir('data/code_list')

['code_list_190728.csv',
 'code_list_190729.csv',
 'code_list_190730.csv',
 'code_list_stockMkt_190730.csv',
 'code_list_stockMkt_190801.csv',
 'code_list_stockMkt_190814.csv',
 'code_list_stockMkt_190815.csv',
 'code_list_stockMkt_190816.csv']

## 주요 지표

In [14]:
# Indicator categories to collect; each entry is a key of the lookup tables in url_creater.
market = ['국내', '해외', '환율', 'WTI', '휘발유', '국제금', '국내금']

In [15]:
def url_creater(what) :
    """Build the paged-listing URL prefixes for one indicator category.

    Parameters
    ----------
    what : str
        Category key: one of '국내', '해외', '환율', 'WTI', '휘발유',
        '국제금', '국내금'.

    Returns
    -------
    tuple of (list of str, list of str)
        The URL prefixes (each ends with the page query parameter, so the
        caller only has to append a page number) and the symbol codes they
        were built from, in the same order.

    Raises
    ------
    KeyError
        If ``what`` is not one of the known categories.
    """
    site_root = 'https://finance.naver.com/'
    # Endpoint path (up to and including the symbol parameter) per category.
    endpoints = {
        '국내' : 'sise/sise_index_day.nhn?code=',
        '해외' : 'world/worldDayListJson.nhn?symbol=',
        '환율' : 'marketindex/exchangeDailyQuote.nhn?marketindexCd=FX_',
        'WTI' : 'marketindex/worldDailyQuote.nhn?marketindexCd=',
        '휘발유' : 'marketindex/oilDailyQuote.nhn?marketindexCd=',
        '국제금' : 'marketindex/worldDailyQuote.nhn?marketindexCd=',
        '국내금' : 'marketindex/goldDailyQuote.nhn?',
    }
    # Symbols to fetch per category; one URL is produced for each symbol.
    symbols = {
        '국내' : ['KOSPI', 'KOSDAQ', 'KPI200'],
        '해외' : ['DJI@DJI', 'NII@NI225', 'LNS@FTSE100',
                'NAS@IXIC', 'SHS@000001', 'PAS@CAC40',
                'SPI@SPX', 'HSI@HSI', 'XTR@DAX30'],
        '환율' : ['USDKRW', 'JPYKRW', 'EURKRW', 'CNYKRW'],
        'WTI' : ['OIL_CL'],
        '휘발유' : ['OIL_GSL'],
        '국제금' : ['CMDT_GC'],
        '국내금' : [''],
    }
    # Query-string tail that ends with the (still empty) page parameter.
    page_suffix = {
        '국내' : '&page=',
        '해외' : '&fdtc=0&page=',
        '환율' : '&page=',
        'WTI' : '&fdtc=2&page=',
        '휘발유' : '&page=',
        '국제금' : '&fdtc=2&page=',
        '국내금' : '&page=',
    }
    path = endpoints[what]
    suffix = page_suffix[what]
    target_code = list(symbols[what])
    target_url = [f'{site_root}{path}{code}{suffix}' for code in target_code]
    return target_url, target_code

In [16]:
def last_page_finder(url, default_last_page=500):
    """Return the number of the last page of a paginated listing.

    The page at ``url`` is fetched and its first two ``<table>`` elements are
    searched, in order, for a "last page" pager cell (``<td class="pgRR">``).
    The page number is read from the text after the final ``=`` in that
    cell's link.

    Parameters
    ----------
    url : str
        URL of any page of the listing (the caller passes page 1).
    default_last_page : int, optional
        Value returned when no pager link can be located: no table on the
        page, no ``pgRR`` cell in the first two tables, or a ``pgRR`` cell
        without a link. Defaults to 500.

    Returns
    -------
    int
        The last page number, or ``default_last_page`` as a fallback.

    Raises
    ------
    ValueError
        If the pager link does not end in an integer page number.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        target_html = BeautifulSoup(response.read(), 'lxml') #html.parser

    # Only the first two tables are inspected; the first one that carries a
    # pager cell wins.
    pager_cells = []
    for table in target_html.find_all("table")[:2]: #,align="center"
        pager_cells = table.find_all("td", class_="pgRR")
        if pager_cells:
            break
    # No pager in either table used to raise IndexError; use the same
    # fallback as the "no table at all" case instead.
    if not pager_cells:
        return default_last_page

    link = pager_cells[0].a
    href = link.get('href') if link is not None else None
    if not href:
        return default_last_page
    return int(href.split("=")[-1].strip())

In [22]:
def crawl_html(last_page_num, i) :
    """Download pages 1..last_page_num of an HTML listing and stack their tables.

    Parameters
    ----------
    last_page_num : int
        Number of the last page to fetch (inclusive).
    i : str
        URL prefix that ends with the page query parameter; the page number
        is appended to it.

    Returns
    -------
    pandas.DataFrame
        The first table of every page, concatenated in page order with a
        fresh 0..n-1 index. Empty when ``last_page_num`` is below 1.

    Raises
    ------
    ValueError
        Propagated from ``pandas.read_html`` (it raises this when a page
        contains no parseable table).
    """
    # Collect the per-page frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every page.
    page_frames = [
        pd.read_html(f'{i}{page}', header=0)[0]
        for page in range(1, last_page_num + 1)
    ]
    # pd.concat([]) raises ValueError; keep returning an empty frame instead.
    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames, ignore_index=True)

In [23]:
def crawl_ajax(last_page_num, i) :
    """Download pages 1..last_page_num of a JSON listing and stack the records.

    Parameters
    ----------
    last_page_num : int
        Number of the last page to fetch (inclusive).
    i : str
        URL prefix that ends with the page query parameter; the page number
        is appended to it.

    Returns
    -------
    pandas.DataFrame
        One frame built from every page's JSON payload, concatenated in page
        order with a fresh 0..n-1 index. When the payload has an ``xymd``
        column, the columns are renamed via ``exp_cols`` so they match the
        Korean column names used for the HTML listings. Empty when
        ``last_page_num`` is below 1.
    """
    exp_cols = {"xymd":"날짜","clos":'종가',"diff":'전일비',
            "open":'시가',"high":'고가',"low":'저가',
            "rate":"rate","gvol":"gvol", "symb":"symb"}
    # Collect the per-page frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every page.
    page_frames = [
        pd.DataFrame(requests.get(f'{i}{page}').json())
        for page in range(1, last_page_num + 1)
    ]
    # pd.concat([]) raises ValueError; keep returning an empty frame instead.
    if page_frames:
        temp_df = pd.concat(page_frames, ignore_index=True)
    else:
        temp_df = pd.DataFrame()
    if 'xymd' in temp_df.columns :
        temp_df = temp_df.rename(columns=exp_cols)
    return temp_df

In [46]:
def info_collector(market) :
    """Collect every daily series of one category into a date-indexed table.

    For each URL prefix produced by ``url_creater(market)`` the last page is
    looked up, all pages are crawled (HTML tables first, falling back to the
    JSON crawler when ``crawl_html`` raises ``ValueError``), and the result is
    joined column-wise onto the running table.

    Parameters
    ----------
    market : str
        Category key accepted by ``url_creater``.

    Returns
    -------
    pandas.DataFrame
        Indexed by '날짜'; every column name carries a ``_<symbol code>``
        suffix so series from different symbols do not collide.
    """
    url_prefixes, codes = url_creater(market)
    info_df = pd.DataFrame()
    # tqdm wraps the list itself (not the zip) so the bar still knows its total.
    progress = tqdm(url_prefixes, desc='collector', mininterval=1)
    for prefix, code in zip(progress, codes) :
        last_page_num = last_page_finder(prefix + '1')
        try :
            page_rows = crawl_html(last_page_num, prefix)
        except ValueError :
            page_rows = crawl_ajax(last_page_num, prefix)
        # Drop fully empty rows, keep one row per date, then index by date.
        series_df = (
            page_rows
            .dropna(axis=0, how='all')
            .drop_duplicates(['날짜'])
            .set_index('날짜')
        )
        series_df.columns = series_df.columns + '_' + code
        info_df = pd.concat([info_df, series_df], axis=1, sort=False)
    # Remove any row whose index label is the literal '날짜'
    # (presumably a repeated header row -- confirm against the source pages).
    info_df = info_df.drop('날짜', axis=0, errors='ignore')
    info_df.index.names = ['날짜']
    return info_df

In [47]:
# run

In [48]:
start = time.time()

# Create the output folder up front: without it the first to_csv call fails
# only after a full category has already been crawled.
os.makedirs('data/market_info', exist_ok=True)

# Crawl each category in turn and write one CSV per category, suffixed with
# today's date key.
for i in tqdm(market) :
    market_info_df = info_collector(i)
    save_n_market_info = (
        'data/market_info/' + 'market_info_' + i + '_' + date_key + '.csv').strip()
    market_info_df.to_csv(save_n_market_info, encoding='euc-kr', index=True)
    print('save >> ' + save_n_market_info)

print()
print(f'실행시간: {time.time() - start :.2f} sec')






  0%|          | 0/7 [00:00<?, ?it/s]





collector:   0%|          | 0/3 [00:00<?, ?it/s]





collector:  33%|███▎      | 1/3 [02:32<05:05, 152.67s/it]





collector:  67%|██████▋   | 2/3 [04:07<02:15, 135.46s/it]





collector: 100%|██████████| 3/3 [04:53<00:00, 108.40s/it]

save >> data/market_info/market_info_국내_190816.csv







 14%|█▍        | 1/7 [04:53<29:20, 293.37s/it]





collector:   0%|          | 0/9 [00:00<?, ?it/s]





collector:  11%|█         | 1/9 [00:41<05:28, 41.06s/it]





collector:  22%|██▏       | 2/9 [01:26<04:56, 42.36s/it]





collector:  33%|███▎      | 3/9 [02:08<04:14, 42.38s/it]





collector:  44%|████▍     | 4/9 [02:51<03:32, 42.51s/it]





collector:  56%|█████▌    | 5/9 [03:34<02:49, 42.47s/it]





collector:  67%|██████▋   | 6/9 [04:17<02:07, 42.61s/it]





collector:  78%|███████▊  | 7/9 [05:20<01:37, 48.90s/it]





collector:  89%|████████▉ | 8/9 [06:02<00:46, 46.89s/it]





collector: 100%|██████████| 9/9 [06:45<00:00, 45.61s/it]

save >> data/market_info/market_info_해외_190816.csv







 29%|██▊       | 2/7 [11:39<27:15, 327.14s/it]





collector:   0%|          | 0/4 [00:00<?, ?it/s]





collector:  25%|██▌       | 1/4 [00:40<02:02, 40.68s/it]





collector:  50%|█████     | 2/4 [01:20<01:21, 40.54s/it]





collector:  75%|███████▌  | 3/4 [01:59<00:40, 40.01s/it]





collector: 100%|██████████| 4/4 [02:35<00:00, 38.82s/it]

save >> data/market_info/market_info_환율_190816.csv







 43%|████▎     | 3/7 [14:15<18:22, 275.73s/it]





collector:   0%|          | 0/1 [00:00<?, ?it/s]





collector: 100%|██████████| 1/1 [00:32<00:00, 32.90s/it]

save >> data/market_info/market_info_WTI_190816.csv







 57%|█████▋    | 4/7 [14:48<10:08, 202.89s/it]





collector:   0%|          | 0/1 [00:00<?, ?it/s]





collector: 100%|██████████| 1/1 [00:33<00:00, 33.15s/it]

save >> data/market_info/market_info_휘발유_190816.csv







 71%|███████▏  | 5/7 [15:21<05:03, 151.98s/it]





collector:   0%|          | 0/1 [00:00<?, ?it/s]





collector: 100%|██████████| 1/1 [00:36<00:00, 36.44s/it]

save >> data/market_info/market_info_국제금_190816.csv







 86%|████████▌ | 6/7 [15:57<01:57, 117.33s/it]





collector:   0%|          | 0/1 [00:00<?, ?it/s]





collector: 100%|██████████| 1/1 [01:13<00:00, 73.81s/it]

save >> data/market_info/market_info_국내금_190816.csv







100%|██████████| 7/7 [17:11<00:00, 104.29s/it]


실행시간: 1031.56 sec


In [23]:
# check

In [49]:
# List the output folder to confirm that one CSV per category was written.
os.listdir('data/market_info')

['market_info_WTI_190815.csv',
 'market_info_WTI_190815_update.csv',
 'market_info_WTI_190816.csv',
 'market_info_국내_190815.csv',
 'market_info_국내_190815_update.csv',
 'market_info_국내_190816.csv',
 'market_info_국내금_190815.csv',
 'market_info_국내금_190815_update.csv',
 'market_info_국내금_190816.csv',
 'market_info_국제금_190815.csv',
 'market_info_국제금_190815_update.csv',
 'market_info_국제금_190816.csv',
 'market_info_해외_190815.csv',
 'market_info_해외_190815_update.csv',
 'market_info_해외_190816.csv',
 'market_info_환율_190815.csv',
 'market_info_환율_190815_update.csv',
 'market_info_환율_190816.csv',
 'market_info_휘발유_190815.csv',
 'market_info_휘발유_190815_update.csv',
 'market_info_휘발유_190816.csv']