In [11]:
!pip install --upgrade html5lib==0.9999999
!pip install OpenDartReader
!pip install html_table_parser
!pip install fake_useragent
!pip install pyOpenSSL





In [None]:
# Sanity check: print the compute devices TensorFlow reports on this machine.
import tensorflow as tf
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [18]:
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from itertools import chain

import pandas as pd

import OpenDartReader
from datetime import datetime
import requests
import json
import re
import tensorflow as tf

# Restrict CUDA to the device with index 1.
# NOTE(review): tensorflow is imported earlier in this cell, before this
# variable is set - confirm the setting still takes effect at this point.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# Replace the default HTTPS context factory with the unverified one, i.e.
# certificate verification is switched off for code using the default context.
# NOTE(review): this is a security trade-off - confirm it is still required.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')


# A randomly chosen browser User-Agent string produced by fake_useragent.
USER_AGENT = UserAgent().random

# The OpenDART API key is a credential, so it is read from the environment
# instead of being stored in the notebook source.
api_key = os.environ.get('DART_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the DART_API_KEY environment variable to your OpenDART API key."
    )
dart = OpenDartReader(api_key)

# JSON file that accumulates one entry per processed company.
output_path = './dart_stock_noTable.json'


def dart_list(api_key, corp_code='', start=None, end=None, kind='', kind_detail='', final=False):
    """Fetch the disclosure list from the OpenDART ``list.json`` endpoint.

    Args:
        api_key: OpenDART API key sent as ``crtfc_key``.
        corp_code: Company code sent as ``corp_code`` ('' sends an empty value).
        start: Start date (anything ``pd.to_datetime`` accepts); defaults to
            1900-01-01 when falsy.
        end: End date; defaults to today when falsy.
        kind: Optional disclosure type, sent as ``pblntf_ty`` ('A' = periodic
            disclosure).
        kind_detail: Optional detailed disclosure type, sent as
            ``pblntf_detail_ty``.
        final: When True, ``last_reprt_at`` is sent as 'Y', otherwise 'N'.

    Returns:
        A DataFrame with one row per disclosure across all result pages and a
        unique 0..n-1 index. An empty DataFrame is returned when the first
        response has no ``'list'`` key.
    """
    start = pd.to_datetime(start) if start else pd.to_datetime('1900-01-01')
    end = pd.to_datetime(end) if end else datetime.today()

    url = 'https://opendart.fss.or.kr/api/list.json'
    params = {
        'crtfc_key': api_key,
        'corp_code': corp_code,
        'bgn_de': start.strftime('%Y%m%d'),
        'end_de': end.strftime('%Y%m%d'),
        'last_reprt_at': 'Y' if final else 'N',  # final reports only?
        'page_no': 1,
        'page_count': 100,
    }
    if kind:
        params['pblntf_ty'] = kind  # disclosure type, 'A' = periodic disclosure
    if kind_detail:
        params['pblntf_detail_ty'] = kind_detail

    first_page = requests.get(url, params=params).json()
    if 'list' not in first_page:
        return pd.DataFrame()

    # Collect every page and concatenate once; concatenating inside the loop
    # copies the accumulated frame on every iteration.
    pages = [pd.DataFrame(first_page['list'])]
    for page_no in range(2, first_page.get('total_page', 1) + 1):
        params['page_no'] = page_no
        payload = requests.get(url, params=params).json()
        if 'list' not in payload:
            # Same tolerance as for the first page: a response without rows
            # ends the paging instead of raising KeyError.
            break
        pages.append(pd.DataFrame(payload['list']))

    # ignore_index gives unique positional labels; without it every page
    # restarts at 0 and label-based lookups such as df[col][i] become ambiguous.
    return pd.concat(pages, ignore_index=True)


def make_xml_text(rcept_no):
    """Return the document text for a receipt number, or "" on failure.

    The text is fetched with ``dart.document(rcept_no)``. Any ordinary error
    raised by that call is treated as "no document available" and yields an
    empty string, so a single bad filing does not stop a long run.

    Args:
        rcept_no: Receipt number passed through to ``dart.document``.

    Returns:
        Whatever ``dart.document`` returns, or "" if it raised an Exception.
    """
    try:
        return dart.document(rcept_no)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still propagate so the run can be stopped.
        return ""


def make_text(rcept_no, run_table=False):
    """Extract the paragraph text of one filing as a flat list of lines.

    Args:
        rcept_no: Receipt number of the filing to load via ``make_xml_text``.
        run_table: Accepted for interface compatibility; not used here.

    Returns:
        A list of strings. Each stripped string inside every ``<p>`` element
        has its whitespace runs collapsed to single spaces and is then broken
        after every '다.' occurrence. Tables are removed before extraction.
    """
    raw_xml = make_xml_text(rcept_no)
    # '&cr;' tokens are rewritten to the numeric reference '&#13;' before parsing.
    document = BeautifulSoup(raw_xml.replace('&cr;', '&#13;'), features="lxml")

    # Detach every table so that only running text is left in the tree.
    for table_tag in document.find_all('table'):
        table_tag.extract()

    lines = []
    for paragraph in document.find_all('p'):
        for fragment in paragraph.stripped_strings:
            collapsed = " ".join(re.split(r'\s+', fragment))
            # Re-join on '다.' plus newline, then cut at the newlines: this puts
            # a line break after each sentence ending.
            with_breaks = "다.\n".join(collapsed.split('다.'))
            lines.extend(with_breaks.split('\n'))
    return lines

    
# A line is discarded when any of these alternatives matches somewhere in it.
# Raw strings keep the backslashes literal without invalid-escape warnings.
_BOILERPLATE_PATTERN = re.compile('|'.join((
    r'\-+',
    r'기업공시서식',
    r'해당사항',
    r'다음과 같습니다',
    r'참조',
    r'참고',
    r'^[가-힣]\)',
    r'^[가-힣][0-9]\)',
    r'^[^가-힣0-9a-zA-Z\%\&]',
    r'^[0-9]\.',
    r'^[0-9][0-9]\.',
    r'^[0-9]\)',
    r'^[0-9][0-9]\)',
    r'^[0-9]-',
    r'^[0-9][0-9]\-',
    r'^[가-힣]\.',
    r'^[가-힣]\-[0-9]',
)))


def data_preprocessing(texts):
    """Filter and normalise the lines of one report into a single string.

    Steps, in order:
      1. strip each line;
      2. drop lines matching the boilerplate pattern (stop phrases, hyphens,
         list-marker prefixes, lines starting with another symbol);
      3. drop duplicate and empty lines, keeping the first occurrence;
      4. collapse whitespace runs and keep only lines with more than 3 words.

    Args:
        texts: Iterable of strings (may be empty).

    Returns:
        The surviving lines joined with '\\n'; "" when nothing survives.
    """
    stripped = (text.strip() for text in texts)
    kept = [line for line in stripped if not _BOILERPLATE_PATTERN.search(line)]

    cleaned = []
    # dict.fromkeys removes duplicates while preserving first-seen order.
    for line in dict.fromkeys(kept):
        if not line:
            continue
        normalized = ' '.join(line.split())
        if len(normalized.split()) > 3:
            cleaned.append(normalized)

    return "\n".join(cleaned)


def make_disclosure_reports(disclosure, run_table=False):
    """Build the cleaned text of every filing listed in ``disclosure``.

    Args:
        disclosure: Table with 'report_nm' and 'rcept_no' columns, one row per
            filing. Rows are read in order; the index labels are irrelevant.
        run_table: Forwarded to ``make_text``.

    Returns:
        A list of dicts with keys 'report_nm', 'rcept_no' and 'text'. Filings
        for which ``make_text`` returns nothing are skipped. 'text' is the
        string produced by ``data_preprocessing``, or "" when no line is left.
    """
    reports = []
    # Iterate over the column values directly: label lookups such as
    # disclosure['rcept_no'][i] break when the index is not exactly 0..n-1.
    for report_nm, rcept_no in zip(disclosure['report_nm'], disclosure['rcept_no']):
        text = make_text(rcept_no, run_table)
        if text:
            reports.append({
                "report_nm": report_nm,
                "rcept_no": rcept_no,
                "text": text
            })

    if not reports:
        return reports

    # Remove lines shared by ALL reports (cross-report duplicates). With a
    # single report there is nothing to compare against; intersecting it with
    # itself would wipe out its entire text.
    if len(reports) > 1:
        shared = set(reports[0]['text'])
        for report in reports[1:]:
            shared &= set(report['text'])

        for report in reports:
            report['text'] = [line for line in report['text'] if line not in shared]

    # Per-report preprocessing; an emptied report gets "" as its text.
    for report in reports:
        report['text'] = data_preprocessing(report['text']) if report['text'] else ""

    return reports
    

    
    
def save_output(corp_code, corp_name, disclosure_report):
    """Append one company entry to the JSON array stored at ``output_path``.

    The file must already contain a JSON list. It is read, extended by one
    dict and rewritten in place.

    Args:
        corp_code: Stored under the 'corp_code' key.
        corp_name: Stored under the 'corp_name' key.
        disclosure_report: Stored under the 'disclosure' key.
    """
    # The context manager guarantees the handle is closed (and the data
    # flushed) even if parsing or serialisation fails.
    with open(output_path, "r+", encoding='utf-8') as output_file:
        output = json.load(output_file)
        output.append({
            "corp_code": corp_code,
            "corp_name": corp_name,
            "disclosure": disclosure_report
        })
        # Rewind, overwrite from the start, then cut off any leftover bytes.
        output_file.seek(0)
        json.dump(output, output_file, ensure_ascii=False, indent=4)
        output_file.truncate()

    
    
def make_result(dart_stock):
    """Process every company in ``dart_stock`` and persist its reports.

    For each (corp_code, corp_name) pair a progress line is printed, the
    final periodic disclosures ('A') since 2020-01-01 are listed, turned into
    reports and appended to the output file through ``save_output``.

    Args:
        dart_stock: Table with 'corp_code' and 'corp_name' columns.

    Returns:
        Always the empty string.
    """
    total = len(dart_stock)
    companies = zip(dart_stock['corp_code'], dart_stock['corp_name'])

    for position, (corp_code, corp_name) in enumerate(companies):
        print(position, '/', total)

        filings = dart_list(api_key=api_key, corp_code=corp_code, start='2020-01-01', kind='A', final=True)

        # No filings means there is nothing to parse: store an empty list.
        if filings.empty:
            company_reports = []
        else:
            company_reports = make_disclosure_reports(filings, run_table=False)

        save_output(corp_code, corp_name, disclosure_report=company_reports)

    return ""



def main():
    """Run the whole collection: reset the output file, process, report time.

    The output file at ``output_path`` is overwritten with an empty JSON
    array, every row of ``dart.corp_codes`` whose 'stock_code' is not a single
    space is handed to ``make_result``, and the start time and elapsed time
    are printed.
    """
    # Start from an empty JSON list so save_output can append to it.
    with open(output_path, "w", encoding='utf-8') as fresh_file:
        fresh_file.write("[\n]")

    started_at = datetime.now()
    print(started_at)

    # Keep rows with a non-blank stock code (presumably the listed companies)
    # and renumber them; reset_index keeps the old labels in an 'index' column.
    has_stock_code = dart.corp_codes['stock_code'] != ' '
    selected = dart.corp_codes[has_stock_code].reset_index()

    make_result(selected)

    print(datetime.now() - started_at)

    
# Entry point: run the collection when this cell/script is executed directly.
# NOTE(review): main() and the helpers defined in this file make no TensorFlow
# calls, so the device scope below looks unnecessary. CUDA_VISIBLE_DEVICES is
# also set to '1' earlier in this cell - confirm a "GPU:1" device exists here.
if __name__ == '__main__':       
    with tf.device("/device:GPU:1"):
        main()



2023-02-14 23:10:04.593008
0 / 3542
1 / 3542
2 / 3542
3 / 3542
4 / 3542
5 / 3542
6 / 3542
7 / 3542
8 / 3542
9 / 3542
10 / 3542
11 / 3542
12 / 3542
13 / 3542
14 / 3542
15 / 3542
16 / 3542
17 / 3542
18 / 3542
19 / 3542
20 / 3542
21 / 3542
22 / 3542
23 / 3542
24 / 3542
25 / 3542
26 / 3542
27 / 3542
28 / 3542
29 / 3542
30 / 3542
31 / 3542
32 / 3542
33 / 3542
34 / 3542
35 / 3542
36 / 3542
37 / 3542
38 / 3542
39 / 3542
40 / 3542
41 / 3542
42 / 3542
43 / 3542
44 / 3542
45 / 3542
46 / 3542
47 / 3542
48 / 3542
49 / 3542
50 / 3542
51 / 3542
52 / 3542
53 / 3542
54 / 3542
55 / 3542
56 / 3542
57 / 3542
58 / 3542
59 / 3542
60 / 3542
61 / 3542
62 / 3542
63 / 3542
64 / 3542
65 / 3542
66 / 3542
67 / 3542
68 / 3542
69 / 3542
70 / 3542
71 / 3542
72 / 3542
73 / 3542
74 / 3542
75 / 3542
76 / 3542
77 / 3542
78 / 3542
79 / 3542
80 / 3542
81 / 3542
82 / 3542
83 / 3542
84 / 3542
85 / 3542
86 / 3542
87 / 3542
88 / 3542
89 / 3542
90 / 3542
91 / 3542
92 / 3542
93 / 3542
94 / 3542
95 / 3542
96 / 3542
97 / 3542
98 

753 / 3542
754 / 3542
755 / 3542
756 / 3542
757 / 3542
758 / 3542
759 / 3542
760 / 3542
761 / 3542
762 / 3542
763 / 3542
764 / 3542
765 / 3542
766 / 3542
767 / 3542
768 / 3542
769 / 3542
770 / 3542
771 / 3542
772 / 3542
773 / 3542
774 / 3542
775 / 3542
776 / 3542
777 / 3542
778 / 3542
779 / 3542
780 / 3542
781 / 3542
782 / 3542
783 / 3542
784 / 3542
785 / 3542
786 / 3542
787 / 3542
788 / 3542
789 / 3542
790 / 3542
791 / 3542
792 / 3542
793 / 3542
794 / 3542
795 / 3542
796 / 3542
797 / 3542
798 / 3542
799 / 3542
800 / 3542
801 / 3542
802 / 3542
803 / 3542
804 / 3542
805 / 3542
806 / 3542
807 / 3542
808 / 3542
809 / 3542
810 / 3542
811 / 3542
812 / 3542
813 / 3542
814 / 3542
815 / 3542
816 / 3542
817 / 3542
818 / 3542
819 / 3542
820 / 3542
821 / 3542
822 / 3542
823 / 3542
824 / 3542
825 / 3542
826 / 3542
827 / 3542
828 / 3542
829 / 3542
830 / 3542
831 / 3542
832 / 3542
833 / 3542
834 / 3542
835 / 3542
836 / 3542
837 / 3542
838 / 3542
839 / 3542
840 / 3542
841 / 3542
842 / 3542
843 / 3542

1457 / 3542
1458 / 3542
1459 / 3542
1460 / 3542
1461 / 3542
1462 / 3542
1463 / 3542
1464 / 3542
1465 / 3542
1466 / 3542
1467 / 3542
1468 / 3542
1469 / 3542
1470 / 3542
1471 / 3542
1472 / 3542
1473 / 3542
1474 / 3542
1475 / 3542
1476 / 3542
1477 / 3542
1478 / 3542
1479 / 3542
1480 / 3542
1481 / 3542
1482 / 3542
1483 / 3542
1484 / 3542
1485 / 3542
1486 / 3542
1487 / 3542
1488 / 3542
1489 / 3542
1490 / 3542
1491 / 3542
1492 / 3542
1493 / 3542
1494 / 3542
1495 / 3542
1496 / 3542
1497 / 3542
1498 / 3542
1499 / 3542
1500 / 3542
1501 / 3542
1502 / 3542
1503 / 3542
1504 / 3542
1505 / 3542
1506 / 3542
1507 / 3542
1508 / 3542
1509 / 3542
1510 / 3542
1511 / 3542
1512 / 3542
1513 / 3542
1514 / 3542
1515 / 3542
1516 / 3542
1517 / 3542
1518 / 3542
1519 / 3542
1520 / 3542
1521 / 3542
1522 / 3542
1523 / 3542
1524 / 3542
1525 / 3542
1526 / 3542
1527 / 3542
1528 / 3542
1529 / 3542
1530 / 3542
1531 / 3542
1532 / 3542
1533 / 3542
1534 / 3542
1535 / 3542
1536 / 3542
1537 / 3542
1538 / 3542
1539 / 3542
1540

2140 / 3542
2141 / 3542
2142 / 3542
2143 / 3542
2144 / 3542
2145 / 3542
2146 / 3542
2147 / 3542
2148 / 3542
2149 / 3542
2150 / 3542
2151 / 3542
2152 / 3542
2153 / 3542
2154 / 3542
2155 / 3542
2156 / 3542
2157 / 3542
2158 / 3542
2159 / 3542
2160 / 3542
2161 / 3542
2162 / 3542
2163 / 3542
2164 / 3542
2165 / 3542
2166 / 3542
2167 / 3542
2168 / 3542
2169 / 3542
2170 / 3542
2171 / 3542
2172 / 3542
2173 / 3542
2174 / 3542
2175 / 3542
2176 / 3542
2177 / 3542
2178 / 3542
2179 / 3542
2180 / 3542
2181 / 3542
2182 / 3542
2183 / 3542
2184 / 3542
2185 / 3542
2186 / 3542
2187 / 3542
2188 / 3542
2189 / 3542
2190 / 3542
2191 / 3542
2192 / 3542
2193 / 3542
2194 / 3542
2195 / 3542
2196 / 3542
2197 / 3542
2198 / 3542
2199 / 3542
2200 / 3542
2201 / 3542
2202 / 3542
2203 / 3542
2204 / 3542
2205 / 3542
2206 / 3542
2207 / 3542
2208 / 3542
2209 / 3542
2210 / 3542
2211 / 3542
2212 / 3542
2213 / 3542
2214 / 3542
2215 / 3542
2216 / 3542
2217 / 3542
2218 / 3542
2219 / 3542
2220 / 3542
2221 / 3542
2222 / 3542
2223

2823 / 3542
2824 / 3542
2825 / 3542
2826 / 3542
2827 / 3542
2828 / 3542
2829 / 3542
2830 / 3542
2831 / 3542
2832 / 3542
2833 / 3542
2834 / 3542
2835 / 3542
2836 / 3542
2837 / 3542
2838 / 3542
2839 / 3542
2840 / 3542
2841 / 3542
2842 / 3542
2843 / 3542
2844 / 3542
2845 / 3542
2846 / 3542
2847 / 3542
2848 / 3542
2849 / 3542
2850 / 3542
2851 / 3542
2852 / 3542
2853 / 3542
2854 / 3542
2855 / 3542
2856 / 3542
2857 / 3542
2858 / 3542
2859 / 3542
2860 / 3542
2861 / 3542
2862 / 3542
2863 / 3542
2864 / 3542
2865 / 3542
2866 / 3542
2867 / 3542
2868 / 3542
2869 / 3542
2870 / 3542
2871 / 3542
2872 / 3542
2873 / 3542
2874 / 3542
2875 / 3542
2876 / 3542
2877 / 3542
2878 / 3542
2879 / 3542
2880 / 3542
2881 / 3542
2882 / 3542
2883 / 3542
2884 / 3542
2885 / 3542
2886 / 3542
2887 / 3542
2888 / 3542
2889 / 3542
2890 / 3542
2891 / 3542
2892 / 3542
2893 / 3542
2894 / 3542
2895 / 3542
2896 / 3542
2897 / 3542
2898 / 3542
2899 / 3542
2900 / 3542
2901 / 3542
2902 / 3542
2903 / 3542
2904 / 3542
2905 / 3542
2906

3506 / 3542
3507 / 3542
3508 / 3542
3509 / 3542
3510 / 3542
3511 / 3542
3512 / 3542
3513 / 3542
3514 / 3542
3515 / 3542
3516 / 3542
3517 / 3542
3518 / 3542
3519 / 3542
3520 / 3542
3521 / 3542
3522 / 3542
3523 / 3542
3524 / 3542
3525 / 3542
3526 / 3542
3527 / 3542
3528 / 3542
3529 / 3542
3530 / 3542
3531 / 3542
3532 / 3542
3533 / 3542
3534 / 3542
3535 / 3542
3536 / 3542
3537 / 3542
3538 / 3542
3539 / 3542
3540 / 3542
3541 / 3542
11:30:14.982495
