### Code Hist.

 - CODE : Crawling - 한국환경공단 대기오염 (KECO Airkorea) 관측정보

 - DESC  
    &ensp; 1) Open API 사용 불가.  
    &emsp;&ensp;&ensp; 특히, 시간별 통계자료는 Air Korea에서 직접 다운로드  
    &emsp;&ensp;&ensp; (https://www.airkorea.or.kr/web/last_amb_hour_data)  
    &ensp; 2) 결측치가 너무 많아서 KECO_AIR Korea 데이터 사용 고려중  
    
  - DATE  
    &ensp; 2023-05-26 Created  
    &ensp; 2023-11-27 Lab Updated  
    &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; 1) 기존 Template에 맞게 통일  
    &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; 2) 공통코드 사용 (.ipynb)  

# 01. Code

## 01-01. Init

### 01-01-01. Init_Module Import

In [2]:
#region Basic_Import
## Basic
import os, sys, warnings
os.path.dirname(os.path.abspath('./__file__'))
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('./__file__'))))
warnings.filterwarnings('ignore')

import numpy as np, pandas as pd
from pandas import DataFrame, Series
pd.options.display.float_format = '{:.10f}'.format

## Datetime
import time, datetime as dt
from datetime import datetime, date, timedelta

## glob
import glob, requests, json
from glob import glob

from scipy import stats

## Excel/CSV
import openpyxl, xlrd

import urllib
from urllib.request import urlopen
from urllib.parse import urlencode, unquote, quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

## 시각화
import seaborn as sns, matplotlib.pyplot as plt
# %matplotlib inline
plt.rcParams['figure.figsize'] = [10, 8]

#endregion Basic_Import

In [3]:
## Import_DL
# Selects which deep-learning framework this notebook imports.
# Accepted keywords are "torch" and "tf"; the value is hard-coded here.
# NOTE(review): the seed-configuration cell calls `tf.random.set_seed`, so the
# notebook as written needs the "tf" branch to have run.
str_tar = "tf"
## For Torch
if str_tar == "torch":
    import torch, torch.nn as nn
    from torch.nn.utils import weight_norm
    print("Torch Imported")
## For TF
elif str_tar == "tf":
    import tensorflow as tf, tensorflow_addons as tfa
    print("Tensorflow Imported")
# Any other keyword: nothing is imported and only a message is printed
# (no exception is raised, so execution continues without a DL framework).
else:
    print("Error : Cannot be used except for Keywords")
    print(" : torch / tf")

Tensorflow Imported


In [4]:
## Import_Local
from Src_Dev_Common import Data_Datetime as com_date, KMA_Weather as com_KMA, KECO_AirKor as com_KECO, KASI_Holiday as com_Holi

### 01-01-02. Config (Directory, Params)

In [5]:
## Init_config
# Seed every random number generator the notebook can touch so that a
# Restart & Run All reproduces the same results.
import random  # local import: the notebook's import cell does not provide `random`

SEED = 42

random.seed(SEED)         # Python standard-library RNG (was left unseeded before)
np.random.seed(SEED)      # NumPy legacy global RNG
tf.random.set_seed(SEED)  # TensorFlow global seed

# NOTE: PYTHONHASHSEED is read when the interpreter starts, so assigning it here
# does not change hashing in the running kernel; it is only inherited by child
# processes. Export it before launching Jupyter if hash determinism is required.
os.environ["PYTHONHASHSEED"] = str(SEED)
# Asks TensorFlow for deterministic op implementations.
# NOTE(review): confirm the installed TensorFlow version still honours this variable.
os.environ['TF_DETERMINISTIC_OPS'] = "1"

In [6]:
## Define Todate str
# Take ONE snapshot of the current time and derive every field from it, so the
# year/month/day/hour/minute values can never disagree across a minute or day
# boundary (the previous version called now() six separate times).
# `pd.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
# standard-library `datetime` class (imported in the import cell) is used instead.
now_ts = datetime.now()

# Despite the `str_` prefix these hold a `date` object and plain ints;
# they are converted to text only when printed below.
str_now_ymd = now_ts.date()
str_now_y, str_now_m, str_now_d = now_ts.year, now_ts.month, now_ts.day
str_now_hr, str_now_min = now_ts.hour, now_ts.minute

print(now_ts)
print(str(str_now_y) + " / " + str(str_now_m)  + " / " + str(str_now_d))
print(str(str_now_hr) + " : " + str(str_now_min))

2024-07-19 19:02:02.236909
2024 / 7 / 19
19 : 2


In [7]:
# Define data root directory
# str_dir_Airkor_HR : root folder; the monthly files are written here by the
#                     conversion cell.
# str_dir_Quarter   : folder holding the downloaded quarterly source files.
str_dir_Airkor_HR = "../data_KECO_Airkor_HR/"
str_dir_Quarter = "../data_KECO_Airkor_HR/file_Quarter/"
print(os.listdir(str_dir_Quarter))

# Collect every file whose name contains "Q" (e.g. "KECO_AIRKOR_2010-1Q.xlsx").
# glob() gives no ordering guarantee, so sort to process the quarters in a
# deterministic, chronological-by-name order on every platform.
list_file_quarter = sorted(glob(str_dir_Quarter + "*Q*"))
print(len(list_file_quarter))
list_file_quarter

['KECO_AIRKOR_2010-1Q.xlsx', 'KECO_AIRKOR_2010-2Q.xlsx', 'KECO_AIRKOR_2010-3Q.xlsx', 'KECO_AIRKOR_2010-4Q.xlsx', 'KECO_AIRKOR_2011-1Q.xlsx', 'KECO_AIRKOR_2011-2Q.xlsx', 'KECO_AIRKOR_2011-3Q.xlsx', 'KECO_AIRKOR_2011-4Q.xlsx', 'KECO_AIRKOR_2012-1Q.xlsx', 'KECO_AIRKOR_2012-2Q.xlsx', 'KECO_AIRKOR_2012-3Q.xlsx', 'KECO_AIRKOR_2012-4Q.xlsx', 'KECO_AIRKOR_2013-1Q.xlsx', 'KECO_AIRKOR_2013-2Q.xlsx', 'KECO_AIRKOR_2013-3Q.xlsx', 'KECO_AIRKOR_2013-4Q.xlsx', 'KECO_AIRKOR_2014-1Q.csv', 'KECO_AIRKOR_2014-2Q.csv', 'KECO_AIRKOR_2014-3Q.csv', 'KECO_AIRKOR_2014-4Q.csv', 'KECO_AIRKOR_2015-1Q.csv', 'KECO_AIRKOR_2015-2Q.csv', 'KECO_AIRKOR_2015-3Q.csv', 'KECO_AIRKOR_2015-4Q.csv', 'KECO_AIRKOR_2016-1Q.csv', 'KECO_AIRKOR_2016-2Q.csv', 'KECO_AIRKOR_2016-3Q.csv', 'KECO_AIRKOR_2016-4Q.csv', 'KECO_AIRKOR_2018-1Q.xlsx', 'KECO_AIRKOR_2018-2Q.xlsx', 'KECO_AIRKOR_2018-3Q.xlsx', 'KECO_AIRKOR_2018-4Q.xlsx']
32


['../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2010-1Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2010-2Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2010-3Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2010-4Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2011-1Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2011-2Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2011-3Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2011-4Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2012-1Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2012-2Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2012-3Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2012-4Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2013-1Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2013-2Q.xlsx',
 '../data_KECO_Airkor_HR/file_Quarter\\KECO_AIRKOR_2013-3Q.xlsx',
 '../data_

## 01-02. Data Load (df_raw)

In [12]:
## Convert quarterly Xlsx/CSV files into monthly CSV files
# For every quarterly source file:
#   1) load it (Excel or CSV) and normalise the column names,
#   2) rebuild YEAR / MONTH / DAY / HOUR from the text-like METER_DATE value,
#   3) rebuild METER_DATE as a datetime and keep the common column set,
#   4) write one file per month found in the quarter.
for path_quarter in list_file_quarter:
    # Split ".../KECO_AIRKOR_2010-1Q.xlsx" into stem and extension.
    # Fixed-width slicing ([:-5]) was only correct for ".xlsx"; for ".csv" it cut
    # the trailing "Q" off the name, so the CSV path being read did not exist.
    str_file, str_ext = os.path.splitext(os.path.basename(str(path_quarter)))
    str_execute = str_ext.lower().lstrip('.')
    print(str_file)

    ## Excel file
    if str_execute == 'xlsx':
        print('xlsx')
        Data_KECO_AIR_tmp = pd.read_excel(path_quarter)  # , sheet_name="Sheet1", engine='openpyxl')
    ## Anything that is not an Excel file is read as CSV
    else:
        print(str_file)
        try:
            Data_KECO_AIR_tmp = pd.read_csv(path_quarter, encoding='cp949')
        except UnicodeDecodeError:
            # Some files are not cp949 ("'cp949' codec can't decode byte 0xa7 ...");
            # retry them as UTF-8.
            Data_KECO_AIR_tmp = pd.read_csv(path_quarter, encoding='utf-8')

    # Column renaming is identical for both formats, so it is done once here.
    Data_KECO_AIR_tmp = com_KECO.Rename_KECO_AirKor(Data_KECO_AIR_tmp).reset_index()

    ## Restore the time fields from the text form of METER_DATE (sliced as YYYYMMDDHH).
    # Vectorised: the previous row-by-row `df[col].iloc[i] = ...` was a chained
    # assignment (very slow, and it does not write through under pandas
    # copy-on-write) and also reused the outer loop index `i`.
    ser_meter_date = Data_KECO_AIR_tmp['METER_DATE'].astype(str)
    Data_KECO_AIR_tmp['YEAR'] = ser_meter_date.str[:4].astype('int64')
    Data_KECO_AIR_tmp['MONTH'] = ser_meter_date.str[4:6].astype('int64')
    Data_KECO_AIR_tmp['DAY'] = ser_meter_date.str[6:8].astype('int64')
    # NOTE(review): the source hour is shifted down by one -- presumably the raw
    # data labels hours 01..24 and 0..23 is wanted; confirm against the data spec.
    Data_KECO_AIR_tmp['HOUR'] = ser_meter_date.str[8:10].astype('int64') - 1

    ## Build the METER_DATE column (datetime) from the extracted Y/M/D/H
    Data_KECO_AIR_tmp = com_date.create_col_datetime(Data_KECO_AIR_tmp, 'METER_DATE', 'YEAR', 'MONTH', 'DAY', 'HOUR')
    ## Re-derive the Y/M/D/H/M columns from METER_DATE and keep only the common columns
    Data_KECO_AIR_tmp = com_date.create_col_ymdhm(Data_KECO_AIR_tmp, 'METER_DATE')[['METER_DATE'
                                                                                , 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE'
                                                                                , 'REGION', 'CD_OBSERVATORY', 'NM_OBSERVATORY'
                                                                                , 'SO2', 'CO', 'O3', 'NO2', 'PM10']]
    ## Months contained in the current quarter file
    list_month = list(Data_KECO_AIR_tmp['MONTH'].drop_duplicates())

    # "KECO_AIRKOR_2010-1Q" -> "KECO_AIRKOR_2010-". The prefix must come from the
    # CURRENT file: it used to be taken from list_file_quarter[0], so every
    # quarter overwrote the first year's monthly files.
    # Assumes the stem ends with "<digit>Q", as all listed source files do.
    str_prefix_month = str_file[:-2]

    for tar_month in list_month:
        print(tar_month)
        # int() accepts both numeric and text month values; a bare .zfill() call
        # raises AttributeError when MONTH is numeric.
        str_file_month = str_prefix_month + str(int(tar_month)).zfill(2)
        ## Keep only the rows of this month
        df_tar_month = Data_KECO_AIR_tmp[Data_KECO_AIR_tmp['MONTH'] == tar_month]
        ## csv Export
        # NOTE(review): the file is written without a ".csv" extension, exactly as
        # before; add one here only if no downstream reader depends on the name.
        df_tar_month.to_csv(str_dir_Airkor_HR + str_file_month)

KECO_AIRKOR_2010-1Q
xlsx


: 