In [1]:
import pandas as pd

# 데이터 집계하는 범위 
date_range = pd.date_range(start='2018-01-01', end='2022-06-30', freq='H')

df_all = pd.DataFrame(date_range, columns=['date'])

df_all['Year'] = df_all['date'].dt.year
df_all['Month'] = df_all['date'].dt.month
df_all['Day'] = df_all['date'].dt.day
df_all['UTCHour'] = df_all['date'].dt.hour
df_all = df_all.drop('date', axis=1)

df_all.set_index(['Year','Month','Day','UTCHour'])

df_all.head()

Unnamed: 0,Year,Month,Day,UTCHour
0,2018,1,1,0
1,2018,1,1,1
2,2018,1,1,2
3,2018,1,1,3
4,2018,1,1,4


## 서울, 베이징 미세먼지 데이터

In [2]:
df_seoul_pm_2008_2011 = pd.read_csv('./rawfiles/seoul-pm/2008-2011.csv', encoding='cp949')
df_seoul_pm_2012_2015 = pd.read_csv('./rawfiles/seoul-pm/2012-2015.csv', encoding='cp949')
df_seoul_pm_2016_2019 = pd.read_csv('./rawfiles/seoul-pm/2016-2019.csv', encoding='cp949')
df_seoul_pm_2020_2021 = pd.read_csv('./rawfiles/seoul-pm/2020-2021.csv', encoding='cp949')
df_seoul_pm_2022 = pd.read_csv('./rawfiles/seoul-pm/2022.csv', encoding='cp949')

df_seoul_pm = pd.concat([
    df_seoul_pm_2008_2011, df_seoul_pm_2012_2015, df_seoul_pm_2016_2019, df_seoul_pm_2020_2021, df_seoul_pm_2022
], axis=0)

df_seoul_pm = df_seoul_pm.groupby('일시').first() # 지역별 평균 값만 담는다.

In [3]:
df_seoul_pm['Year'] = df_seoul_pm.index.str[:4].astype('int')
df_seoul_pm['Month'] = df_seoul_pm.index.str[5:7].astype('int')
df_seoul_pm['Day'] = df_seoul_pm.index.str[8:10].astype('int')
df_seoul_pm['UTCHour'] = df_seoul_pm.index.str.extract(r' (\d+):', expand=False).astype('int')

In [4]:
df_seoul_pm = df_seoul_pm.drop('구분', axis=1)

In [5]:
df_seoul_pm['Seoul_PM2.5'] = df_seoul_pm['초미세먼지(PM2.5)'].combine_first(df_seoul_pm['초미세먼지(PM25)'])
df_seoul_pm = df_seoul_pm.drop('초미세먼지(PM2.5)', axis=1)
df_seoul_pm = df_seoul_pm.drop('초미세먼지(PM25)', axis=1)
df_seoul_pm = df_seoul_pm.rename(columns={'미세먼지(PM10)': 'Seoul_PM10'})

In [6]:
df_seoul_pm

Unnamed: 0_level_0,Seoul_PM10,Year,Month,Day,UTCHour,Seoul_PM2.5
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-01-01 10:00,30.0,2008,1,1,10,11.0
2008-01-01 11:00,29.0,2008,1,1,11,13.0
2008-01-01 12:00,29.0,2008,1,1,12,12.0
2008-01-01 13:00,28.0,2008,1,1,13,12.0
2008-01-01 14:00,27.0,2008,1,1,14,13.0
...,...,...,...,...,...,...
2022-12-31 5:00,33.0,2022,12,31,5,25.0
2022-12-31 6:00,34.0,2022,12,31,6,25.0
2022-12-31 7:00,34.0,2022,12,31,7,26.0
2022-12-31 8:00,34.0,2022,12,31,8,25.0


In [7]:
# 베이징 미세먼지 데이터
df_beijing_pm25 = pd.read_csv('./rawfiles/beijing-pm.txt', sep='\s+')
df_beijing_pm25 = df_beijing_pm25.drop(['Retrospective', 'PM10_mask'], axis=1)
df_beijing_pm25 = df_beijing_pm25.rename(columns={'PM2.5': 'Beijing_PM2.5'})
df_beijing_pm25 = df_beijing_pm25.groupby(['Year','Month','Day','UTCHour']).mean()

df_beijing_pm10 = pd.read_csv('https://raw.githubusercontent.com/SeojinSeojin/data-storage/main/all_air.csv')
df_beijing_pm10 = df_beijing_pm10.replace(-9999, float("nan"))
df_beijing_pm10 = df_beijing_pm10.rename(columns={'year':'Year', 'month':'Month', 'date':'Day', 'hour':'UTCHour', 'PM10': 'Beijing_PM10'})

In [8]:
df_beijing_pm10 = df_beijing_pm10.loc[ : ,['Year','Month','Day','UTCHour','Beijing_PM10']]

df_beijing_pm = pd.merge(df_beijing_pm25, df_beijing_pm10, on=['Year', 'Month', 'Day', 'UTCHour'])

# 위의 df_all로 left outer join
# left join을 하는 이유: 2016년 12월 10일 1시 데이터가 df_seoul_pm에 없더라도, 모두 NaN으로 입력될 수 있도록
df = pd.merge(df_all, df_seoul_pm, on=['Year', 'Month', 'Day', 'UTCHour'], how='left')
df = pd.merge(df, df_beijing_pm, on=['Year', 'Month', 'Day', 'UTCHour'], how='left')

df[['Seoul_PM2.5', 'Seoul_PM10', 'Beijing_PM2.5', 'Beijing_PM10']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Seoul_PM2.5,39385.0,21.885718,16.906571,1.0,11.0,18.0,28.0,162.0
Seoul_PM10,39385.0,38.494401,30.767995,3.0,21.0,33.0,48.0,868.0
Beijing_PM2.5,36897.0,35.400181,31.434107,2.07,13.39,26.07,45.6,340.52
Beijing_PM10,35642.0,67.376354,89.626953,1.0,29.0,53.0,85.0,6450.0


In [9]:
df

Unnamed: 0,Year,Month,Day,UTCHour,Seoul_PM10,Seoul_PM2.5,Beijing_PM2.5,Beijing_PM10
0,2018,1,1,0,36.0,18.0,25.625,102.0
1,2018,1,1,1,36.0,18.0,33.540,95.0
2,2018,1,1,2,35.0,17.0,45.430,
3,2018,1,1,3,34.0,18.0,58.410,
4,2018,1,1,4,34.0,18.0,68.650,
...,...,...,...,...,...,...,...,...
39380,2022,6,29,20,12.0,6.0,10.010,16.0
39381,2022,6,29,21,9.0,4.0,9.990,18.0
39382,2022,6,29,22,6.0,2.0,9.860,19.0
39383,2022,6,29,23,7.0,3.0,10.500,20.0


In [10]:
# 결측치 값 개수 출력

print('# of null Seoul pm2.5 values : ', df['Seoul_PM2.5'].isna().sum())
print('# of null Seoul pm10 values : ', df['Seoul_PM10'].isna().sum())
print('# of null Beijing pm2.5 values : ', df['Beijing_PM2.5'].isna().sum())
print('# of null Beijing pm10 values : ', df['Beijing_PM10'].isna().sum())

# of null Seoul pm2.5 values :  0
# of null Seoul pm10 values :  0
# of null Beijing pm2.5 values :  2488
# of null Beijing pm10 values :  3743


## 서울 대기질 데이터

In [11]:
sa_all = pd.read_csv("./rawfiles/seoul-air.csv")

In [12]:
sa_all

Unnamed: 0.1,Unnamed: 0,일시,초미세먼지,미세먼지,오존,이산화탄소,일산화탄소,아황산가스
0,0,2018-01-01 01:00,17,31,0.018,0.021,0.4,0.005
1,1,2018-01-01 02:00,16,29,0.019,0.020,0.4,0.005
2,2,2018-01-01 03:00,13,28,0.014,0.025,0.4,0.005
3,3,2018-01-01 04:00,16,33,0.013,0.025,0.4,0.005
4,4,2018-01-01 05:00,17,28,0.012,0.026,0.5,0.004
...,...,...,...,...,...,...,...,...
43795,43795,2022-12-31 20:00,57,68,0.003,0.063,1.3,0.004
43796,43796,2022-12-31 21:00,60,69,0.003,0.064,1.4,0.004
43797,43797,2022-12-31 22:00,59,67,0.003,0.064,1.4,0.004
43798,43798,2022-12-31 23:00,57,65,0.003,0.066,1.5,0.004


In [13]:

sa_all = sa_all.rename(columns={'오존':'Seoul_O3', '이산화탄소':'Seoul_CO2', '일산화탄소':'Seoul_CO', '아황산가스':'Seoul_SO2'})

sa_all[['date', 'time']] = sa_all['일시'].str.split(' ', expand=True)
sa_all['time'] = sa_all['time'].str.replace('24:00', '00:00')
sa_all['datetime'] = pd.to_datetime(sa_all['date'] + ' ' + sa_all['time'], format='%Y-%m-%d %H:%M')

sa_all['Year'] = pd.to_datetime(sa_all['datetime']).dt.year
sa_all['Month'] = pd.to_datetime(sa_all['datetime']).dt.month
sa_all['Day'] = pd.to_datetime(sa_all['datetime']).dt.day
sa_all['UTCHour'] = pd.to_datetime(sa_all['datetime']).dt.hour

sa_all = sa_all.drop(sa_all.columns[0], axis=1)
sa_all = sa_all.drop(['일시','초미세먼지','미세먼지', 'date', 'time', 'datetime'], axis=1)

sa_all.head()

Unnamed: 0,Seoul_O3,Seoul_CO2,Seoul_CO,Seoul_SO2,Year,Month,Day,UTCHour
0,0.018,0.021,0.4,0.005,2018,1,1,1
1,0.019,0.02,0.4,0.005,2018,1,1,2
2,0.014,0.025,0.4,0.005,2018,1,1,3
3,0.013,0.025,0.4,0.005,2018,1,1,4
4,0.012,0.026,0.5,0.004,2018,1,1,5


In [14]:
sa_all.columns

Index(['Seoul_O3', 'Seoul_CO2', 'Seoul_CO', 'Seoul_SO2', 'Year', 'Month',
       'Day', 'UTCHour'],
      dtype='object')

In [15]:
df = pd.merge(df, sa_all, on=['Year', 'Month', 'Day', 'UTCHour'], how='left')

In [16]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,39409.0,2019.774113,1.312217,2018.0,2019.0,2020.0,2021.0,2022.0
Month,39409.0,6.19427,3.437543,1.0,3.0,6.0,9.0,12.0
Day,39409.0,15.698292,8.793777,1.0,8.0,16.0,23.0,31.0
UTCHour,39409.0,11.499708,6.922429,0.0,5.0,11.0,17.0,23.0
Seoul_PM10,39409.0,38.495166,30.759383,3.0,21.0,33.0,48.0,868.0
Seoul_PM2.5,39409.0,21.890025,16.903461,1.0,11.0,18.0,28.0,162.0
Beijing_PM2.5,36921.0,35.38613,31.429246,2.07,13.38,26.06,45.58,340.52
Beijing_PM10,35666.0,67.351119,89.602676,1.0,29.0,53.0,85.0,6450.0
Seoul_O3,39361.0,0.025557,0.018418,0.001,0.011,0.023,0.035,0.163
Seoul_CO2,39361.0,0.026728,0.014714,0.003,0.015,0.023,0.036,0.094


## 화력 발전량 데이터

In [17]:
thermalPower = pd.read_csv("./rawfiles/korea-thermal-power.csv", encoding='cp949')

thermalPower.head(35)

Unnamed: 0,거래일자,지역,연료원,발전량(MWh)
0,2018-01-01,강원도,LNG,7003
1,2018-01-01,강원도,석탄,46702
2,2018-01-01,경기도,LNG,114569
3,2018-01-01,경기도,석탄,1095
4,2018-01-01,경기도,유류,722
5,2018-01-01,경상남도,석탄,143844
6,2018-01-01,경상남도,유류,183
7,2018-01-01,경상북도,LNG,0
8,2018-01-01,경상북도,석탄,2742
9,2018-01-01,광주시,LNG,1784


지역별 발전량으로 합친다.

| 일자 | 강원도_LNG | 강원도_석탄 | 경기도_LNG | 경기도_석탄 | 경기도_유류 | 경상북도_LNG | 경상북도_석탄 | 광주_LNG | 대구_LNG | 대구_석탄 | 대구_유류 | 대전_LNG | 대전_유류 | 부산_LNG | 부산_석탄 | 서울_LNG | 세종_LNG | 울산_LNG | 울산_유류 | 인천_LNG | 인천_석탄 | 전라남도_LNG | 전라남도_석탄 | 전라북도_LNG | 전라북도_석탄 | 전라북도_유류 | 제주도_유류 | 충청남도_LNG | 충청남도_석탄 | 충청남도_유류 | 충청북도_유류 |
| -- | -- | -- | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| date | GW_LNG | GW_Coal | GG_LNG | GG_Coal | GG_유류 | GB_LNG | GB_Coal | Gwangju_LNG | Daegu_LNG | Daegu_Coal | Daegu_Oil | Daejeon_LNG | Daejeon_Oil | Busan_LNG | Busan_Coal | Seoul_LNG | Sejong_LNG | Ulsan_LNG | Ulsan_Oil | Incheon_LNG | Incheon_Coal | JN_LNG | JN_Coal | JB_LNG | JB_Coal | JB_Oil | Jeju_Oil | CN_LNG | CN_Coal | CN_Oil | CB_Oil |

In [18]:
date_range = pd.date_range(start='2018-01-01', end='2022-06-30', freq='D')

th_data_all = pd.DataFrame(date_range, columns=['date'])

dict_location = {'강원도':'GW', '경기도':'GG', '경상북도':'GB','경상남도':'GN', '광주시':'Gwangju', '대구시':'Daegu','대전시':'Daejeon','부산시':'Busan','서울시':'Seoul','세종시':'Sejong','울산시':'Ulsan','인천시':'Incheon','전라남도':'JN','전라북도':'JB','제주도':'Jeju','충청남도':'CN','충청북도':'CB' }
dict_material = {'LNG':'LNG', '석탄':'Coal','유류':'Oil'}

thermalPower = thermalPower.rename(columns={' 발전량(MWh) ':'power'})

dict_all = dict()

for index, row in thermalPower.iterrows():
    time = row['거래일자']
    location = row['지역']
    material = row['연료원']
    power = row['power']
    key = dict_location[location] + "_" + dict_material[material]
    if time in dict_all.keys():
        prevValue = dict_all[time]
        prevValue[key] = power
    else:
        v = dict()
        v[key] = power
        dict_all[time] = v

In [19]:
thermal_power_pd = pd.DataFrame.from_dict(dict_all, orient='index')
thermal_power_pd['date'] = pd.to_datetime(thermal_power_pd.index)
th_data_all['date'] = th_data_all['date']

In [20]:
th_data_all = pd.merge(th_data_all, thermal_power_pd, on=['date'], how='left') 
th_data_all.head()

Unnamed: 0,date,GW_LNG,GW_Coal,GG_LNG,GG_Coal,GG_Oil,GN_Coal,GN_Oil,GB_LNG,GB_Coal,...,JN_Coal,JB_LNG,JB_Coal,JB_Oil,Jeju_Oil,CN_LNG,CN_Coal,CN_Oil,CB_Oil,Jeju_LNG
0,2018-01-01,7003,46702,114569,1095,722,143844,183,0,2742,...,26116,0,19139,404,6834,10867,290788,0,1107,
1,2018-01-02,15290,38744,220946,1303,15178,159881,275,4481,2683,...,28611,7107,19027,402,8006,21904,334588,0,721,
2,2018-01-03,17401,39059,237802,1404,13141,164212,349,8662,2632,...,31074,7873,19035,397,6160,28111,345349,0,322,
3,2018-01-04,17937,39043,248705,1410,17898,164620,288,9033,2636,...,30976,10538,19094,393,7588,32898,340057,0,1128,
4,2018-01-05,16957,39233,241342,1406,14630,164327,323,8518,2649,...,31193,10080,19138,402,7255,27683,348762,0,1153,


In [21]:
th_data_all['Year'] = pd.to_datetime(th_data_all['date']).dt.year
th_data_all['Month'] = pd.to_datetime(th_data_all['date']).dt.month
th_data_all['Day'] = pd.to_datetime(th_data_all['date']).dt.day
th_data_all['UTCHour'] = 0 # 일별 단위이므로 시간은 일단 0으로 밀어넣는다.

In [22]:
th_data_all = th_data_all.drop('date', axis=1)

In [23]:
df = pd.merge(df, th_data_all, on=['Year', 'Month', 'Day', 'UTCHour'], how='left')

In [24]:
df.head(25)

Unnamed: 0,Year,Month,Day,UTCHour,Seoul_PM10,Seoul_PM2.5,Beijing_PM2.5,Beijing_PM10,Seoul_O3,Seoul_CO2,...,JN_Coal,JB_LNG,JB_Coal,JB_Oil,Jeju_Oil,CN_LNG,CN_Coal,CN_Oil,CB_Oil,Jeju_LNG
0,2018,1,1,0,36.0,18.0,25.625,102.0,0.002,0.05,...,26116.0,0.0,19139.0,404.0,6834.0,10867.0,290788.0,0.0,1107.0,
1,2018,1,1,1,36.0,18.0,33.54,95.0,0.018,0.021,...,,,,,,,,,,
2,2018,1,1,2,35.0,17.0,45.43,,0.019,0.02,...,,,,,,,,,,
3,2018,1,1,3,34.0,18.0,58.41,,0.014,0.025,...,,,,,,,,,,
4,2018,1,1,4,34.0,18.0,68.65,,0.013,0.025,...,,,,,,,,,,
5,2018,1,1,5,34.0,17.0,66.52,,0.012,0.026,...,,,,,,,,,,
6,2018,1,1,6,34.0,19.0,59.72,,0.01,0.028,...,,,,,,,,,,
7,2018,1,1,7,35.0,19.0,60.07,,0.008,0.03,...,,,,,,,,,,
8,2018,1,1,8,35.0,19.0,59.3,,0.006,0.034,...,,,,,,,,,,
9,2018,1,1,9,35.0,19.0,55.48,,0.007,0.034,...,,,,,,,,,,


## 모두 합쳐진 csv 파일로 내보내기

In [25]:
df.to_csv('all.csv', index=False)