In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

In [2]:
# Path of the raw AQI spreadsheet, resolved relative to the current working directory.
AQI_FILEPATH = os.path.join(os.curdir, 'mailiao_aqi.xls')
# Read the workbook into the raw DataFrame that the following cells reshape.
df = pd.read_excel(io=AQI_FILEPATH)

In [3]:
# Preview the first five rows of the raw frame as loaded from the spreadsheet.
df.head(n=5)

Unnamed: 0,日期,測站,測項,00,01,02,03,04,05,06,...,14,15,16,17,18,19,20,21,22,23
0,2017/01/01,麥寮,AMB_TEMP,20.0,20.0,19.0,19.0,18.0,17.0,18.0,...,23.0,22.0,22.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
1,2017/01/01,麥寮,CH4,2.0,2.2,2.2,2.4,3.4,3.6,3.3,...,1.8,1.8,1.8,1.8,1.9,1.8,1.9,1.9,1.9,2.2
2,2017/01/01,麥寮,CO,0.2,0.19,0.24,0.24,0.23,0.37,0.42,...,0.22,0.17,0.16,0.16,0.16,0.15,0.15,0.17,0.15,0.15
3,2017/01/01,麥寮,NMHC,0.04,0.05,0.08,0.11,0.12,0.13,0.13,...,0.03,0.03,0.03,0.02,0.02,0.02,0.02,0.02,0.03,0.02
4,2017/01/01,麥寮,NO,1.7,1.9,2.0,1.4,6.8,8.4,8.1,...,1.1,0.6,1.1,0.7,1.3,1.0,0.7,1.1,0.8,1.1


In [4]:
# Force every column label to `str` so later cells can treat all labels uniformly.
df.columns = list(map(str, df.columns))

In [5]:
# Rename the first three columns from Chinese to English; every remaining
# column label is kept exactly as it is.
columns = ['date', 'location', 'item', *df.columns[3:]]
df.columns = columns

In [6]:
# The first three columns are 'date', 'location' and 'item'; every column after
# them is kept as one of the per-hour value columns.
leading_descriptive_columns = 3
columns_hours = df.columns[leading_descriptive_columns:]

In [7]:
# Split the wide table into one three-column DataFrame per hour column:
# ['date', 'item', <hour label>].
key_columns = ['date', 'item']
hours_split_df = [
    df.loc[:, key_columns + [hour_label]]
    for hour_label in columns_hours
]

In [8]:
# Preview the first five rows of the frame built for the first hour column.
hours_split_df[0].head(n=5)

Unnamed: 0,date,item,00
0,2017/01/01,AMB_TEMP,20.0
1,2017/01/01,CH4,2.0
2,2017/01/01,CO,0.2
3,2017/01/01,NMHC,0.04
4,2017/01/01,NO,1.7


In [9]:
# Turn each per-hour frame into the common layout ['date', 'item', 'value', 'hour'].
#
# The label of the last column is the hour that frame holds, so it is copied into a
# new 'hour' column and the measurement column itself is renamed to 'value'.
#
# New frames are built and stored back into the list instead of mutating the slices
# in place. Frames that already carry an 'hour' column are skipped, so re-running
# this cell is a no-op; previously a second run overwrote every hour with the
# literal string 'hour'.
for position, hour_df in enumerate(hours_split_df):
    if 'hour' in hour_df.columns:
        continue  # already converted by an earlier run of this cell
    hour_label = hour_df.columns[-1]  # the last column's label is the hour
    hours_split_df[position] = (
        hour_df
        .rename(columns={hour_label: 'value'})
        .assign(hour=hour_label)
    )

In [10]:
# Stack all per-hour DataFrames into a single long table.
# Every per-hour frame carries the same row labels as the source frame, so a plain
# concat would repeat each label once per hour; ignore_index=True gives the
# combined table a fresh, unique RangeIndex instead.
df_concat = pd.concat(hours_split_df, ignore_index=True)

In [11]:
# Build the columns needed to assemble a full timestamp for every row.
#
# Plain column assignment is used on purpose: with pandas >= 2.0,
# `df.loc[:, 'date'] = ...` writes into the existing column in place and keeps its
# original dtype, so 'date' would not become datetime64 and the `.dt` accessor
# below would raise AttributeError.
df_concat['date'] = pd.to_datetime(df_concat['date'])  # parse to datetime dtype
df_concat['year'] = df_concat['date'].dt.year  # calendar year
df_concat['month'] = df_concat['date'].dt.month  # calendar month
df_concat['day'] = df_concat['date'].dt.day  # day of month
# Assemble the final timestamp from the four component columns.
df_concat['datetime'] = pd.to_datetime(df_concat[['year', 'month', 'day', 'hour']])

In [12]:
# Keep only the columns needed downstream, as an independent copy.
df_aqi = df_concat[['datetime', 'item', 'value']].copy()
# Optional export of the long (un-pivoted) table, left disabled:
# df_aqi.to_csv(os.path.join(os.curdir, 'drive', 'Colab Notebooks', 'Pandas Training', 'Preprocessing', 'aqi_nopivot.csv'), index=False)

In [13]:
# Reshape from long to wide: one row per timestamp, one column per item.
# Then replace any text cell matched by the pattern below (values carrying the
# special markers '#', '*' or 'x') with NaN.
flagged_value_pattern = r'[-]?\D*[.]?\D*[#*x]+'
df_aqi_pivot = (
    df_aqi
    .pivot(index='datetime', columns='item', values='value')
    .replace(flagged_value_pattern, np.nan, regex=True)
)

In [14]:
# Preview the first five rows of the pivoted table.
df_aqi_pivot.head(n=5)

item,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01 00:00:00,20.0,2.0,0.2,0.04,1.7,7.5,9.2,25.0,67.0,18.0,NR,88.0,1.5,2.0,33.0,37.0,1.4,0.1
2017-01-01 01:00:00,20.0,2.2,0.19,0.05,1.9,7.9,9.8,18.0,52.0,14.0,NR,88.0,2.1,2.2,76.0,143.0,0.5,0.6
2017-01-01 02:00:00,19.0,2.2,0.24,0.08,2.0,9.3,11.0,13.0,59.0,17.0,NR,89.0,2.0,2.3,140.0,142.0,0.5,0.6
2017-01-01 03:00:00,19.0,2.4,0.24,0.11,1.4,9.5,11.0,8.5,53.0,24.0,NR,90.0,1.8,2.5,107.0,51.0,0.5,0.4
2017-01-01 04:00:00,18.0,3.4,0.23,0.12,6.8,11.0,17.0,1.8,37.0,25.0,NR,90.0,1.7,3.6,96.0,106.0,1.0,0.5
