In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Cap the number of rows pandas prints for a DataFrame/Series.
# The fully-qualified key is required: the abbreviated 'max_rows' pattern is
# ambiguous on pandas >= 1.4 (it also matches 'styler.render.max_rows') and
# raises OptionError there.
pd.set_option('display.max_rows', 20)

# Matplotlib defaults for the whole notebook: stock style, 12x3 inch figures,
# sans-serif font.
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 3)
plt.rcParams['font.family'] = 'sans-serif'

In [3]:
# All input files are read from ./data relative to the current directory.
AQI_FILEPATH = os.path.join(os.curdir, 'data', 'aqi.csv')

# Main frame: 'datetime' is parsed and used as the index, and the literal
# string 'NR' is read as NaN.
df = pd.read_csv(filepath_or_buffer=AQI_FILEPATH, parse_dates=['datetime'], index_col='datetime', na_values='NR')
df_bak = df.copy()

# Same file, but keeping the 'NR' rainfall entries as literal strings.
df_withnr = pd.read_csv(filepath_or_buffer=AQI_FILEPATH, parse_dates=['datetime'], index_col='datetime')
df_withnr_bak = df_withnr.copy()

# Same file, before 'datetime' has been set as the index.
df_noindex = pd.read_csv(filepath_or_buffer=AQI_FILEPATH)
df_noindex_bak = df_noindex.copy()

# The raw, unprocessed data.
AQI_ORIG_FILEPATH = os.path.join(os.curdir, 'data', 'aqi_original.csv')

df_orig = pd.read_csv(AQI_ORIG_FILEPATH)
df_orig_bak = df_orig.copy()

# The data before it has been reshaped with pivot_table().
AQI_NOPIVOT_FILEPATH = os.path.join(os.curdir, 'data', 'aqi_nopivot.csv')

df_nopivot = pd.read_csv(AQI_NOPIVOT_FILEPATH, na_values=['NR'])
# Any cell matching this pattern (entries carrying '#', '*' or 'x' markers)
# is replaced by NaN. Assigning the result avoids inplace=True.
df_nopivot = df_nopivot.replace(r'[-]?\D*[.]?\D*[#*x]+', np.nan, regex=True)
# Assign the column directly: `df.loc[:, 'value'] = ...` writes into the
# existing object column on pandas >= 2.0 and would leave the dtype unchanged.
df_nopivot['value'] = df_nopivot['value'].astype(np.float64)
# Back up df_nopivot itself (the original mistakenly copied df_orig here).
df_nopivot_bak = df_nopivot.copy()

In [4]:
# Preview the first five rows of the main frame.
df.head(n=5)

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01 00:00:00,20.0,2.0,0.2,0.04,1.7,7.5,9.2,25.0,67.0,18.0,,88.0,1.5,2.0,33.0,37.0,1.4,0.1
2017-01-01 01:00:00,20.0,2.2,0.19,0.05,1.9,7.9,9.8,18.0,52.0,14.0,,88.0,2.1,2.2,76.0,143.0,0.5,0.6
2017-01-01 02:00:00,19.0,2.2,0.24,0.08,2.0,9.3,11.0,13.0,59.0,17.0,,89.0,2.0,2.3,140.0,142.0,0.5,0.6
2017-01-01 03:00:00,19.0,2.4,0.24,0.11,1.4,9.5,11.0,8.5,53.0,24.0,,90.0,1.8,2.5,107.0,51.0,0.5,0.4
2017-01-01 04:00:00,18.0,3.4,0.23,0.12,6.8,11.0,17.0,1.8,37.0,25.0,,90.0,1.7,3.6,96.0,106.0,1.0,0.5


# 第 4 章：挑選要觀察的資料

看完資料的樣態以後，我們會開始從裡面擷取需要的資料，以便進入資料分析的階段。此時可能需要比對特定的條件，或是對部分資料執行運算，以產生原始資料中沒有提供的資訊，好讓資料分析進行得更順利。


## Boolean Indexing：使用 Boolean Vector 取得特定條件下的資料

在選擇資料時，常常有很多情況是要選擇**符合一定條件**的資料。在 Pandas 裡面，我們必須先將這些條件組合成一個僅包含 `True` 或是 `False` 的布林向量 (Boolean vector)，再與原本的資料做比對，標示為 True 的 index 會被保留，而 False 則捨去，依此過濾出我們要查詢的資料。

參考文件：

* [10 Minutes to pandas | Selection | Boolean Indexing](https://pandas.pydata.org/pandas-docs/stable/10min.html#boolean-indexing)
* [Boolean indexing](http://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing)

### 單一條件比對

如果只需過濾一個條件，則使用單一 boolean vector 與原本的資料做比對。

In [5]:
# Example: keep the rows whose rainfall entry (column label: 'RAINFALL')
# is not the string 'NR'.
# Step 1 - build the boolean vector.
not_nr = df_withnr['RAINFALL'].ne('NR')
# Step 2 - use it as the row selector.
df_withnr.loc[not_nr, :]

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-03 07:00:00,,,,,,,,,,,,,,,,,,
2017-01-03 08:00:00,,,,,,,,,100.0,67.0,,,,,,,,
2017-01-14 03:00:00,17.0,2.0,0.56,0.08,1.0,19.0,20.0,28.0,40.0,27.0,0.2,86.0,3.9,2.0,57.0,59.0,4.8,2.8
2017-01-14 07:00:00,16.0,2.0,0.51,0.07,1.6,13.0,15.0,33.0,35.0,26.0,0.8,84.0,4.0,2.1,64.0,66.0,6.3,4.3
2017-01-14 08:00:00,15.0,2.0,0.61,0.07,2.3,20.0,22.0,22.0,35.0,26.0,1.6,87.0,3.6,2.1,58.0,56.0,5.3,3.2
2017-01-14 09:00:00,16.0,2.0,0.56,0.08,4.0,22.0,26.0,21.0,32.0,23.0,0.6,88.0,4.0,2.1,62.0,76.0,5.7,3.1
2017-01-17 14:00:00,21.0,1.8,0.18,0.02,1.7,4.3,6.0,48.0,82.0,,,77.0,,1.9,87.0,56.0,6.6,0.8
2017-01-25 10:00:00,18.0,,,,,,,42.0,,,,73.0,,,71.0,92.0,7.2,3.2
2017-02-01 03:00:00,,,,,,,,,,,,,,,,,,
2017-02-03 13:00:00,,,,,,,,,102.0,20.0,,,,,,104.0,6.6,


#### 使用 Boolean Vector 來檢視遺漏值數量

有一些 method 可以用來產生比對遺漏值的 boolean vector，例如

* `.isna()`, `.isnull()`（`.isna()` 的別名）：將帶有遺漏值的位置標記為 `True`
* `.notna()`, `.notnull()`（`.notna()` 的別名）：將不帶有遺漏值的位置標記為 `True`

透過這些 method，能用來觀察資料的遺漏值狀況，也可以過濾掉遺漏值或非遺漏值。

參考文件：

* [pandas.Series.isna](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.isna.html)
* [pandas.Series.notna](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.notna.html)
* [pandas.DataFrame.isna](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.isna.html)
* [pandas.DataFrame.notna](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.notna.html)
* [pandas.isna](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.isna.html)
* [pandas.notna](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.notna.html)

> 備註—有關遺漏值的兩三事：
>
> 所謂的**遺漏值，在文件中常稱為 NA (Not available)**，包含 `None` 或 `NaN` (`numpy.nan`)，
>
> 但是 `' '`（空白字元）或 `numpy.inf`（無限大數）則不屬於 **NA**。

In [6]:
# Start by looking at a single missing value.
# Assume we already know that sulfur dioxide (column label: 'SO2') has no
# reading at 02:00 on 2017-01-02.
df.loc[pd.Timestamp('2017-01-02 02:00:00'), 'SO2']

nan

In [7]:
# To filter out the missing SO2 readings, build the mask with .notna()
# and select the column and the rows in a single .loc call.
so2_notnull = df['SO2'].notna()
df.loc[so2_notnull, 'SO2']

# This is more work than necessary. As a preview: dropna() handles this
# case more directly and is covered in a later chapter.
# df.loc[:, 'SO2'].dropna()

datetime
2017-01-01 00:00:00    1.5
2017-01-01 01:00:00    2.1
2017-01-01 02:00:00    2.0
2017-01-01 03:00:00    1.8
2017-01-01 04:00:00    1.7
2017-01-01 05:00:00    2.0
2017-01-01 06:00:00    2.0
2017-01-01 07:00:00    2.0
2017-01-01 08:00:00    1.9
2017-01-01 09:00:00    3.0
                      ... 
2017-12-31 14:00:00    2.3
2017-12-31 15:00:00    3.4
2017-12-31 16:00:00    3.7
2017-12-31 17:00:00    4.0
2017-12-31 18:00:00    3.8
2017-12-31 19:00:00    3.8
2017-12-31 20:00:00    4.8
2017-12-31 21:00:00    4.3
2017-12-31 22:00:00    6.0
2017-12-31 23:00:00    6.8
Name: SO2, Length: 8648, dtype: float64

### 多重條件比對

若有多個條件需要比對時，要使用 **operators**（運算子）將多個條件組合成 boolean vector，而個別條件則建議使用 **`()`** 將其括起來。

> 備註：
>
> 不用括號將各個條件給集合起來，可能會遇到的問題是：程式或許不會如我們想像的方式來執行。
>
> 例：`df.A > 2 & df.B < 3` 沒有使用 `()` 將兩個條件先各自集合，程式就會解讀成 `df.A > (2 & df.B) < 3`，而不是解讀成 `(df.A > 2) & (df.B < 3)`。
>
> 這是運算子優先級造成的：`&`、`|` 的優先級高於 `>`、`<` 等比較運算子，所以 `2 & df.B` 會先被運算。詳細請參考 [Python 官方文件的 Operator precedence 章節](https://docs.python.org/3/reference/expressions.html#operator-precedence)。

常見的運算子以及範例：

* `|`：代表 `or`（或）
    * 例：取得 column 為「PM2.5 小於 1」**或**「大於 20」的資料
    * Boolean vector: `(df['PM2.5'] < 1) | (df['PM2.5'] > 20)`
* `&`：代表 `and`（且）
    * 例：取得 column 為「PM2.5 大於等於 1」**且**「小於等於 20」的資料
    * Boolean vector: `(df['PM2.5'] >= 1) & (df['PM2.5'] <= 20)`
* `~`：代表 `not`（不是）
    * 例：取得 column 為 「PM2.5 不小於 20」的資料
    * Boolean vector: `~(df['PM2.5'] < 20)`


In [8]:
# Example: select the rows whose methane concentration (column label: 'CH4')
# lies between 2 and 6 (lower bound included, upper bound excluded).
ch4_bigger_than_2 = df['CH4'].ge(2)
ch4_smaller_than_6 = df['CH4'].lt(6)
df.loc[ch4_bigger_than_2 & ch4_smaller_than_6, :]
# The selection above is equivalent to:
# df.loc[(df.loc[:,'CH4'] >= 2) & (ch4_smaller_than_6)]

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01 00:00:00,20.0,2.0,0.20,0.04,1.7,7.5,9.2,25.0,67.0,18.0,,88.0,1.5,2.0,33.0,37.0,1.4,0.1
2017-01-01 01:00:00,20.0,2.2,0.19,0.05,1.9,7.9,9.8,18.0,52.0,14.0,,88.0,2.1,2.2,76.0,143.0,0.5,0.6
2017-01-01 02:00:00,19.0,2.2,0.24,0.08,2.0,9.3,11.0,13.0,59.0,17.0,,89.0,2.0,2.3,140.0,142.0,0.5,0.6
2017-01-01 03:00:00,19.0,2.4,0.24,0.11,1.4,9.5,11.0,8.5,53.0,24.0,,90.0,1.8,2.5,107.0,51.0,0.5,0.4
2017-01-01 04:00:00,18.0,3.4,0.23,0.12,6.8,11.0,17.0,1.8,37.0,25.0,,90.0,1.7,3.6,96.0,106.0,1.0,0.5
2017-01-01 05:00:00,17.0,3.6,0.37,0.13,8.4,12.0,20.0,2.0,57.0,28.0,,91.0,2.0,3.7,79.0,114.0,0.8,0.7
2017-01-01 06:00:00,18.0,3.3,0.42,0.13,8.1,12.0,20.0,1.9,41.0,33.0,,91.0,2.0,3.4,74.0,67.0,1.1,0.9
2017-01-01 07:00:00,18.0,2.8,0.51,0.19,15.0,14.0,29.0,2.9,60.0,36.0,,92.0,2.0,2.9,33.0,11.0,1.1,0.7
2017-01-01 08:00:00,19.0,2.5,0.54,0.18,18.0,14.0,32.0,5.0,50.0,41.0,,93.0,1.9,2.7,49.0,40.0,1.5,1.1
2017-01-01 09:00:00,20.0,2.2,0.53,0.16,11.0,15.0,27.0,13.0,74.0,46.0,,93.0,3.0,2.4,51.0,47.0,1.9,1.2
