In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
pd.set_option('max_rows', 20)

plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 3)
plt.rcParams['font.family'] = 'sans-serif'

In [3]:
AQI_FILEPATH = os.path.join(os.curdir, 'data', 'aqi.csv')

df = pd.read_csv(filepath_or_buffer=AQI_FILEPATH, parse_dates=['datetime'], index_col='datetime', na_values='NR')
df_bak = df.copy()

# 保留降雨量為 NR 值的資料
df_withnr = pd.read_csv(filepath_or_buffer=AQI_FILEPATH, parse_dates=['datetime'], index_col='datetime')
df_withnr_bak = df_withnr.copy()

# 保留還未將 datetime 設為 index 的資料
df_noindex = pd.read_csv(filepath_or_buffer=AQI_FILEPATH)
df_noindex_bak = df_noindex.copy()

# 讀取未處理過的原始資料
AQI_ORIG_FILEPATH = os.path.join(os.curdir, 'data', 'aqi_original.csv')

df_orig = pd.read_csv(AQI_ORIG_FILEPATH)
df_orig_bak = df_orig.copy()

# 讀取還未使用 pivot_table() 處理過的資料
AQI_NOPIVOT_FILEPATH = os.path.join(os.curdir, 'data', 'aqi_nopivot.csv')

df_nopivot = pd.read_csv(AQI_NOPIVOT_FILEPATH, na_values=['NR'])
df_nopivot.replace(r'[-]?\D*[.]?\D*[#*x]+', np.nan, regex=True, inplace=True)
df_nopivot.loc[:, 'value'] = df_nopivot.loc[:, 'value'].astype(np.float64)
df_nopivot_bak = df_orig.copy()

In [4]:
df.head()

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01 00:00:00,20.0,2.0,0.2,0.04,1.7,7.5,9.2,25.0,67.0,18.0,,88.0,1.5,2.0,33.0,37.0,1.4,0.1
2017-01-01 01:00:00,20.0,2.2,0.19,0.05,1.9,7.9,9.8,18.0,52.0,14.0,,88.0,2.1,2.2,76.0,143.0,0.5,0.6
2017-01-01 02:00:00,19.0,2.2,0.24,0.08,2.0,9.3,11.0,13.0,59.0,17.0,,89.0,2.0,2.3,140.0,142.0,0.5,0.6
2017-01-01 03:00:00,19.0,2.4,0.24,0.11,1.4,9.5,11.0,8.5,53.0,24.0,,90.0,1.8,2.5,107.0,51.0,0.5,0.4
2017-01-01 04:00:00,18.0,3.4,0.23,0.12,6.8,11.0,17.0,1.8,37.0,25.0,,90.0,1.7,3.6,96.0,106.0,1.0,0.5


# 第 2 章：觀察資料

在 Pandas 中，資料的基本物件有兩種，分別是一維 (one-dimension) 的 `Series` 與二維 (two-dimension) 的 `DataFrame`。我們在上個章節已經將資料讀取成為 DataFrame 物件了，接著就帶大家操作相關的功能，來了解資料本身的樣態。

以下我們將介紹 Pandas 中，最基礎的兩個物件：`DataFrame` 和 `Series` 的相關操作，但會以 DataFrame 為主。大多數的操作都可以套用在 DataFrame 和 Series 上面，不妨自行嘗試看看。

參考文件：

* [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
* [pandas.Series](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html)
* [10 Minutes to pandas | Object Creation](http://pandas.pydata.org/pandas-docs/stable/10min.html#object-creation)
* [Intro to Data Structures](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dsintro)

## 觀察資料

### `.head()` : 取得前 n Rows 的資料

`.head()` function 將回傳前 n 個 rows 的資料，預設為前 5 個 rows。

* 常用 Parameters : 
     * n：回傳 row 的數量，預設為 5
* 參考文件：
    * [pandas.DataFrame.head](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.head.html)
    * [pandas.Series.head](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.head.html)



In [5]:
# 取得前 5 個 rows 的資料
df.head()

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01 00:00:00,20.0,2.0,0.2,0.04,1.7,7.5,9.2,25.0,67.0,18.0,,88.0,1.5,2.0,33.0,37.0,1.4,0.1
2017-01-01 01:00:00,20.0,2.2,0.19,0.05,1.9,7.9,9.8,18.0,52.0,14.0,,88.0,2.1,2.2,76.0,143.0,0.5,0.6
2017-01-01 02:00:00,19.0,2.2,0.24,0.08,2.0,9.3,11.0,13.0,59.0,17.0,,89.0,2.0,2.3,140.0,142.0,0.5,0.6
2017-01-01 03:00:00,19.0,2.4,0.24,0.11,1.4,9.5,11.0,8.5,53.0,24.0,,90.0,1.8,2.5,107.0,51.0,0.5,0.4
2017-01-01 04:00:00,18.0,3.4,0.23,0.12,6.8,11.0,17.0,1.8,37.0,25.0,,90.0,1.7,3.6,96.0,106.0,1.0,0.5


In [6]:
# 取得前 10 個 rows 的資料
df.head(10)

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01 00:00:00,20.0,2.0,0.2,0.04,1.7,7.5,9.2,25.0,67.0,18.0,,88.0,1.5,2.0,33.0,37.0,1.4,0.1
2017-01-01 01:00:00,20.0,2.2,0.19,0.05,1.9,7.9,9.8,18.0,52.0,14.0,,88.0,2.1,2.2,76.0,143.0,0.5,0.6
2017-01-01 02:00:00,19.0,2.2,0.24,0.08,2.0,9.3,11.0,13.0,59.0,17.0,,89.0,2.0,2.3,140.0,142.0,0.5,0.6
2017-01-01 03:00:00,19.0,2.4,0.24,0.11,1.4,9.5,11.0,8.5,53.0,24.0,,90.0,1.8,2.5,107.0,51.0,0.5,0.4
2017-01-01 04:00:00,18.0,3.4,0.23,0.12,6.8,11.0,17.0,1.8,37.0,25.0,,90.0,1.7,3.6,96.0,106.0,1.0,0.5
2017-01-01 05:00:00,17.0,3.6,0.37,0.13,8.4,12.0,20.0,2.0,57.0,28.0,,91.0,2.0,3.7,79.0,114.0,0.8,0.7
2017-01-01 06:00:00,18.0,3.3,0.42,0.13,8.1,12.0,20.0,1.9,41.0,33.0,,91.0,2.0,3.4,74.0,67.0,1.1,0.9
2017-01-01 07:00:00,18.0,2.8,0.51,0.19,15.0,14.0,29.0,2.9,60.0,36.0,,92.0,2.0,2.9,33.0,11.0,1.1,0.7
2017-01-01 08:00:00,19.0,2.5,0.54,0.18,18.0,14.0,32.0,5.0,50.0,41.0,,93.0,1.9,2.7,49.0,40.0,1.5,1.1
2017-01-01 09:00:00,20.0,2.2,0.53,0.16,11.0,15.0,27.0,13.0,74.0,46.0,,93.0,3.0,2.4,51.0,47.0,1.9,1.2


### `.tail()` : 取得倒數 n Rows 的資料

`.tail()` function 將回傳倒數 n 個 rows 的資料，預設為倒數 5 個 rows。

* 常用 Parameters : 
     * n：回傳 row 的數量，預設為 5
* 參考文件：
    * [pandas.DataFrame.tail](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.tail.html)
    * [pandas.Series.tail](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.tail.html)


In [7]:
# 取得倒數 5 個 rows 的資料
df.tail()

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-12-31 19:00:00,17.0,2.0,0.57,0.11,0.9,16.0,17.0,25.0,88.0,53.0,,73.0,3.8,2.1,11.0,11.0,5.6,5.6
2017-12-31 20:00:00,17.0,2.0,0.53,0.1,0.9,18.0,19.0,23.0,82.0,48.0,,73.0,4.8,2.1,11.0,11.0,4.5,5.0
2017-12-31 21:00:00,17.0,1.9,0.49,0.09,0.9,17.0,18.0,25.0,91.0,44.0,,73.0,4.3,2.0,12.0,14.0,3.8,4.6
2017-12-31 22:00:00,17.0,1.9,0.46,0.08,0.9,20.0,21.0,21.0,82.0,41.0,,72.0,6.0,2.0,14.0,15.0,4.2,4.0
2017-12-31 23:00:00,17.0,1.9,0.45,0.06,1.0,20.0,21.0,21.0,77.0,35.0,,70.0,6.8,2.0,16.0,19.0,3.8,3.9


In [8]:
# 取得倒數 10 個 rows 的資料
df.tail(10)

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-12-31 14:00:00,19.0,2.1,0.78,0.12,1.5,11.0,12.0,46.0,213.0,95.0,,69.0,2.3,2.2,12.0,13.0,8.6,7.6
2017-12-31 15:00:00,19.0,2.1,0.78,0.13,1.7,13.0,15.0,43.0,176.0,95.0,,69.0,3.4,2.2,11.0,8.4,6.5,6.4
2017-12-31 16:00:00,18.0,2.1,0.75,0.13,1.6,16.0,17.0,38.0,163.0,94.0,,71.0,3.7,2.2,9.2,10.0,6.5,6.5
2017-12-31 17:00:00,18.0,2.0,0.72,0.13,0.8,17.0,18.0,32.0,125.0,82.0,,72.0,4.0,2.2,11.0,15.0,7.0,6.2
2017-12-31 18:00:00,18.0,2.0,0.65,0.12,0.7,16.0,17.0,29.0,95.0,64.0,,72.0,3.8,2.1,11.0,8.5,6.8,6.2
2017-12-31 19:00:00,17.0,2.0,0.57,0.11,0.9,16.0,17.0,25.0,88.0,53.0,,73.0,3.8,2.1,11.0,11.0,5.6,5.6
2017-12-31 20:00:00,17.0,2.0,0.53,0.1,0.9,18.0,19.0,23.0,82.0,48.0,,73.0,4.8,2.1,11.0,11.0,4.5,5.0
2017-12-31 21:00:00,17.0,1.9,0.49,0.09,0.9,17.0,18.0,25.0,91.0,44.0,,73.0,4.3,2.0,12.0,14.0,3.8,4.6
2017-12-31 22:00:00,17.0,1.9,0.46,0.08,0.9,20.0,21.0,21.0,82.0,41.0,,72.0,6.0,2.0,14.0,15.0,4.2,4.0
2017-12-31 23:00:00,17.0,1.9,0.45,0.06,1.0,20.0,21.0,21.0,77.0,35.0,,70.0,6.8,2.0,16.0,19.0,3.8,3.9


### `.shape`：回傳 rows 數量 和 columns 數量

回傳一個標示資料形狀的 tuple。

參考文件：

* [pandas.DataFrame.shape](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shape.html)
* [pandas.Series.shape](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.shape.html)

In [9]:
df.shape
# 結果說明：
# 回傳的第一個元素為 Row 的數量
# 第二個元素則為 Column 的數量
# 兩者皆包含數值、非數值 (NaN)、以及空值

(8760, 18)

In [10]:
type(df[['NO']])

pandas.core.frame.DataFrame

In [11]:
s = df['NO']

In [12]:
type(s)

pandas.core.series.Series

In [13]:
s.shape

(8760,)

### `.index`：回傳 Row Labels

回傳所有 index (row labels)。

參考文件：

* [pandas.DataFrame.index](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.index.html)
* [pandas.Series.index](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.index.html)

In [14]:
# 檢視資料的 Index
df.index

DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 01:00:00',
               '2017-01-01 02:00:00', '2017-01-01 03:00:00',
               '2017-01-01 04:00:00', '2017-01-01 05:00:00',
               '2017-01-01 06:00:00', '2017-01-01 07:00:00',
               '2017-01-01 08:00:00', '2017-01-01 09:00:00',
               ...
               '2017-12-31 14:00:00', '2017-12-31 15:00:00',
               '2017-12-31 16:00:00', '2017-12-31 17:00:00',
               '2017-12-31 18:00:00', '2017-12-31 19:00:00',
               '2017-12-31 20:00:00', '2017-12-31 21:00:00',
               '2017-12-31 22:00:00', '2017-12-31 23:00:00'],
              dtype='datetime64[ns]', name='datetime', length=8760, freq=None)

In [15]:
s.index

DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 01:00:00',
               '2017-01-01 02:00:00', '2017-01-01 03:00:00',
               '2017-01-01 04:00:00', '2017-01-01 05:00:00',
               '2017-01-01 06:00:00', '2017-01-01 07:00:00',
               '2017-01-01 08:00:00', '2017-01-01 09:00:00',
               ...
               '2017-12-31 14:00:00', '2017-12-31 15:00:00',
               '2017-12-31 16:00:00', '2017-12-31 17:00:00',
               '2017-12-31 18:00:00', '2017-12-31 19:00:00',
               '2017-12-31 20:00:00', '2017-12-31 21:00:00',
               '2017-12-31 22:00:00', '2017-12-31 23:00:00'],
              dtype='datetime64[ns]', name='datetime', length=8760, freq=None)

### `.columns` ： 回傳 Column Labels

透過這個屬性以**取得**、或**修改** Column 的內容。

參考文件：

* [pandas.DataFrame.columns](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.columns.html)

In [16]:
# 檢視資料的 Column
df.columns

Index(['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10',
       'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC', 'WD_HR', 'WIND_DIREC',
       'WIND_SPEED', 'WS_HR'],
      dtype='object')

### `.info()` : 回傳 DataFrame 的彙整資訊

調用 DataFrame 的 `.info()` 這個 method 會回傳包含了

* `index dtype`：Index 的資料型態
* `column dtypes`：Column 的資料型態
* `non-null values`：非空值的個數
* `memory usage`：物件占用多少記憶體空間

等等的資訊。

參考文件：[pandas.DataFrame.info](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.info.html)

In [17]:
# 檢視資料的彙整資訊
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8760 entries, 2017-01-01 00:00:00 to 2017-12-31 23:00:00
Data columns (total 18 columns):
AMB_TEMP      8733 non-null float64
CH4           8662 non-null float64
CO            8662 non-null float64
NMHC          8662 non-null float64
NO            8554 non-null float64
NO2           8554 non-null float64
NOx           8554 non-null float64
O3            8681 non-null float64
PM10          8625 non-null float64
PM2.5         8576 non-null float64
RAINFALL      415 non-null float64
RH            8732 non-null float64
SO2           8648 non-null float64
THC           8662 non-null float64
WD_HR         8727 non-null float64
WIND_DIREC    8732 non-null float64
WIND_SPEED    8732 non-null float64
WS_HR         8727 non-null float64
dtypes: float64(18)
memory usage: 1.3 MB


### `.describe()` : 回傳敘述性統計分析的結果

調用 `describe()` method 以製作一份資料的敘述性統計分析，是用來觀察資料分布的技巧。

敘述性統計的分析結果通常還會搭配一些視覺化的工具來輔助觀察，這些工具將在後面的小節中提及。

* 常用 Parameter：
    * `include`：可以指定一連串 dtype 的白名單，讓敘述性統計分析包含這些型態的 column，或是指定 `all` 來包含所有的 column
* 參考文件：
    * [pandas.DataFrame.describe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html)
    * [pandas.Series.describe](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.describe.html)
    * [Wikipedia | 描述統計學](https://zh.wikipedia.org/zh-tw/描述統計學)

In [18]:
# 我們用 df_noindex 來觀察 DataFrame 中所有 column 的資料分布
# 此時會發現部分資料為 NaN，因為各個 column 的 dtype 不同
# 其所可以計算的統計量也不同，無法計算的便會顯示為 NaN
df_noindex.describe(include='all')

Unnamed: 0,datetime,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
count,8760,8733.0,8662.0,8662.0,8662.0,8554.0,8554.0,8554.0,8681.0,8625.0,8576.0,8713,8732.0,8648.0,8662.0,8727.0,8732.0,8732.0,8727.0
unique,8760,,,,,,,,,,,65,,,,,,,
top,2017-04-11 00:00:00,,,,,,,,,,,NR,,,,,,,
freq,1,,,,,,,,,,,8298,,,,,,,
mean,,24.590633,2.034242,0.28198,0.072598,2.197697,9.169885,11.363841,31.892605,70.665159,27.819263,,78.943999,2.953423,2.105518,111.034743,113.507639,3.393404,2.495462
std,,5.065055,0.33477,0.13375,0.077559,2.084884,5.037761,6.142313,17.085349,48.478237,15.27443,,9.033272,1.877609,0.366224,101.564899,102.146535,2.139149,1.829496
min,,13.0,1.7,0.02,-0.03,-0.5,0.7,1.3,1.0,9.0,2.0,,29.0,0.0,1.7,0.0,0.0,0.4,0.0
25%,,20.0,1.9,0.19,0.04,1.1,5.5,7.1,19.0,42.0,18.0,,73.0,1.8,1.9,24.5,27.0,1.7,1.1
50%,,25.0,1.9,0.26,0.06,1.7,8.0,9.9,30.0,59.0,24.0,,80.0,2.4,2.0,68.0,71.0,2.9,2.0
75%,,29.0,2.1,0.35,0.09,2.6,12.0,14.0,43.0,83.0,35.0,,86.0,3.5,2.2,192.0,196.0,4.7,3.3


In [19]:
# 觀察 DataFrame 中，dtype 為數值的 column 的資料分布
# dtype 為 float 或 int 時，使用 .describe() method，會回傳 8 個統計量
df_noindex.describe(include=np.number)

Unnamed: 0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
count,8733.0,8662.0,8662.0,8662.0,8554.0,8554.0,8554.0,8681.0,8625.0,8576.0,8732.0,8648.0,8662.0,8727.0,8732.0,8732.0,8727.0
mean,24.590633,2.034242,0.28198,0.072598,2.197697,9.169885,11.363841,31.892605,70.665159,27.819263,78.943999,2.953423,2.105518,111.034743,113.507639,3.393404,2.495462
std,5.065055,0.33477,0.13375,0.077559,2.084884,5.037761,6.142313,17.085349,48.478237,15.27443,9.033272,1.877609,0.366224,101.564899,102.146535,2.139149,1.829496
min,13.0,1.7,0.02,-0.03,-0.5,0.7,1.3,1.0,9.0,2.0,29.0,0.0,1.7,0.0,0.0,0.4,0.0
25%,20.0,1.9,0.19,0.04,1.1,5.5,7.1,19.0,42.0,18.0,73.0,1.8,1.9,24.5,27.0,1.7,1.1
50%,25.0,1.9,0.26,0.06,1.7,8.0,9.9,30.0,59.0,24.0,80.0,2.4,2.0,68.0,71.0,2.9,2.0
75%,29.0,2.1,0.35,0.09,2.6,12.0,14.0,43.0,83.0,35.0,86.0,3.5,2.2,192.0,196.0,4.7,3.3
max,34.0,8.6,1.84,3.09,38.0,38.0,65.0,105.0,549.0,121.0,93.0,43.0,8.6,360.0,360.0,12.0,10.0


In [20]:
df_noindex.describe(include=float)

Unnamed: 0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
count,8733.0,8662.0,8662.0,8662.0,8554.0,8554.0,8554.0,8681.0,8625.0,8576.0,8732.0,8648.0,8662.0,8727.0,8732.0,8732.0,8727.0
mean,24.590633,2.034242,0.28198,0.072598,2.197697,9.169885,11.363841,31.892605,70.665159,27.819263,78.943999,2.953423,2.105518,111.034743,113.507639,3.393404,2.495462
std,5.065055,0.33477,0.13375,0.077559,2.084884,5.037761,6.142313,17.085349,48.478237,15.27443,9.033272,1.877609,0.366224,101.564899,102.146535,2.139149,1.829496
min,13.0,1.7,0.02,-0.03,-0.5,0.7,1.3,1.0,9.0,2.0,29.0,0.0,1.7,0.0,0.0,0.4,0.0
25%,20.0,1.9,0.19,0.04,1.1,5.5,7.1,19.0,42.0,18.0,73.0,1.8,1.9,24.5,27.0,1.7,1.1
50%,25.0,1.9,0.26,0.06,1.7,8.0,9.9,30.0,59.0,24.0,80.0,2.4,2.0,68.0,71.0,2.9,2.0
75%,29.0,2.1,0.35,0.09,2.6,12.0,14.0,43.0,83.0,35.0,86.0,3.5,2.2,192.0,196.0,4.7,3.3
max,34.0,8.6,1.84,3.09,38.0,38.0,65.0,105.0,549.0,121.0,93.0,43.0,8.6,360.0,360.0,12.0,10.0


In [21]:
# 觀察 DataFrame 中 dtype 為 object 的 column 的資料分布
# dtype 為 object 時，使用 .describe() method，會回傳 4 個統計量
# count: 排除 NaN 之後的資料筆數
# unique: 不重複資料有幾種
# top: 不重複資料中占比最高的
# freq: 不重複資料中占比最高的所佔筆數
df_noindex.describe(include=np.object)

Unnamed: 0,datetime,RAINFALL
count,8760,8713
unique,8760,65
top,2017-04-11 00:00:00,NR
freq,1,8298


In [22]:
# 另一個例子：觀察風向（Column 名稱：'WIND_DIREC'）的資料分布
df['WIND_DIREC'].astype('str').describe()

count     8760
unique     452
top       11.0
freq       249
Name: WIND_DIREC, dtype: object

### `.dtypes` : 回傳每一個 Column 的 Data type

若是調用 `DataFrame.dtypes`，將回傳一個帶有所有 column 的 data type 的 Series，調用 `Series.dtypes`（或是 `Series.dtype`），將回傳 data type object。調用這些 Attribute 所得到的結果與 `.info()` 中各 `columns dtype` 項目是相同的。

Data types 的說明則請參閱 [Numpy 的官方文件](https://docs.scipy.org/doc/numpy/user/basics.types.html)。

參考文件：

* [pandas.DataFrame.dtypes](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dtypes.html)
* [pandas.Series.dtype](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.dtype.html)
* [Data types](https://docs.scipy.org/doc/numpy/user/basics.types.html)


In [23]:
df.dtypes

AMB_TEMP      float64
CH4           float64
CO            float64
NMHC          float64
NO            float64
NO2           float64
NOx           float64
O3            float64
PM10          float64
PM2.5         float64
RAINFALL      float64
RH            float64
SO2           float64
THC           float64
WD_HR         float64
WIND_DIREC    float64
WIND_SPEED    float64
WS_HR         float64
dtype: object

### `.unique()`, `.nunique()`：檢視無重複的資料

觀察類別資料時，我們常常會想到要觀察「**不重複**的所有資料」，這時可以對 Series 調用 `unique()` 或是 `nunique()`（排除 `NaN` 值）method。

如果還要計算出各類別的個數，還可以使用 `.value_counts()` method。

In [24]:
# 理論上是直接對 Series 調用 .unique() Method 就好
df.loc[:, 'WIND_DIREC'].unique()

array([3.70e+01, 1.43e+02, 1.42e+02, 5.10e+01, 1.06e+02, 1.14e+02,
       6.70e+01, 1.10e+01, 4.00e+01, 4.70e+01, 3.30e+01, 2.86e+02,
       6.50e+01, 9.20e+01, 1.10e+02, 2.69e+02, 1.38e+02, 7.80e+01,
       1.78e+02, 4.20e+01, 4.40e+01, 3.17e+02, 4.80e+01, 3.60e+01,
       5.40e+01, 1.15e+02, 3.39e+02, 3.50e+01, 4.90e+01, 2.80e+02,
       2.67e+02, 1.03e+02, 4.60e+01, 7.30e+01, 5.70e+01, 6.10e+01,
       7.50e+01, 5.90e+01, 1.08e+02, 5.00e+01, 3.80e+01, 7.20e+01,
       7.70e+01, 2.90e+01,      nan, 1.16e+02, 2.15e+02, 1.27e+02,
       1.71e+02, 1.35e+02, 6.80e+01, 6.60e+01, 6.20e+01, 4.10e+01,
       7.40e+01, 6.30e+01, 4.50e+01, 2.25e+02, 2.26e+02, 2.53e+02,
       2.45e+02, 1.80e+02, 3.40e+01, 3.10e+01, 3.31e+02, 8.70e+01,
       9.90e+01, 1.74e+02, 1.56e+02, 1.48e+02, 1.54e+02, 1.87e+02,
       1.49e+02, 2.37e+02, 2.66e+02, 2.77e+02, 2.41e+02, 2.84e+02,
       2.64e+02, 9.40e+01, 7.90e+00, 5.50e+01, 3.59e+02, 5.20e+01,
       2.56e+02, 2.93e+02, 1.00e+02, 1.50e+02, 1.12e+02, 3.90e

In [25]:
# 但實際上在執行時，因為輸出結果是浮點數（表示方式會受到 IEEE 754 影響）、
# 以及物件為 Numpy ndarray 的關係，結果會帶科學記號，
# 會很難觀察數值，這時可以選擇轉換為 Series
wind_direc_s = pd.Series(df.loc[:, 'WIND_DIREC'].unique())
wind_direc_s.sort_values().reset_index(drop=True).head()

0    0.0
1    0.1
2    0.2
3    0.3
4    0.4
dtype: float64

In [26]:
# 觀察最長測到的風向角度的前十名
df.loc[:, 'WIND_DIREC'].value_counts().head(10)

11.0    249
12.0    183
13.0    170
10.0    125
14.0    104
15.0     68
39.0     65
42.0     64
61.0     62
65.0     62
Name: WIND_DIREC, dtype: int64

### `.value_counts()`：計算 series 中每一個數值的出現次數，並回傳由大到小排序的 series

`value_counts()` method 會回傳 Series 中，每個數值出現的次數的 Series。

預設的排序方式是以數值的大小做降冪排序，如果想要更改排序的方式，可以搭配 `sort_values()` 或是 `sort_index()` method。

* 參考文件：
    * [pandas.Series.value_counts](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html)
    * [pandas.Series.sort_index](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.sort_index.html)
    * [pandas.Series.sort_values](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.sort_values.html)

In [27]:
# 例：計算 PM2.5 中各數值出現次數
df['PM2.5'].value_counts()

20.0     573
18.0     295
22.0     292
19.0     291
27.0     284
17.0     277
23.0     274
15.0     268
24.0     261
21.0     259
        ... 
100.0      1
113.0      1
83.0       1
104.0      1
110.0      1
98.0       1
102.0      1
121.0      1
89.0       1
109.0      1
Name: PM2.5, Length: 109, dtype: int64

## 選擇資料

### `.loc` : 以 Row/Column Label 為查詢基礎來取得資料


調用 `.loc` attribute 以獲得想要觀察的資料。

`.loc` 是以 row/column 的 **Label** 作為查詢的基礎，如果調用的對象是 DataFrame，則可以傳入兩個維度的 Slice 標記，分別為 Row labels 和 Column labels，如果是 Series 則僅傳入 Row labels 就可以了。最後依據查詢的範圍不同，可能回傳 DataFrame, Series 或單一 object。

傳入的 labels 如果是 Iterable（可迭代的）object，或是用 slice 的標記格式傳入一個區間，當然也可以是單一物件。

* 參考文件：
    * [pandas.DataFrame.loc](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.loc.html)
    * [pandas.Series.loc](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.loc.html)
    * [10 Minutes to pandas | Selection](https://pandas.pydata.org/pandas-docs/stable/10min.html#selection)
    * [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing)


> 備註：
>
> 與一般 Python 語法在操作 Slice 時稍有不同的地方是：`.loc()` 的 Slice 是包含上界 lebal 的。
>
> 例：`df.loc[3:5, :]` 會回傳 Index label 為 `3, 4, 5` 的 rows。
>
> 不過 index = `0` 的時候，使用負數的 index 則會無法顯示 index = `0` 的資料
>
> 例：`df.loc[-3:, :]` 無法改寫為 `df.loc[-3:0, :]`

In [28]:
df.loc['2017-01-01':'2017-01-02', 'CO':'NO']

Unnamed: 0_level_0,CO,NMHC,NO
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01 00:00:00,0.20,0.04,1.7
2017-01-01 01:00:00,0.19,0.05,1.9
2017-01-01 02:00:00,0.24,0.08,2.0
2017-01-01 03:00:00,0.24,0.11,1.4
2017-01-01 04:00:00,0.23,0.12,6.8
2017-01-01 05:00:00,0.37,0.13,8.4
2017-01-01 06:00:00,0.42,0.13,8.1
2017-01-01 07:00:00,0.51,0.19,15.0
2017-01-01 08:00:00,0.54,0.18,18.0
2017-01-01 09:00:00,0.53,0.16,11.0


In [29]:
# 以 df 物件（已將 datetime 設為 index labels）為例
# 取得所有 row 以及 column 的資料
# 以下操作均等價
df.loc[:]  # Column label 可以省略
# df.loc[:, :]

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01 00:00:00,20.0,2.0,0.20,0.04,1.7,7.5,9.2,25.0,67.0,18.0,,88.0,1.5,2.0,33.0,37.0,1.4,0.1
2017-01-01 01:00:00,20.0,2.2,0.19,0.05,1.9,7.9,9.8,18.0,52.0,14.0,,88.0,2.1,2.2,76.0,143.0,0.5,0.6
2017-01-01 02:00:00,19.0,2.2,0.24,0.08,2.0,9.3,11.0,13.0,59.0,17.0,,89.0,2.0,2.3,140.0,142.0,0.5,0.6
2017-01-01 03:00:00,19.0,2.4,0.24,0.11,1.4,9.5,11.0,8.5,53.0,24.0,,90.0,1.8,2.5,107.0,51.0,0.5,0.4
2017-01-01 04:00:00,18.0,3.4,0.23,0.12,6.8,11.0,17.0,1.8,37.0,25.0,,90.0,1.7,3.6,96.0,106.0,1.0,0.5
2017-01-01 05:00:00,17.0,3.6,0.37,0.13,8.4,12.0,20.0,2.0,57.0,28.0,,91.0,2.0,3.7,79.0,114.0,0.8,0.7
2017-01-01 06:00:00,18.0,3.3,0.42,0.13,8.1,12.0,20.0,1.9,41.0,33.0,,91.0,2.0,3.4,74.0,67.0,1.1,0.9
2017-01-01 07:00:00,18.0,2.8,0.51,0.19,15.0,14.0,29.0,2.9,60.0,36.0,,92.0,2.0,2.9,33.0,11.0,1.1,0.7
2017-01-01 08:00:00,19.0,2.5,0.54,0.18,18.0,14.0,32.0,5.0,50.0,41.0,,93.0,1.9,2.7,49.0,40.0,1.5,1.1
2017-01-01 09:00:00,20.0,2.2,0.53,0.16,11.0,15.0,27.0,13.0,74.0,46.0,,93.0,3.0,2.4,51.0,47.0,1.9,1.2


In [30]:
# 取得 PM2.5 (Column label: 'PM2.5')
# 在 2017-05-01 00:00:00 ~ 2017-05-01 05:00:00 的資料
# 以下操作雖不等價，但是回傳資料的意義相似
df.loc['2017-05-01 00:00:00':'2017-05-01 05:00:00', 'PM2.5']  # 回傳 Series
df.loc['2017-05-01 00:00:00':'2017-05-01 05:00:00', ('PM2.5')]  # 回傳 DataFrame
df.loc['2017-05-01 00:00:00':'2017-05-01 05:00:00', ['PM2.5']]  # 回傳 DataFrame

Unnamed: 0_level_0,PM2.5
datetime,Unnamed: 1_level_1
2017-05-01 00:00:00,49.0
2017-05-01 01:00:00,55.0
2017-05-01 02:00:00,58.0
2017-05-01 03:00:00,62.0
2017-05-01 04:00:00,61.0
2017-05-01 05:00:00,58.0


In [31]:
# 取得多個 column 在某一段區間的資料
# 例：取得臭氧、PM2.5、二氧化硫 (Column label: 'O3', 'PM2.5', 'SO2')
# 在 2017-05-01 00:00:00 ~ 2017-05-01 05:00:00 的資料
df.loc['2017-05-01 00:00:00':'2017-05-01 05:00:00', ['O3', 'PM2.5', 'SO2']]

Unnamed: 0_level_0,O3,PM2.5,SO2
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-05-01 00:00:00,46.0,49.0,2.2
2017-05-01 01:00:00,40.0,55.0,2.4
2017-05-01 02:00:00,30.0,58.0,2.5
2017-05-01 03:00:00,21.0,62.0,2.6
2017-05-01 04:00:00,11.0,61.0,1.7
2017-05-01 05:00:00,7.9,58.0,1.6


In [32]:
# 以 df_noindex 物件（仍未將 datetime 設為 index labels）為例
# 取得總碳氫合物、非甲烷碳氫化合物 (Column label: 'THC', 'NMHC') 在第 3~7 個 row 的資料
df_noindex.loc[3:7, ['THC', 'NMHC']]

Unnamed: 0,THC,NMHC
3,2.5,0.11
4,3.6,0.12
5,3.7,0.13
6,3.4,0.13
7,2.9,0.19


### `.iloc` : 以 Row/Column 的數字位置索引值為查詢基礎以取得資料


調用 `.iloc` attribute 以取得想要觀察的資料。

`.iloc` 是以 Row/Column 的「數字位置索引值 (Integer-location based)」為查詢基礎，如果調用的對象是 DataFrame，則最多可以傳入兩個維度的 Slice 標記，分別為 Row indexes 和 Column indexes，如果是 Series 則僅傳入 Row indexes 就可以了。最後一句查詢的範圍不同，可能回傳 DataFrame, Series 或單一 object。

傳入的 labels 如果是 Iterable（可迭代的）object，或是用 slice 的標記格式傳入一個區間，當然也可以是單一物件。

如果想要使用 Row/Column 中的某個 label 的數字位置索引值來搭配 `.iloc` 來查詢資料的話，可以配合 `.get_loc()` method 來查詢（只是有點多此一舉）。

* 參考文件：
    * [pandas.DataFrame.iloc](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.iloc.html)
    * [pandas.Series.iloc](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.iloc.html)
    * [10 Minutes to pandas | Selection](https://pandas.pydata.org/pandas-docs/stable/10min.html#selection)
    * [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing)
    * [pandas.Index.get_loc](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.get_loc.html)

In [33]:
# 取得資料集中前 5 個 rows 以及倒數 3 個 columns 的資料
# 以下兩種操作等價
df.iloc[0:5, :]
# df.iloc[:5, -3:]

Unnamed: 0_level_0,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-01-01 00:00:00,20.0,2.0,0.2,0.04,1.7,7.5,9.2,25.0,67.0,18.0,,88.0,1.5,2.0,33.0,37.0,1.4,0.1
2017-01-01 01:00:00,20.0,2.2,0.19,0.05,1.9,7.9,9.8,18.0,52.0,14.0,,88.0,2.1,2.2,76.0,143.0,0.5,0.6
2017-01-01 02:00:00,19.0,2.2,0.24,0.08,2.0,9.3,11.0,13.0,59.0,17.0,,89.0,2.0,2.3,140.0,142.0,0.5,0.6
2017-01-01 03:00:00,19.0,2.4,0.24,0.11,1.4,9.5,11.0,8.5,53.0,24.0,,90.0,1.8,2.5,107.0,51.0,0.5,0.4
2017-01-01 04:00:00,18.0,3.4,0.23,0.12,6.8,11.0,17.0,1.8,37.0,25.0,,90.0,1.7,3.6,96.0,106.0,1.0,0.5


In [34]:
# 同上的例子，但是我們這次改用 iloc 來執行
# 也就是取得 Row label = 2017-01-01 00:00:00 ~ 2017-01-01 04:00:00
# Column label = 'WIND_DIREC', 'WIND_SPEED', 'WS_HR' 的資料

In [35]:
# 先取得 'WIND_DIREC', 'WIND_SPEED', 'WS_HR' 的數字位置索引值
column_indexes = []
column_indexes.append(df.columns.get_loc('WIND_DIREC'))
column_indexes.append(df.columns.get_loc('WIND_SPEED'))
column_indexes.append(df.columns.get_loc('WS_HR'))
# 以上操作與底下等價
# column_indexes = [df.columns.get_loc(col) for col in ['WIND_DIREC', 'WIND_SPEED', 'WS_HR']]

In [36]:
# 再取得 2017-01-01 00:00:00 ~ 2017-01-01 04:00:00 的數字位置索引值
row_index_start = df.index.get_loc('2017-01-01 00:00:00')
row_index_end = df.index.get_loc('2017-01-01 04:00:00')

In [37]:
# 取得資料（想一想為何 row_index_end 要加一？）
df.iloc[row_index_start: row_index_end+1, column_indexes]

Unnamed: 0_level_0,WIND_DIREC,WIND_SPEED,WS_HR
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01 00:00:00,37.0,1.4,0.1
2017-01-01 01:00:00,143.0,0.5,0.6
2017-01-01 02:00:00,142.0,0.5,0.6
2017-01-01 03:00:00,51.0,0.5,0.4
2017-01-01 04:00:00,106.0,1.0,0.5
