In [1]:
import numpy as np
import pandas as pd
from numpy import nan as NA

# 7.1處理遺失資料

* pandas使用了浮點數值NaN(Not a Number)來代表遺失資料 -> 稱為標記值

In [3]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

* Python內建的None也會被當成NA(R 語言中的not available)

In [5]:
string_data[0] = None
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [6]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## NA處理方法
|參數|描述|
|:-:|:-:|
|dropna|以指定軸上的標籤為單位，過濾每個指定的標籤是否有遺失資料值，對於遺失資料的容忍有不同的限定值|
|fillna|將遺失資料已指定的值取代，或使用內插值方法如'ffill'或'bfill'|
|isnull|回傳布林值，指出目標是否為遺失值|
|notnull|isnull的反向函式|

## 過濾遺失值

* 對Series來說，dropna會回傳非遺失資料與他所屬的index

In [7]:
from numpy import nan as NA

In [4]:
data = pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [3]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [5]:
# 等校於上面寫法
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

* 對DataFrame來說，dropna預設只要列裡有缺失值就會濾除該列

In [6]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


* 指定how='all'的話，只會濾除全部為NA的列

In [9]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


* 傳入axis=1 -> 改為對欄做一樣的效果

In [11]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [13]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


* 過濾DataFrame時常會需要考慮時間序列資料，可以使用thresh參數指定

In [14]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,1.02642,0.076882,-0.139649
1,-0.689907,0.36006,1.731116
2,-0.579787,1.162763,-0.002905
3,1.095541,1.235842,0.605998
4,-0.722519,-2.509582,-0.599268
5,-2.683757,-0.21915,1.014237
6,-0.23387,-0.306056,1.333987


In [16]:
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,1.02642,,-0.139649
1,-0.689907,,1.731116
2,-0.579787,,-0.002905
3,1.095541,,0.605998
4,-0.722519,-2.509582,-0.599268
5,-2.683757,-0.21915,1.014237
6,-0.23387,-0.306056,1.333987


In [18]:
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,1.02642,,
1,-0.689907,,
2,-0.579787,,-0.002905
3,1.095541,,0.605998
4,-0.722519,-2.509582,-0.599268
5,-2.683757,-0.21915,1.014237
6,-0.23387,-0.306056,1.333987


In [19]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.722519,-2.509582,-0.599268
5,-2.683757,-0.21915,1.014237
6,-0.23387,-0.306056,1.333987


In [21]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.579787,,-0.002905
3,1.095541,,0.605998
4,-0.722519,-2.509582,-0.599268
5,-2.683757,-0.21915,1.014237
6,-0.23387,-0.306056,1.333987


## 為缺失值填值

* 可使用fillna將要用的常數取代值傳入

In [22]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.02642,0.0,0.0
1,-0.689907,0.0,0.0
2,-0.579787,0.0,-0.002905
3,1.095541,0.0,0.605998
4,-0.722519,-2.509582,-0.599268
5,-2.683757,-0.21915,1.014237
6,-0.23387,-0.306056,1.333987


* 可使用dict型態來指定不同欄填入不同值

In [32]:
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.56424,-0.0867,1.693999
1,-1.534388,0.347715,-0.77925
2,1.01933,0.5,1.590113
3,0.578254,0.5,-0.857108
4,1.09605,0.5,0.0
5,-1.86486,0.5,0.0


* fillna執行完後，會傳回一個物件，也可以使用in-place來修改原資料

In [24]:
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,1.02642,0.0,0.0
1,-0.689907,0.0,0.0
2,-0.579787,0.0,-0.002905
3,1.095541,0.0,0.605998
4,-0.722519,-2.509582,-0.599268
5,-2.683757,-0.21915,1.014237
6,-0.23387,-0.306056,1.333987


* 重作索引中的內插法，在fillna中也可使用

In [25]:
df = pd.DataFrame(np.random.randn(6, 3))
df

Unnamed: 0,0,1,2
0,0.56424,-0.0867,1.693999
1,-1.534388,0.347715,-0.77925
2,1.01933,-1.376318,1.590113
3,0.578254,2.259484,-0.857108
4,1.09605,0.104948,1.30614
5,-1.86486,-1.316316,0.372358


In [26]:
df.iloc[2:, 1] = NA
df

Unnamed: 0,0,1,2
0,0.56424,-0.0867,1.693999
1,-1.534388,0.347715,-0.77925
2,1.01933,,1.590113
3,0.578254,,-0.857108
4,1.09605,,1.30614
5,-1.86486,,0.372358


In [27]:
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.56424,-0.0867,1.693999
1,-1.534388,0.347715,-0.77925
2,1.01933,,1.590113
3,0.578254,,-0.857108
4,1.09605,,
5,-1.86486,,


In [28]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.56424,-0.0867,1.693999
1,-1.534388,0.347715,-0.77925
2,1.01933,0.347715,1.590113
3,0.578254,0.347715,-0.857108
4,1.09605,0.347715,-0.857108
5,-1.86486,0.347715,-0.857108


In [29]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.56424,-0.0867,1.693999
1,-1.534388,0.347715,-0.77925
2,1.01933,0.347715,1.590113
3,0.578254,0.347715,-0.857108
4,1.09605,,-0.857108
5,-1.86486,,-0.857108


* 可使用fillna將平均值或中位數傳到一個Series中

In [30]:
data = pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [31]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## fillna函式參數
|參數|描述|
|:-:|:-:|
|value|用來填充缺失值的常數或類dict物件|
|method|內插的方法，如果函式沒有使用其他參數的話，預設是'ffill'|
|axis|要填充的軸向，預設為axis=0|
|inplace|直接修改呼叫的物件，不產出複製資料|
|limit|向前或向後填充時，最大的填充數|

## 7.2資料轉換

## 移除重複值

In [33]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


* DataFrame的duplicated方法會回傳一個布林Series，用來指出每個列有沒有重複值(與前面的列相比)

In [34]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

* drop_duplicates -> 直接回傳DataFrame，內容是duplicated中標示為False的資料

In [35]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


* 上面兩種方法都會對全部資料做檢查
* 可在duplicated內中放入欄名稱來進行部分檢查

In [37]:
# 增加新的一欄
data['v1'] = range(7)

In [38]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [40]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


* duplicated和drop_duplicates預設會保留第一個看到的重複值，若設定keep='last' -> 會保留使用最後一個重複值

In [42]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 使用函示或對應關係轉換值

* 使用某些資料集合時，可能會需要使用某一欄的值，對於資料集合進行某種轉換

In [6]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 
                              'Pastrami', 'corned beef', 'Bacon', 
                              'pastrami', 'honey ham', 'nova lox'], 
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


* 接下來為這些肉品新增相對應的動物對照表

In [7]:
meat_to_animal = {'bacon': 'pig', 
                  'pulled pork': 'pig', 
                  'pastrami': 'cow', 
                  'corned beef': 'cow', 
                  'honey ham': 'pig', 
                  'nova lox': 'salmon'}

* 使用str.lower()將肉品名稱統一為小寫

In [8]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [9]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


* 只傳遞單一函數就可以完成所有工作的方法

In [11]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

## 取代值

In [12]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

* 使用pandas的replace函數將-999取代為NA值(除非指定in-place，不然會回傳一個新的Series)

In [13]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

* 如果需取代多個值，可以傳入一個list，後面再加上要替換的值

In [14]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

* 若是想對不同的目標值使用不同的替換值，也可以傳入替換值list

In [15]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

* 也可以使用dict型態傳入

In [16]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

* <font color=yellow>data.replace與data.str.replace不同，data.str.replace是做元素級的字串轉換</font>

## 更名軸index

* 與Series一樣，軸標籤也可以藉由函式或對應進行轉換，產出帶有新標籤的另一個新物件
* 你也可以in-place修改，不建立新的資料結構

In [18]:
data = pd.DataFrame(np.arange(12).reshape(3, 4), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


* 軸的index物件跟Series一樣，都有一個較map的方法

In [19]:
transform = lambda x: x[:4].upper()

In [20]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

* 可以對index給值，就是in-place修改同一個DataFrame物件

In [21]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


* 如果不想動到原來的東西，但想建立一個新轉換過的index的資料集合 -> 可以使用rename

In [22]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [23]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


* rename還可以搭配dict物件，取代部分軸標籤

In [24]:
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


* rename可以節省手動複製DataFrame和指定index和columns屬性的時間，若想in-place的修改一個資料集，就指定inplace=True

In [26]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## 離散化和分組

* 將連續性的資料作離散化或分組

In [2]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
ages

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

* 將年紀分作18-25，26-35，36-60，61以上 -> 可使用pandas中的cut

In [3]:
bins = [18, 25, 35, 60, 100]
bins

[18, 25, 35, 60, 100]

In [4]:
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

* 此方法會回傳一個特殊的Categorical物件，代表pandas.cut計算完後的分組 -> 可以當作分組名稱字串的陣列，內部是個Categorical陣列，陣列內容是各個分組名稱及ages資料的index

* 可在codes屬性忠看到categorical陣列

In [5]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [7]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [8]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

* pd.value_counts(cats)顯示pandas.cut分組以後每組數量
* 小括號代表該端open，中括號代表closed -> 可以藉由指定right=False來改變包含的一端

In [9]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

* 還可以把分組取名，只要將名稱list或陣列指定給labels選項

In [10]:
groups_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
groups_names

['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [11]:
pd.cut(ages, bins, labels=groups_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

* 如果傳給cut的不是分組區間，而是分組組數的話 -> 會用資料中最小值與最大值中間計算等距分組

* 建立一個平均分布的資料

In [3]:
data = np.random.rand(20)
data

array([0.83707387, 0.49701675, 0.70091061, 0.73469336, 0.71070398,
       0.30453048, 0.47482564, 0.61955273, 0.344471  , 0.28167099,
       0.30145561, 0.60209981, 0.86814477, 0.39660581, 0.57094237,
       0.16216915, 0.14683486, 0.65271775, 0.27888129, 0.53089894])

* 將資料切為4組，且取到小數下第二位(precision)

In [4]:
pd.cut(data, 4, precision=2)

[(0.69, 0.87], (0.33, 0.51], (0.69, 0.87], (0.69, 0.87], (0.69, 0.87], ..., (0.15, 0.33], (0.15, 0.33], (0.51, 0.69], (0.15, 0.33], (0.51, 0.69]]
Length: 20
Categories (4, interval[float64, right]): [(0.15, 0.33] < (0.33, 0.51] < (0.51, 0.69] < (0.69, 0.87]]

* qcut函式 -> 依照樣本數量分組 -> cut會因為資料分布不同而造成每組資料數量也不同 -> qcut切完後每組數量會一致

In [5]:
data = np.random.randn(1000)  # 常態分布
data

array([ 1.08374645e+00,  5.42387120e-01,  1.52236549e+00, -1.40396917e+00,
        3.79497567e-01, -2.25949222e+00, -1.01892399e+00, -1.59679635e+00,
        2.14781866e-01, -2.56516172e-01, -2.32680828e-01,  6.19626881e-01,
        7.33911209e-01,  1.49500222e+00, -1.90249110e-01, -1.95227598e+00,
        9.72818052e-01,  9.84324588e-02, -7.90833359e-02,  1.41347757e+00,
       -2.99909322e+00, -1.45105877e+00,  6.68546453e-01, -5.12179921e-01,
       -1.32211587e+00,  6.74745301e-02, -1.12031380e-01, -2.48946678e-01,
        1.70259909e-01,  1.73775982e+00,  2.03801789e+00, -1.28442056e+00,
       -7.62959330e-01,  2.59737387e-01, -4.42380443e-01,  1.27777644e+00,
        3.76529974e-02,  1.60188609e-01, -1.58082065e+00,  4.14036096e-01,
       -7.96881061e-01, -6.72768773e-01, -1.50639746e+00,  5.96331895e-01,
       -1.12524321e+00,  3.62407201e-01, -8.23528013e-01,  1.57459941e+00,
        8.17973597e-01,  1.48218328e+00,  9.95256244e-01, -8.16666400e-01,
        5.64017176e-01, -

In [6]:
cats = pd.qcut(data, 4)  # 切成4份
cats

[(0.662, 3.27], (0.0398, 0.662], (0.662, 3.27], (-3.004, -0.69], (0.0398, 0.662], ..., (0.662, 3.27], (0.662, 3.27], (0.662, 3.27], (-3.004, -0.69], (-3.004, -0.69]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.004, -0.69] < (-0.69, 0.0398] < (0.0398, 0.662] < (0.662, 3.27]]

In [7]:
pd.value_counts(cats)

(-3.004, -0.69]    250
(-0.69, 0.0398]    250
(0.0398, 0.662]    250
(0.662, 3.27]      250
dtype: int64

* 與cut一樣，qcut也可以自訂每組資料量(0~1之間的數值，包含1)

In [9]:
cats2 = pd.cut(data, [0, 0.1, 0.5, 0.9, 1.])
cats2

[NaN, (0.5, 0.9], NaN, NaN, (0.1, 0.5], ..., NaN, (0.5, 0.9], NaN, NaN, NaN]
Length: 1000
Categories (4, interval[float64, right]): [(0.0, 0.1] < (0.1, 0.5] < (0.5, 0.9] < (0.9, 1.0]]

In [10]:
pd.value_counts(cats2)

(0.1, 0.5]    167
(0.5, 0.9]    123
(0.0, 0.1]     36
(0.9, 1.0]     24
dtype: int64

* <font color=yellow>離散化函式對分位數和分群分析很有用</font>

## 偵測和濾除離群值

* 陣列上運算過濾和轉換離群值(transforming outlier)

In [45]:
data = pd.DataFrame(np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,0.881765,0.453094,1.064989,0.002880
1,0.360947,0.842893,-0.012153,1.099085
2,-0.869746,-0.428300,-1.084349,-0.249849
3,-1.792949,0.868327,1.124309,1.500495
4,0.807443,-1.207001,-1.740065,0.816218
...,...,...,...,...
995,1.029378,1.117399,-0.847885,0.617837
996,2.863468,2.223398,0.812480,0.791775
997,1.214432,1.500334,-2.181251,0.232210
998,-1.509985,-0.253552,0.280684,-0.366339


In [46]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.03392,0.005039,-0.065612,0.008139
std,1.00642,0.97713,0.999493,0.989325
min,-3.149599,-3.76834,-3.509485,-3.480049
25%,-0.718333,-0.639726,-0.735754,-0.658675
50%,0.044885,-0.026683,-0.051145,0.022187
75%,0.662671,0.646198,0.635769,0.679859
max,2.891698,3.21604,3.271069,2.90658


* 找出指定欄位中，絕對值大於3的值

In [47]:
col = data[2]
col

0      1.064989
1     -0.012153
2     -1.084349
3      1.124309
4     -1.740065
         ...   
995   -0.847885
996    0.812480
997   -2.181251
998    0.280684
999   -0.545277
Name: 2, Length: 1000, dtype: float64

In [48]:
col[np.abs(col) > 3]

471   -3.509485
600   -3.481369
690    3.271069
737    3.025800
Name: 2, dtype: float64

* 在全部欄中，選擇值超過 3 或 -3 的列的話，可以呼叫布林DataFrame的any方法

In [49]:
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
157,-1.212701,0.269279,0.520109,-3.480049
238,-3.149599,-1.671241,-0.928934,0.227863
471,0.432228,1.545776,-3.509485,1.131292
508,0.720314,3.21604,-1.167339,0.215136
600,1.065337,1.882669,-3.481369,0.250314
690,-0.718443,1.58216,3.271069,1.068597
737,0.79412,0.622819,3.0258,-0.992503
911,-0.193864,-3.76834,-0.325176,-0.320907


In [50]:
# test = np.abs(data) > 3
# test -> 可得出布林陣列mask
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
157,-1.212701,0.269279,0.520109,-3.480049
238,-3.149599,-1.671241,-0.928934,0.227863
471,0.432228,1.545776,-3.509485,1.131292
508,0.720314,3.21604,-1.167339,0.215136
600,1.065337,1.882669,-3.481369,0.250314
690,-0.718443,1.58216,3.271069,1.068597
737,0.79412,0.622819,3.0258,-0.992503
911,-0.193864,-3.76834,-0.325176,-0.320907


* 這些條件也可以拿來當作給值的部分
* 嘗試把值保持在3 ~ -3之間

In [51]:
data[(np.abs(data)) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.033771,0.005591,-0.064918,0.008619
std,1.005967,0.97377,0.995377,0.987746
min,-3.0,-3.0,-3.0,-3.0
25%,-0.718333,-0.639726,-0.735754,-0.658675
50%,0.044885,-0.026683,-0.051145,0.022187
75%,0.662671,0.646198,0.635769,0.679859
max,2.891698,3.0,3.0,2.90658


* np.sign(data)會視data中的值是正數或是負數，產生 1 或 -1 

In [52]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,1.0
1,1.0,1.0,-1.0,1.0
2,-1.0,-1.0,-1.0,-1.0
3,-1.0,1.0,1.0,1.0
4,1.0,-1.0,-1.0,1.0


In [63]:
mask = np.abs(data) > 3
mask

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


## 排列與隨機取樣

* 若要對Series或DataFrame中的列作隨機排列(隨機重新排序)，只要使用numpy.random.permutation函式即可
* 呼叫permutation時，指定你想排列的軸的長度，即可得到內還新的次序的陣列

In [2]:
df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [3]:
sampler = np.random.permutation(5)
sampler

array([2, 4, 1, 0, 3])

* sampler可以用於iloc為基礎的索引，或等效的take函式裡

In [4]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [5]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
1,4,5,6,7
0,0,1,2,3
3,12,13,14,15


* 若要隨機選擇不重複子集的話，就使用Series或DataFrame的sample方法

In [6]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
1,4,5,6,7


* 若要隨機選擇可重複子集的話，就在sample方法中指定replace=True

In [7]:
choices = pd.Series([5, 7, -1, 6, 4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [8]:
draws = choices.sample(n=10, replace=True)
draws

4    4
1    7
3    6
1    7
1    7
0    5
1    7
1    7
0    5
1    7
dtype: int64

## 指標/虛擬變數

* 將一個類別變數，轉換為"虛擬(dummy)"或"指標(indicator)"陣列

* 假設DataFrame中的某一欄裡有k個相異值，可以創造出一個擁有k欄，內容為1或0的矩陣或DataFrame
* * pandas中有一個get_dummies函式可以達成上述要求

In [2]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [4]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


* 可以為DataFrame的欄位加上前綴，這個動作可以直接在get_dummies中完成

In [5]:
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [7]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


* 如果dataFrame中的一列同時屬於多個類別，則會較複雜

In [3]:
movie_names = ['movie_id', 'title', 'genres']
movie_names

['movie_id', 'title', 'genres']

In [8]:
path = r'D:\Python\Python 資料分析\範例資料集\datasets\movielens\movies.dat'
movies = pd.read_table(path, sep='::', header=None, names=movie_names, encoding='unicode_escape')
movies.head()

  return func(*args, **kwargs)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


* 為每一種電影做分類(genre)

In [9]:
all_genres = []

for x in movies.genres:
    all_genres.extend(x.split('|'))

all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [11]:
genres = pd.unique(all_genres)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

* 一種建構指標DataFrame的方法是從填充0的DataFrame開始

In [18]:
zero_matrix = np.zeros((len(movies), len(genres)))
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* 接下來從頭疊代過每一部電影，並在電影出現的dummies的列標記上1(在對應的分類上)
* 要做到這一點，需先得到分類的index -> 可使用dummies.columns來計算欄位的index

In [25]:
gen = movies.genres[0]
gen.split('|')

['Animation', "Children's", 'Comedy']

In [26]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

* 接下來使用.iloc配合上述的index便可設定值了

In [28]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1  # iloc[row, column]

* 最後再把結果與movies合併

In [31]:
movie_windic = movies.join(dummies.add_prefix('Genre_'))
movie_windic.head()

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
movie_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                              1.0
Genre_Children's                             1.0
Genre_Comedy                                 1.0
Genre_Adventure                              0.0
Genre_Fantasy                                0.0
Genre_Romance                                0.0
Genre_Drama                                  0.0
Genre_Action                                 0.0
Genre_Crime                                  0.0
Genre_Thriller                               0.0
Genre_Horror                                 0.0
Genre_Sci-Fi                                 0.0
Genre_Documentary                            0.0
Genre_War                                    0.0
Genre_Musical                                0.0
Genre_Mystery                                0.0
Genre_Film-Noir                              0.0
Genre_Western       

* 如果是處理巨型資料的話，這個建構多項目指標變數的方法速度就會變慢 -> 可以直接寫到Numpy陣列的低階函式，再轉回一個DataFrame

* 如果是做統計應用，一個好用的用途是將get_dummies和離散函式(像cut這種)結合起來應用

* 使用numpy.random.seed設定隨機種子 -> 固定住範例中的隨機值

In [36]:
np.random.seed(12345)

In [37]:
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [38]:
bins = [0., 0.2, 0.4, 0.6, 0.8, 1]
bins

[0.0, 0.2, 0.4, 0.6, 0.8, 1]

In [39]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0
