# 12. NumPy 高級應用

In [2]:
%pylab inline
from pandas import Series, DataFrame
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


### ndarray物件的結構
ndarray的組成:
- 一個指向數組的 指標
- 一個 dtype
- 一個 shape
- 一個 strides 元組（每個維度一個整數，表示沿該軸前進一個元素需要跨過的位元組數），例如三維數組的 (160, 40, 8)

In [12]:
# strides
np.ones((3, 4, 6), dtype = np.float64).strides

(192, 48, 8)

### NumPy數據類型體系
用 .issubdtype()可以判斷某一dtype是否屬於某大類
用 .mro() 可以列出某dtype的父類別

In [13]:
np.int32.mro()

[numpy.int32,
 numpy.signedinteger,
 numpy.integer,
 numpy.number,
 numpy.generic,
 object]

## 高級數組操作
### 數組重塑(reshape)

In [6]:
# 扁平化 : 將多維的 ndarray reshape成單一維度
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [16]:
# 使用 ravel()將之扁平化
arr.ravel() 

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [20]:
# 使用 flatten()將之扁平化，會傳回副本
arr.flatten() 

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C和Fortran順序
- Row優先(又稱 C順序, 預設): 每row中的數據是被存放在記憶體中相鄰的位置
- Column優先(又稱 F順序): 每column中的數據是被存放在記憶體中相鄰的位置

In [24]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [25]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [38]:
arr.ravel(order = 'C')

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [39]:
arr.ravel(order = 'F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### 數組的合併與拆分

In [33]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr1

array([[1, 2, 3],
       [4, 5, 6]])

In [34]:
arr2 = arr1 + 6
arr2

array([[ 7,  8,  9],
       [10, 11, 12]])

#### 合併

In [36]:
# np.concatenate() 可以將數組合併與連接
np.concatenate([arr1, arr2], axis = 0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [37]:
np.concatenate([arr1, arr2], axis = 1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [40]:
# vstack()，等同於 np.concatenate([], axis = 0)
np.vstack([arr1, arr2])

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [42]:
# hstack()，等同於 np.concatenate([], axis = 1)
np.hstack([arr1, arr2])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

#### 拆分

In [44]:
arr = randn(5, 2)
arr

array([[ 0.26126207,  0.04032701],
       [-0.0203294 , -0.08815528],
       [ 0.02571363, -1.85795482],
       [ 0.13715665, -0.66855806],
       [-1.5213498 , -1.84773959]])

In [52]:
# np.split() cuts an array into pieces at the given indices along the first axis:
# here the pieces are rows [0:1], [1:3] and [3:] of arr.
first, second, third = np.split(arr, [1, 3])

In [53]:
first

array([[ 0.26126207,  0.04032701]])

In [54]:
second

array([[-0.0203294 , -0.08815528],
       [ 0.02571363, -1.85795482]])

In [55]:
third

array([[ 0.13715665, -0.66855806],
       [-1.5213498 , -1.84773959]])

#### 堆疊輔助類別: r\_ , c\_

In [58]:
arr = np.arange(6)
arr

array([0, 1, 2, 3, 4, 5])

In [60]:
arr1 = arr.reshape((3, 2))
arr1

array([[0, 1],
       [2, 3],
       [4, 5]])

In [61]:
arr2 = randn(3, 2)
arr2

array([[-0.24006995,  0.28522685],
       [-0.07091696,  1.38246482],
       [-0.01545152,  0.34472234]])

In [62]:
# vstack()
np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [-0.24006995,  0.28522685],
       [-0.07091696,  1.38246482],
       [-0.01545152,  0.34472234]])

In [63]:
# hstack()
np.c_[arr1, arr2]

array([[ 0.        ,  1.        , -0.24006995,  0.28522685],
       [ 2.        ,  3.        , -0.07091696,  1.38246482],
       [ 4.        ,  5.        , -0.01545152,  0.34472234]])

In [64]:
# nest
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [-0.24006995,  0.28522685,  3.        ],
       [-0.07091696,  1.38246482,  4.        ],
       [-0.01545152,  0.34472234,  5.        ]])

In [65]:
# 可以將切片翻譯成數組
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### 元素的重複操作: tile, repeat
- repeat 會將數組中的各個元素重複一定的次數，而產生一個更大的數組
- tile 會沿著指定的軸向堆疊數組的副本，像是在 鋪磁磚

#### repeat()

In [66]:
# repeat()
arr = np.arange(3)
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [67]:
# 如果傳入的是一組整數，則各個元素可以重複不同的次數
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [74]:
arr = rand(2, 2)
arr

array([[ 0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ]])

In [76]:
# 可以指定重複的軸向
arr.repeat(2, axis = 0)

array([[ 0.83130354,  0.41893976],
       [ 0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ],
       [ 0.51981829,  0.241909  ]])

In [77]:
# 可以指定重複的軸向
arr.repeat(2, axis = 1)

array([[ 0.83130354,  0.83130354,  0.41893976,  0.41893976],
       [ 0.51981829,  0.51981829,  0.241909  ,  0.241909  ]])

In [80]:
# 不等同於 np.hstack([arr, arr])
np.hstack([arr, arr])

array([[ 0.83130354,  0.41893976,  0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ,  0.51981829,  0.241909  ]])

In [81]:
# 如果沒有指定軸向，則數組會被扁平化
arr.repeat(2)

array([ 0.83130354,  0.83130354,  0.41893976,  0.41893976,  0.51981829,
        0.51981829,  0.241909  ,  0.241909  ])

In [85]:
arr

array([[ 0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ]])

In [86]:
# 對多維進行重複時，也可傳入數組，對不同的元素指定不同的重複次數
arr.repeat([2, 3], axis = 0)

array([[ 0.83130354,  0.41893976],
       [ 0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ],
       [ 0.51981829,  0.241909  ],
       [ 0.51981829,  0.241909  ]])

#### tile()

In [87]:
arr

array([[ 0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ]])

In [88]:
# tile()
np.tile(arr, 2)

array([[ 0.83130354,  0.41893976,  0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ,  0.51981829,  0.241909  ]])

In [89]:
# 等同於 np.hstack([arr, arr])
np.hstack([arr, arr])

array([[ 0.83130354,  0.41893976,  0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ,  0.51981829,  0.241909  ]])

In [90]:
# 但是 tile()可以接受一個元組作為參數，代表 鋪磁磚 的布局
np.tile(arr, (2, 1))

array([[ 0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ],
       [ 0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ]])

In [91]:
np.tile(arr, (3, 2))

array([[ 0.83130354,  0.41893976,  0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ,  0.51981829,  0.241909  ],
       [ 0.83130354,  0.41893976,  0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ,  0.51981829,  0.241909  ],
       [ 0.83130354,  0.41893976,  0.83130354,  0.41893976],
       [ 0.51981829,  0.241909  ,  0.51981829,  0.241909  ]])

### 花式索引的等價函數: take, put

In [93]:
arr = np.arange(10) * 100
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

In [94]:
inds = [7, 1, 2, 6]

In [120]:
arr[inds]

array([700, 100, 200, 600])

#### take

In [119]:
arr = np.arange(10) * 100
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

In [111]:
# 用 take() 取用索引來選擇元素
# 等於是花式索引，但是效能好很多
arr.take(inds)

array([700, 100, 200, 600])

In [113]:
# 用 put()來對選擇的元素賦值
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [116]:
# put() also accepts a sequence of values, assigned to the indices in order
# (inds[0] receives 40, inds[1] receives 41, ...); when more values than
# indices are supplied, the surplus values (here 44 and 45) are not used.
arr.put(inds, [40, 41, 42, 43, 44, 45])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [117]:
# 要在其他軸上使用take，只需要指定軸向
arr = randn(2, 4)
arr

array([[-0.61982919,  0.96155205, -1.07420073,  0.06601651],
       [-2.47687067, -0.22656096, -0.20840595,  0.2828595 ]])

In [118]:
inds = [2, 0, 2, 1]

# 指定軸向，等於是重排內容
arr.take(inds, axis = 1)

array([[-1.07420073, -0.61982919, -1.07420073,  0.96155205],
       [-0.20840595, -2.47687067, -0.20840595, -0.22656096]])

## 廣播(broadcasting)

In [121]:
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [122]:
arr * 4
# "*4" 被廣播到每一個元素上

array([ 0,  4,  8, 12, 16])

In [127]:
arr = randn(4, 3)
arr

array([[-0.9653332 ,  0.17769332,  1.029921  ],
       [ 0.23625474,  1.71597799,  1.10146865],
       [-1.24837143, -0.03642103,  0.47223691],
       [-0.41621007, -2.64896373,  2.24753557]])

In [130]:
arr.mean(axis = 0)

array([-0.59841499, -0.19792836,  1.21279053])

In [131]:
# "- arr.mean(axis = 0)" 會被沿著 axis = 0 廣播
demeaned = arr - arr.mean(axis = 0)
demeaned

array([[-0.36691821,  0.37562168, -0.18286953],
       [ 0.83466973,  1.91390635, -0.11132188],
       [-0.64995644,  0.16150734, -0.74055362],
       [ 0.18220492, -2.45103537,  1.03474504]])

In [132]:
demeaned.mean(0)

array([ -4.16333634e-17,   0.00000000e+00,  -5.55111512e-17])

### 廣播的原則
如果兩個數組的後緣維度(trailing dimension，即是從末尾開始算起的維度)的軸長度相符或者其中一方的長度為 1 ，則認為它們是廣播兼容的。

廣播會在缺失和(或)長度為 1 的維度上進行

### 沿著其他軸向廣播
三維中的任何一維上廣播其實也就是將數據重塑為兼容的形狀而已

In [3]:
arr = np.zeros((4, 4))
arr.shape

(4, 4)

In [4]:
# newaxis屬性，用來插入一個新的軸
arr3d = arr[:, np.newaxis, :]

arr3d.shape

(4, 1, 4)

In [6]:
arr_1d = np.random.normal(size = 3)
arr_1d

array([ 0.039628  ,  0.63199028, -0.79764943])

In [8]:
# 增加軸 1，原本一維變成二維
arr_1d[:, np.newaxis]

array([[ 0.039628  ],
       [ 0.63199028],
       [-0.79764943]])

In [9]:
arr_1d[np.newaxis, :]

array([[ 0.039628  ,  0.63199028, -0.79764943]])

In [10]:
# 對一個三維數組 的軸2進行距平化
arr = randn(3, 4, 5)
arr

array([[[-1.26522492,  0.15513924, -0.08452234, -0.04353667,  0.60140275],
        [-0.09602717,  0.03139654, -0.04969375, -0.12531739,  0.54124433],
        [-1.44771967, -1.76079722, -1.3116039 ,  0.93001881,  0.76231292],
        [ 1.37154294,  0.01655661,  1.09416501,  0.31605779, -0.73727189]],

       [[-0.32059421, -1.20678059,  0.51362787,  0.90801217, -0.4029745 ],
        [-1.20733064, -1.49627164,  0.4975447 ,  0.45010609, -1.64971073],
        [ 1.14783968, -1.03635261, -0.15542985,  2.35082007,  0.24047841],
        [ 1.68379421, -0.90908505, -0.58478263,  0.31767176, -0.73268   ]],

       [[ 0.37928344,  1.1022068 ,  2.69290533, -1.37413896,  1.08390446],
        [ 0.78995632,  0.63669373,  0.67888799, -0.36617745, -0.90201314],
        [-1.63266277, -2.54877006, -0.48127643, -1.55593795, -0.99532979],
        [ 0.08115098,  1.8471723 , -0.44499786, -0.33273861,  0.34976455]]])

In [11]:
depth_means = arr.mean(2)
depth_means

array([[-0.12734839,  0.06032051, -0.56555781,  0.41221009],
       [-0.10174185, -0.68113244,  0.50947114, -0.04501634],
       [ 0.77683221,  0.16746949, -1.4427954 ,  0.30007027]])

In [12]:
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned

array([[[-1.13787654,  0.28248763,  0.04282605,  0.08381172,  0.72875114],
        [-0.15634768, -0.02892397, -0.11001426, -0.18563791,  0.48092382],
        [-0.88216185, -1.19523941, -0.74604609,  1.49557663,  1.32787073],
        [ 0.95933285, -0.39565348,  0.68195492, -0.0961523 , -1.14948199]],

       [[-0.21885236, -1.10503874,  0.61536972,  1.00975402, -0.30123265],
        [-0.52619819, -0.8151392 ,  1.17867714,  1.13123853, -0.96857829],
        [ 0.63836854, -1.54582375, -0.66490099,  1.84134893, -0.26899273],
        [ 1.72881055, -0.86406871, -0.53976629,  0.3626881 , -0.68766366]],

       [[-0.39754877,  0.32537458,  1.91607312, -2.15097118,  0.30707225],
        [ 0.62248683,  0.46922424,  0.5114185 , -0.53364694, -1.06948263],
        [-0.18986737, -1.10597466,  0.96151897, -0.11314255,  0.44746561],
        [-0.21891929,  1.54710203, -0.74506813, -0.63280888,  0.04969428]]])

In [13]:
demeaned.mean(2)

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         -4.44089210e-17],
       [ -2.22044605e-17,  -8.88178420e-17,   0.00000000e+00,
          2.22044605e-17],
       [ -6.66133815e-17,   8.88178420e-17,   6.66133815e-17,
          1.11022302e-17]])

### 透過廣播設置數組的值
算術運算所遵循的廣播原則，也適用於透過索引設置數組值的操作

In [21]:
arr = np.zeros((4, 3))
arr

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [26]:
# arr[:] 索引的方式 代表整個數組
arr[:] = 5
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])

In [29]:
# 用一個 1D的數組 來設置目標數組的各列
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [31]:
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## ufunc高級應用
### ufunc實例方法

#### reduce() 接受一數組參數，並通過一系列的二元運算對其值做 聚合(求和)運算

In [43]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [36]:
# reduce 接受一數組參數，並通過一系列的二元運算對其值做 聚合(求和)運算
# np.add.reduce() 表示 是用 加法 作為聚合運算
np.add.reduce(arr)

45

In [44]:
arr.sum()

45

#### 檢查各 row中的數值是否是 有序的

In [73]:
# 檢查各 row中的數值是否是 有序的
arr = randn(5, 5)
arr

array([[-0.21832882, -1.03991441,  0.98796245, -0.85707075, -1.56620359],
       [ 1.09891764, -0.70546392,  1.93178763, -0.52056423,  0.32153182],
       [-0.24678413, -0.53722539, -0.62494246,  0.78002887, -0.86094834],
       [-0.64625533,  1.90411601,  0.12936838,  1.39355659, -0.9331232 ],
       [-0.56497943,  0.32258384,  1.92572461,  1.14654712,  0.64586667]])

In [74]:
arr[::2]

array([[-0.21832882, -1.03991441,  0.98796245, -0.85707075, -1.56620359],
       [-0.24678413, -0.53722539, -0.62494246,  0.78002887, -0.86094834],
       [-0.56497943,  0.32258384,  1.92572461,  1.14654712,  0.64586667]])

In [75]:
# Sort the values inside every other row (rows 0, 2, 4) in place, along axis 1.
arr[::2].sort(axis = 1) 
# The remaining rows are deliberately left unsorted so that the example
# contains both ordered and unordered rows to check.

In [76]:
arr

array([[-1.56620359, -1.03991441, -0.85707075, -0.21832882,  0.98796245],
       [ 1.09891764, -0.70546392,  1.93178763, -0.52056423,  0.32153182],
       [-0.86094834, -0.62494246, -0.53722539, -0.24678413,  0.78002887],
       [-0.64625533,  1.90411601,  0.12936838,  1.39355659, -0.9331232 ],
       [-0.56497943,  0.32258384,  0.64586667,  1.14654712,  1.92572461]])

In [77]:
arr[:, :-1] 

array([[-1.56620359, -1.03991441, -0.85707075, -0.21832882],
       [ 1.09891764, -0.70546392,  1.93178763, -0.52056423],
       [-0.86094834, -0.62494246, -0.53722539, -0.24678413],
       [-0.64625533,  1.90411601,  0.12936838,  1.39355659],
       [-0.56497943,  0.32258384,  0.64586667,  1.14654712]])

In [78]:
arr[:, 1:]

array([[-1.03991441, -0.85707075, -0.21832882,  0.98796245],
       [-0.70546392,  1.93178763, -0.52056423,  0.32153182],
       [-0.62494246, -0.53722539, -0.24678413,  0.78002887],
       [ 1.90411601,  0.12936838,  1.39355659, -0.9331232 ],
       [ 0.32258384,  0.64586667,  1.14654712,  1.92572461]])

In [79]:
# 檢查各 row中的數值是否是 有序的
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [False,  True, False,  True],
       [ True,  True,  True,  True],
       [ True, False,  True, False],
       [ True,  True,  True,  True]], dtype=bool)

#### np.logical_and.reduce()

In [81]:
# axis = 1 的方向，去判斷 每 row 做 AND的結果
# np.logical_and.reduce() 表示 是用 AND 作為聚合運算
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis = 1)

array([ True, False,  True, False,  True], dtype=bool)

In [82]:
# np.array 物件有 all()方法可用，效果一樣
(arr[:, :-1] < arr[:, 1:]).all(axis = 1)

array([ True, False,  True, False,  True], dtype=bool)

#### np.add.accumulate()
累計

In [83]:
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [84]:
np.add.accumulate(arr, axis = 1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]], dtype=int32)

#### np.multiply.outer()
交叉乘積

In [85]:
arr = np.arange(3).repeat([1, 2, 3])
arr

array([0, 1, 1, 2, 2, 2])

In [87]:
arr2 = np.arange(5)
arr2

array([0, 1, 2, 3, 4])

In [88]:
np.multiply.outer(arr, arr2)

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

#### np.subtract.outer()
交叉減法

In [90]:
np.subtract.outer(arr, arr2)

array([[ 0, -1, -2, -3, -4],
       [ 1,  0, -1, -2, -3],
       [ 1,  0, -1, -2, -3],
       [ 2,  1,  0, -1, -2],
       [ 2,  1,  0, -1, -2],
       [ 2,  1,  0, -1, -2]])

#### reduceat()
用於計算 局部約簡，就是對數據各切片進行聚合的 groupby 運算。

In [91]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [92]:
np.add.reduceat(arr, [0, 5, 8])
# 就是在 arr[0:5], arr[5:8], arr[8:] 進行 reduce

array([10, 18, 17], dtype=int32)

In [93]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

In [94]:
np.add.reduceat(arr, [0, 2, 4], axis = 1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]], dtype=int32)

### 自定義 ufunc
numpy.frompyfunc()接受一個Python函數以及兩個分別表示輸入輸出參數數量的整數

返回一個自定義的 ufunc函數，此ufunc返回的一定是由 Python物件 組成的數組 (dtype = object)

In [110]:
def add_elements(x, y):
    """Return ``x + y``.

    A plain Python binary function; whatever the ``+`` operator does for the
    two operands (numeric addition, sequence concatenation, ...) is what is
    returned.
    """
    total = x + y
    return total

add_them = np.frompyfunc(add_elements, 2, 1)

add_them(np.arange(10), np.arange(10))

array([0, 2, 4, 6, 8, 10, 12, 14, 16, 18], dtype=object)

numpy.vectorize()接受一個Python函數以及一個 otypes 參數，定義返回的資料型態

In [115]:
add_them = np.vectorize(add_elements, otypes = [np.float64])
add_them(np.arange(10), np.arange(10))

array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.])

## 結構化和紀錄式數組
 用元組列表 自定義 結構化 dtype

In [21]:
# 用元組列表 自定義 結構化 dtype
my_dtype = [('x', np.float64), ('y', np.int32)]
# dtype=[('x', '<f8'), ('y', '<i4')]

In [22]:
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype = my_dtype)
sarr

array([(1.5, 6), (3.141592653589793, -2)], 
      dtype=[('x', '<f8'), ('y', '<i4')])

In [23]:
sarr[0]

(1.5, 6)

In [25]:
sarr[1]

(3.141592653589793, -2)

In [24]:
type(sarr[0])

numpy.void

In [26]:
type(sarr)

numpy.ndarray

In [13]:
sarr[0]['y']

6

In [27]:
sarr = np.array([(1.5, 6), (np.pi, -2), (3.2, 8), (np.pi * 2 , -7)], dtype = my_dtype)
sarr

array([(1.5, 6), (3.141592653589793, -2), (3.2, 8), (6.283185307179586, -7)], 
      dtype=[('x', '<f8'), ('y', '<i4')])

In [28]:
sarr['x']

array([ 1.5       ,  3.14159265,  3.2       ,  6.28318531])

In [29]:
sarr['y']

array([ 6, -2,  8, -7])

In [31]:
sarr[2]

(3.2, 8)

### 嵌套dtype和多維字段
ndarray只能儲存同質的資料，但是可以用自訂的 dtype來擴增能力

In [36]:
my_dtype = [('x', np.float64, 3), ('y', np.int32)]
# 'x'字段有 3個浮點數
# 'y'字段有 1個整數

In [37]:
arr = np.zeros(4, dtype = my_dtype)
arr

array([([0.0, 0.0, 0.0], 0), ([0.0, 0.0, 0.0], 0), ([0.0, 0.0, 0.0], 0),
       ([0.0, 0.0, 0.0], 0)], 
      dtype=[('x', '<f8', (3,)), ('y', '<i4')])

In [38]:
arr[0]['x']

array([ 0.,  0.,  0.])

In [39]:
arr[0]['y']

0

In [40]:
# 'x'字段可以得到一個二維數組
arr['x']

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

#### 嵌套dtype

In [43]:
# Nested structured dtype: field 'x' is itself a record with sub-fields
# 'a' (float64) and 'b' (float32); field 'y' is a plain int32.
my_dtype = [
    ('x', [('a', np.float64), ('b', np.float32)]),
    ('y', np.int32),
]
# Each record is written as ((a, b), y) to mirror the nesting of the dtype.
arr = np.array(
    [((1, 2), 5), ((3, 4), 6)],
    dtype=my_dtype,
)
arr

array([((1.0, 2.0), 5), ((3.0, 4.0), 6)], 
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])

In [44]:
arr['x']

array([(1.0, 2.0), (3.0, 4.0)], 
      dtype=[('a', '<f8'), ('b', '<f4')])

In [45]:
arr['y']

array([5, 6])

In [46]:
arr['x']['a']

array([ 1.,  3.])

In [49]:
arr['x']['a'].dtype

dtype('float64')

### 為什麼要用結構化數組
因為屬於 固定字節數的 固定長度資料結構，可以提供非常快速高效的存取
### 結構化數組操作: numpy.lib.recfunctions

## 更多有關排序的話題

In [63]:
arr = randn(6)
arr

array([ 1.07223164, -0.49384651,  1.74781661, -0.11494868, -1.30397657,
        0.81905739])

#### ndarray.sort()

In [64]:
# ndarray.sort() 會就地排序，不會產生新的數組
arr.sort()
arr

array([-1.30397657, -0.49384651, -0.11494868,  0.81905739,  1.07223164,
        1.74781661])

In [78]:
arr = randn(3, 5)
arr

array([[ 0.50765667,  0.13053376, -0.49621553, -1.78188096, -0.45861744],
       [-0.56842391, -0.26221506,  1.20457985,  1.94964405,  0.73438759],
       [ 0.58482429, -1.42624548,  1.06026055, -0.2903431 , -2.85091262]])

In [66]:
# 只針對第一個 column 排序
arr[:, 0].sort()
arr

array([[-2.0048596 , -0.16599244,  0.81818098,  1.08224962, -0.37496186],
       [-1.46771527, -0.93579505,  0.61603112,  1.84495084, -0.59733691],
       [ 0.8200751 ,  0.12321585, -1.69390146,  1.24965976,  1.40237377]])

#### numpy.sort() 會傳回一個副本

In [94]:
arr = randn(3, 5)
arr

array([[ 0.81974354, -1.41690087, -1.07272532,  0.76264259, -0.31023086],
       [-0.94848597, -0.91820321,  2.4823545 , -2.6375055 ,  0.67283374],
       [-1.28933705,  1.27322122, -0.7687702 , -0.71963699,  1.71392226]])

In [95]:
# numpy.sort() 會傳回一個副本
np.sort(arr)

array([[-1.41690087, -1.07272532, -0.31023086,  0.76264259,  0.81974354],
       [-2.6375055 , -0.94848597, -0.91820321,  0.67283374,  2.4823545 ],
       [-1.28933705, -0.7687702 , -0.71963699,  1.27322122,  1.71392226]])

In [104]:
arr
# arr並沒有改變

array([[-2.6375055 , -1.41690087, -1.28933705, -1.07272532, -0.94848597],
       [-0.91820321, -0.7687702 , -0.71963699, -0.31023086,  0.67283374],
       [ 0.76264259,  0.81974354,  1.27322122,  1.71392226,  2.4823545 ]])

In [105]:
# An explicit axis can be passed; np.sort defaults to axis=-1 (the last axis).
# axis = 0 is used here, i.e. the values are sorted within each column.
np.sort(arr, axis = 0)

array([[-2.6375055 , -1.41690087, -1.28933705, -1.07272532, -0.94848597],
       [-0.91820321, -0.7687702 , -0.71963699, -0.31023086,  0.67283374],
       [ 0.76264259,  0.81974354,  1.27322122,  1.71392226,  2.4823545 ]])

In [106]:
# 反序
arr[:, :: -1]

array([[-0.94848597, -1.07272532, -1.28933705, -1.41690087, -2.6375055 ],
       [ 0.67283374, -0.31023086, -0.71963699, -0.7687702 , -0.91820321],
       [ 2.4823545 ,  1.71392226,  1.27322122,  0.81974354,  0.76264259]])

In [101]:
# Sort all elements of arr globally (not row by row) while keeping its shape.
# np.sort(..., axis=None) returns a sorted, flattened copy; assigning it back
# through arr[...] updates arr in place.  This does not depend on reshape()
# returning a view of arr (the old reshape(1, -1).sort() silently did nothing
# whenever reshape had to copy), and it drops the former `arr.reshape(shape)`
# statement, whose result was discarded.
shape = arr.shape
arr[...] = np.sort(arr, axis=None).reshape(shape)
arr

array([[-2.6375055 , -1.41690087, -1.28933705, -1.07272532, -0.94848597],
       [-0.91820321, -0.7687702 , -0.71963699, -0.31023086,  0.67283374],
       [ 0.76264259,  0.81974354,  1.27322122,  1.71392226,  2.4823545 ]])

### 間接排序: argsort,  lexsort

In [118]:
values = np.array([5, 0, 1, 3, 2])
values

array([5, 0, 1, 3, 2])

#### argsort() 傳回用來排序的索引

In [119]:
# argsort() 傳回用來排序的索引
indexer = values.argsort()
indexer

array([1, 2, 4, 3, 0], dtype=int64)

In [120]:
values[indexer]

array([0, 1, 2, 3, 5])

In [121]:
values.take(indexer)

array([0, 1, 2, 3, 5])

In [126]:
arr = randn(3, 5)
arr

array([[-0.76035834,  1.626074  , -0.68072317, -0.46244352,  0.11770222],
       [-1.60536628, -0.13172563, -1.78419362, -0.71145818,  0.27498426],
       [-0.73181068, -0.29279232, -1.47241627,  0.4656092 , -0.88666731]])

In [127]:
arr[0] = values
arr

array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [-1.60536628, -0.13172563, -1.78419362, -0.71145818,  0.27498426],
       [-0.73181068, -0.29279232, -1.47241627,  0.4656092 , -0.88666731]])

In [128]:
# Reorder the columns (along axis 1) so that the first row, arr[0], ends up
# sorted: arr[0].argsort() supplies the column order, and the same column
# permutation is applied to every row.
arr[:, arr[0].argsort()]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [-0.13172563, -1.78419362,  0.27498426, -0.71145818, -1.60536628],
       [-0.29279232, -1.47241627, -0.88666731,  0.4656092 , -0.73181068]])

#### lexsort() 傳回用來排序的多層索引

In [152]:
# Paired (last name, first name) records, transposed by zip() into two
# parallel arrays: one holding all last names, one holding all first names.
last_name, first_name = (
    np.array(column)
    for column in zip(
        ('Jones', 'Bob'),
        ('Arnold', 'Jane'),
        ('Arnold', 'Steve'),
        ('Jones', 'Bill'),
        ('Walters', 'Barbara'),
    )
)

In [153]:
# Indirect sort on several key arrays at once: np.lexsort returns the indices
# that put the records into lexicographic (dictionary) order.
# The key arrays must be paired element by element, like the columns of
# zip(...); they are not treated as independent arrays.
# Note: the keys are applied starting from the LAST one passed, so here
# last_name is the primary key and first_name only breaks ties.
sorter = np.lexsort((first_name, last_name))
sorter

array([1, 2, 3, 0, 4], dtype=int64)

In [154]:
last_name[sorter]

array(['Arnold', 'Arnold', 'Jones', 'Jones', 'Walters'], 
      dtype='<U7')

In [155]:
first_name[sorter]

array(['Jane', 'Steve', 'Bill', 'Bob', 'Barbara'], 
      dtype='<U7')

In [156]:
people = zip(last_name[sorter], first_name[sorter])
list(people)

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

### 其他排序算法
stable排序算法會保持等價元素的相對位置

In [160]:
# Sort keys: two records with key 2 followed by three records with key 1.
key = np.repeat([2, 1], [2, 3])
# Labels of the form "<key>:<position among records sharing that key>", used to
# see whether equal keys keep their original order after sorting.
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])

mergesort 是穩定的排序方法 (較新版本的 NumPy 也可以寫成 kind = 'stable')，預設的 quicksort 則不保證穩定

In [159]:
# Stable indirect sort: elements that share the same key keep their original
# relative order in the resulting index array.
indexer = key.argsort(kind = 'mergesort')
indexer

array([2, 3, 4, 0, 1], dtype=int64)

In [158]:
key[indexer]

array([1, 1, 1, 2, 2])

In [161]:
values[indexer]

array(['1:first', '1:second', '1:third', '2:first', '2:second'], 
      dtype='<U8')

### numpy.searchsorted: 在有序數組中查找元素
運用二分查找法，只要將值插入到searchsorted()返回的那個位置，就可以維持數組的有序性

In [165]:
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)
# 9 應該排在索引 3

3

In [166]:
# 可以傳入一個數組，返回一組索引
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5], dtype=int64)

In [167]:
# 如果插入值與數組中某個元素相等，預設會傳回 左邊 的索引值
arr.searchsorted(0)

0

In [168]:
arr.searchsorted([1, 7])

array([1, 2], dtype=int64)

In [169]:
# 可以用 side 參數改變，取右邊的索引值
arr.searchsorted(0, side = 'right')

1

使用 面元邊界數組，將另外一個數組拆分

In [170]:
data = np.floor(np.random.uniform(0, 10000, size = 50))
data

array([ 2261.,  6826.,  5604.,  1501.,  2606.,  6859.,  6945.,  3584.,
        2235.,  4141.,  1859.,  3265.,  6750.,  7003.,  9744.,   121.,
        8233.,  3895.,  9367.,  9916.,   862.,  5018.,  2530.,  3132.,
        9279.,  2032.,  5293.,   695.,  3103.,  9917.,  4658.,  2432.,
        2379.,  5267.,  8398.,  9950.,  6964.,  7714.,  9675.,  9730.,
        2216.,  2875.,  9457.,  8909.,  1011.,  3618.,  3779.,  7224.,
        1044.,  2298.])

In [172]:
bins = np.array([0, 100, 1000, 5000, 10000])
bins

array([    0,   100,  1000,  5000, 10000])

In [173]:
# 使用 searchsorted() 求取 data中各元素在 bins中的落點
labels = bins.searchsorted(data)
labels

array([3, 4, 4, 3, 3, 4, 4, 3, 3, 3, 3, 3, 4, 4, 4, 2, 4, 3, 4, 4, 2, 4, 3,
       3, 4, 3, 4, 2, 3, 4, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 3, 3,
       3, 4, 3, 3], dtype=int64)

In [175]:
# NumPy中的 digitize()也可以用來計算面元編號
np.digitize(data, bins)

array([3, 4, 4, 3, 3, 4, 4, 3, 3, 3, 3, 3, 4, 4, 4, 2, 4, 3, 4, 4, 2, 4, 3,
       3, 4, 3, 4, 2, 3, 4, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 3, 3,
       3, 4, 3, 3], dtype=int64)

In [174]:
# 使用 pandas中的 groupby()將原數組作拆分
Series(data).groupby(labels).mean()

2     559.333333
3    2715.391304
4    7918.416667
dtype: float64

## NumPy的 matrix類別

In [179]:
Xm = np.matrix(np.random.randn(5, 5))
Xm
# matrix 類別

matrix([[-0.36747101, -1.40623707,  0.47267012,  0.86075671, -0.49218872],
        [-0.18896879,  1.37319066, -1.54529651, -1.10584534,  1.49632628],
        [ 1.5245477 , -1.77409937,  0.5060452 ,  0.78673661, -0.04836532],
        [ 0.39930761,  0.97829518, -0.46998717, -0.69973894,  0.17495329],
        [ 0.39075469,  1.6158142 ,  0.14181245,  0.01344745,  1.76655131]])

In [180]:
Ym = Xm[:, 0]
Ym

matrix([[-0.36747101],
        [-0.18896879],
        [ 1.5245477 ],
        [ 0.39930761],
        [ 0.39075469]])

In [181]:
# '*' 就代表 矩陣乘法，等於 np.dot
Ym.T * Xm * Ym

matrix([[ 0.94775764]])

In [183]:
# matrix 的 I 屬性，與原來的矩陣相乘之後，得到單位矩陣
Xm.I * Xm

matrix([[  1.00000000e+00,  -2.22044605e-16,  -8.32667268e-17,
           1.86482774e-16,   3.88578059e-16],
        [  1.11022302e-16,   1.00000000e+00,  -7.63278329e-16,
          -1.12757026e-16,   2.22044605e-16],
        [ -7.77156117e-16,   0.00000000e+00,   1.00000000e+00,
           2.69749501e-15,  -2.22044605e-16],
        [  7.77156117e-16,  -1.33226763e-15,  -1.16573418e-15,
           1.00000000e+00,   8.88178420e-16],
        [ -1.94289029e-16,   6.66133815e-16,   5.41233725e-16,
           1.18828558e-16,   1.00000000e+00]])

## 高級數組輸入輸出

### 內存映像(memory map)文件
把數組放在硬碟，但視為是在記憶體中

類似於 ndarray 的 memmap類別，允許將大文件分成小段進行讀寫，而不是一次性將整個數組讀入記憶體

ndarray 有的方法，memmap也有

In [2]:
import numpy as np
mmap = np.memmap('mymmap', dtype = np.float64, mode = 'w+', shape = (100, 100))
mmap

memmap([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [3]:
mmap[:] = 1
mmap

memmap([[ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ..., 
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.]])

In [4]:
section = mmap[:5]
section[:2]

memmap([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  

In [5]:
section[:] = np.random.randn(5, 100)  

In [6]:
mmap

memmap([[-0.9566111 , -0.22545429,  0.57252126, ..., -2.31600051,
         0.39282768,  0.21502444],
       [-0.6845728 ,  0.81913225,  0.7084294 , ..., -0.82658413,
        -0.86305138, -0.7190805 ],
       [ 2.11015658, -0.50932402, -0.23480322, ...,  0.26192852,
         0.78301072,  0.25949802],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ]])

In [7]:
# flush()會將資料寫入硬碟
mmap.flush()

In [8]:
mmap

memmap([[-0.9566111 , -0.22545429,  0.57252126, ..., -2.31600051,
         0.39282768,  0.21502444],
       [-0.6845728 ,  0.81913225,  0.7084294 , ..., -0.82658413,
        -0.86305138, -0.7190805 ],
       [ 2.11015658, -0.50932402, -0.23480322, ...,  0.26192852,
         0.78301072,  0.25949802],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ]])

In [9]:
del mmap
# memmap 被回收之時，資料會被寫入到硬碟

In [10]:
# 讀入資料的時候，需要指定 資料型態(dtype) 與 形狀(shape)
mmap = np.memmap('mymmap', dtype = np.float64, shape = (100, 100))
mmap

memmap([[-0.9566111 , -0.22545429,  0.57252126, ..., -2.31600051,
         0.39282768,  0.21502444],
       [-0.6845728 ,  0.81913225,  0.7084294 , ..., -0.82658413,
        -0.86305138, -0.7190805 ],
       [ 2.11015658, -0.50932402, -0.23480322, ...,  0.26192852,
         0.78301072,  0.25949802],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ]])

In [11]:
del mmap

### HDF5及其他數組存儲方式
- PyTables
- h5py (HDF5)

## 性能建議
- 將Python循環和條件邏輯 轉換為 數組運算和boolean數組運算
- 盡量使用 廣播
- 避免複製數據，盡量使用數組view(即slice)
- 利用 ufunc及其各種方法

### 連續內存的重要性
- C型連續: row 優先
- F型連續: column 優先

In [26]:
arr_c = np.ones((1000, 1000), order = 'C')
arr_c.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [27]:
arr_f = np.ones((1000, 1000), order = 'F')
arr_f.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [28]:
arr_f.flags.f_contiguous

True

In [29]:
%timeit arr_c.sum(1)

100 loops, best of 3: 3.58 ms per loop


In [30]:
%timeit arr_c.sum(0)

100 loops, best of 3: 2.15 ms per loop


In [31]:
%timeit arr_f.sum(1)

100 loops, best of 3: 2.07 ms per loop


In [32]:
%timeit arr_f.sum(0)

100 loops, best of 3: 3.21 ms per loop


In [33]:
# 用 copy()，複製數組 並調整 連續性
arr_f.copy('C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False