# Advanced NumPy

In [1]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)

In [21]:
tom='123'
print (type(tom))

<class 'str'>


## ndarray Object Internals

In [3]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

### NumPy dtype Hierarchy

In [4]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)

True

In [13]:
np.float64.mro()
a=np.ndarray([1,2,3])
c=np.ndarray([5,6])


TypeError: 'module' object is not callable

In [6]:
np.issubdtype(ints.dtype, np.number)

True

## Advanced Array Manipulation

### Reshaping Arrays

In [14]:
arr = np.arange(8)
arr
arr.reshape((4, 2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [15]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [20]:
arr = np.arange(15)
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [36]:
other_arr = np.ones((3, 5))
other_arr.shape
other_arr
arr.reshape(other_arr.shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [23]:
arr = np.arange(15).reshape((5, 3))
arr
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [32]:
a=np.ndarray([1,2,3,4,5])
# a.reshape((2,2))
a
# a.ravel()

array([[[[[2.4209e-321, 8.2003e-315, 4.5405e-321,         nan,
           1.1265e-321],
          [0.0000e+000, 6.1994e-071, 9.0910e+276, 3.7771e+233,
           6.5171e-038],
          [1.9163e-076, 9.1628e-072, 1.8145e-152, 9.4596e-076,
           6.2182e+175],
          [1.4214e-076, 1.1142e+218, 9.8980e+164, 2.9516e-075,
           7.1333e-067]],

         [[4.5634e-072, 9.4596e-076, 9.8980e+164, 1.4214e-076,
           2.9516e-075],
          [4.6749e-062, 9.8980e+164, 9.1628e-072, 2.1351e+257,
           1.6972e+137],
          [1.4357e-023, 1.4214e-076, 2.9516e-075, 6.0203e+175,
           2.9517e-075],
          [2.1341e+257, 6.6102e-062, 9.1628e-072, 1.4214e-076,
           1.0590e+218]],

         [[1.6141e+132, 9.8980e+164, 1.2666e+136, 9.0219e+217,
           6.1994e-071],
          [2.9515e-075, 2.1341e+257, 6.6102e-062, 9.1628e-072,
           1.4214e-076],
          [1.0590e+218, 1.6141e+132, 9.8980e+164, 1.4120e+136,
           9.0219e+217],
          [6.1994e-071, 2.95

In [33]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C Versus Fortran Order

In [39]:
arr = np.arange(12).reshape((3, 4))
arr
arr.ravel()
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### Concatenating and Splitting Arrays

In [42]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
# np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [45]:
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [52]:
arr = np.random.randn(5, 2)
arr
first, second, third = np.split(arr, [1, 3])
first
# second
# third
print (arr)
np.split(arr, [1, 3])

[[ 0.8626 -0.01  ]
 [ 0.05    0.6702]
 [ 0.853  -0.9559]
 [-0.0235 -2.3042]
 [-0.6525 -1.2183]]


[array([[ 0.8626, -0.01  ]]), array([[ 0.05  ,  0.6702],
        [ 0.853 , -0.9559]]), array([[-0.0235, -2.3042],
        [-0.6525, -1.2183]])]

#### Stacking helpers: r_ and c_

In [55]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
np.r_[arr1, arr2]
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [ 0.8167,  0.4336,  3.    ],
       [ 1.0107,  1.8249,  4.    ],
       [-0.9975,  0.8506,  5.    ]])

In [56]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: tile and repeat

In [57]:
arr = np.arange(3)
arr
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [58]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [59]:
arr = np.random.randn(2, 2)
arr
arr.repeat(2, axis=0)

array([[-0.1316,  0.9124],
       [-0.1316,  0.9124],
       [ 0.1882,  2.1695],
       [ 0.1882,  2.1695]])

In [62]:
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)

array([[-0.1316, -0.1316,  0.9124,  0.9124,  0.9124],
       [ 0.1882,  0.1882,  2.1695,  2.1695,  2.1695]])

In [63]:
arr
np.tile(arr, 2)

array([[-0.1316,  0.9124, -0.1316,  0.9124],
       [ 0.1882,  2.1695,  0.1882,  2.1695]])

In [67]:
arr
np.tile(arr, (2, 1))
# np.tile(arr, (3, 2))

array([[-0.1316,  0.9124],
       [ 0.1882,  2.1695],
       [-0.1316,  0.9124],
       [ 0.1882,  2.1695]])

### Fancy Indexing Equivalents: take and put

In [68]:
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

In [69]:
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [72]:
inds = [2, 0, 2, 1]
arr = np.random.randn(2, 4)
print(arr)
arr.take(inds, axis=1)

[[-0.5895  1.5817 -0.5287  0.457 ]
 [ 0.93   -1.5693 -1.0225 -0.4028]]


array([[-0.5287, -0.5895, -0.5287,  1.5817],
       [-1.0225,  0.93  , -1.0225, -1.5693]])

## Broadcasting

In [75]:
arr = np.xarange(5)
arr
arr * 4

array([ 0,  4,  8, 12, 16])

In [83]:
arr = np.random.randn(4, 3)
arr.mean(0)
demeaned = arr - arr.mean(0)
demeaned
# demeaned.mean(0)

array([[ 0.0584, -0.9787, -0.4253],
       [-0.2744,  0.1495,  0.5759],
       [ 1.1753,  0.2218, -0.7119],
       [-0.9593,  0.6074,  0.5613]])

In [89]:
arr
row_means = arr.mean(1)
row_means.shape
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
# demeaned.mean(1)
demeaned.mean(0)

array([-0.4295,  0.7139, -0.2844])

### Broadcasting Over Other Axes

In [90]:
arr - arr.mean(1)

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [91]:
arr - arr.mean(1).reshape((4, 1))

array([[ 0.0774,  0.1837, -0.2612],
       [-0.8542,  0.7131,  0.1412],
       [ 0.5174,  0.7072, -1.2247],
       [-1.4586,  1.2515,  0.2071]])

In [92]:
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]

array([[ 1.1275, -0.5684,  0.3094]])

In [100]:
arr = np.random.randn(3, 4, 5)
arr
# depth_means = arr.mean(2)
# depth_means
# depth_means.shape
# demeaned = arr - depth_means[:, :, np.newaxis]
# demeaned.mean(2)

array([[[ 0.8454,  0.0165,  0.845 ,  1.8508,  0.0221],
        [-1.3692,  0.8872,  0.0143, -0.0742, -0.0486],
        [ 1.235 , -0.4333,  1.391 ,  0.8202, -0.2474],
        [ 0.3023,  0.544 , -0.9424, -1.2664,  0.9372]],

       [[-0.7201, -1.594 , -0.3755, -0.9587,  0.7943],
        [-1.6051,  0.5437,  0.9252, -1.4696, -0.3996],
        [ 1.4173, -0.8976,  1.8448,  1.2532, -1.4909],
        [-0.0277,  1.3752, -0.0252, -0.6679, -2.868 ]],

       [[ 0.2107,  1.2872, -0.5743,  0.4953,  0.396 ],
        [ 0.5888, -1.2818,  2.0299, -0.5019, -0.1593],
        [-1.4962,  0.0114,  0.4194,  2.0512, -0.3688],
        [-1.6893,  0.1477, -0.181 ,  0.1581, -0.3966]]])

```python
def demean_axis(arr, axis=0):
    means = arr.mean(axis)

    # This generalizes things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    return arr - means[indexer]
```

### Setting Array Values by Broadcasting

In [101]:
arr = np.zeros((4, 3))
arr[:] = 5
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

In [102]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## Advanced ufunc Usage

### ufunc Instance Methods

In [103]:
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()

45

In [104]:
np.random.seed(12346)  # for reproducibility
arr = np.random.randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

In [105]:
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]], dtype=int32)

In [106]:
arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [107]:
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape

(3, 4, 5)

In [108]:
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17], dtype=int32)

In [109]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]], dtype=int32)

### Writing New ufuncs in Python

In [110]:
def add_elements(x, y):
    return x + y
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [111]:
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [112]:
arr = np.random.randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)

2.18 ms ± 99.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.44 µs ± 60.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Structured and Record Arrays

In [113]:
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5   ,  6), (3.1416, -2)], dtype=[('x', '<f8'), ('y', '<i4')])

In [114]:
sarr[0]
sarr[0]['y']

6

In [115]:
sarr['x']

array([1.5   , 3.1416])

### Nested dtypes and Multidimensional Fields

In [116]:
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [122]:
arr[1]['y']

0

In [123]:
arr['y']

array([0, 0, 0, 0])

In [124]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']

array([1., 3.])

### Why Use Structured Arrays?

## More About Sorting

In [125]:
arr = np.random.randn(6)
arr.sort()
arr

array([-1.082 ,  0.3759,  0.8014,  1.1397,  1.2888,  1.8413])

In [126]:
arr = np.random.randn(3, 5)
arr
arr[:, 0].sort()  # Sort first column values in-place
arr

array([[-1.0111, -1.4711,  0.8705, -0.0847, -1.1329],
       [-0.3318, -0.3436,  2.1714,  0.1234, -0.0189],
       [ 0.1773,  0.7424,  0.8548,  1.038 , -0.329 ]])

In [127]:
arr = np.random.randn(5)
arr
np.sort(arr)
arr

array([-1.1181, -0.2415, -2.0051,  0.7379, -1.0614])

In [137]:
arr = np.random.randn(3, 5)
arr
arr.sort(axis=1)
arr

array([[-1.2136, -1.1441, -0.8704, -0.2306,  1.0438],
       [-0.4728, -0.3636, -0.1378,  0.8356,  2.1777],
       [-1.3918, -0.2089,  0.2316,  0.728 ,  1.9956]])

In [142]:
arr[:, ::-1]
# arr=[1,2,3,4,5]
# arr[::-1]
arr=arr[1]
arr[::-1]

array([ 2.1777,  0.8356, -0.1378, -0.3636, -0.4728])

### Indirect Sorts: argsort and lexsort

In [143]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
values[indexer]

array([0, 1, 2, 3, 5])

In [144]:
arr = np.random.randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()]

array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [ 0.7189, -1.451 , -1.7942,  0.1051, -0.4286],
       [ 0.6117, -0.0912, -0.0402, -1.2799, -0.0003]])

In [146]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
sorter
# zip(last_name[sorter], first_name[sorter])

array([1, 2, 3, 0, 4], dtype=int64)

### Alternative Sort Algorithms

In [147]:
values = np.array(['2:first', '2:second', '1:first', '1:second',
                   '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

### Partially Sorting Arrays(没看懂)

# 测试字体

In [152]:
np.random.seed(12345)
arr = np.random.randn(20)
arr
np.partition(arr, 0)

array([-2.0016,  0.4789, -0.5194, -0.5557,  1.9658,  1.3934,  0.0929,
        0.2817,  0.769 ,  1.2464,  1.0072, -1.2962,  0.275 ,  0.2289,
        1.3529,  0.8864, -0.2047, -0.3718,  1.669 , -0.4386])

In [153]:
indices = np.argpartition(arr, 3)
indices
arr.take(indices)

array([-2.0016, -1.2962, -0.5557, -0.5194, -0.3718, -0.4386, -0.2047,
        0.2817,  0.769 ,  0.4789,  1.0072,  0.0929,  0.275 ,  0.2289,
        1.3529,  0.8864,  1.3934,  1.9658,  1.669 ,  1.2464])

### numpy.searchsorted: Finding Elements in a Sorted Array

In [154]:
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)

3

In [155]:
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5], dtype=int64)

In [156]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')

array([3, 7], dtype=int64)

In [157]:
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([9940., 6768., 7908., 1709.,  268., 8003., 9037.,  246., 4917.,
       5262., 5963.,  519., 8950., 7282., 8183., 5002., 8101.,  959.,
       2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
       9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
       5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
       4954., 3516., 7142., 5039., 2256.])

In [158]:
labels = bins.searchsorted(data)
labels

array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
       4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 4, 4, 3], dtype=int64)

In [159]:
pd.Series(data).groupby(labels).mean()

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

## Writing Fast NumPy Functions with Numba

In [160]:
import numpy as np

def mean_distance(x, y):
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count

```python
In [209]: x = np.random.randn(10000000)

In [210]: y = np.random.randn(10000000)

In [211]: %timeit mean_distance(x, y)
1 loop, best of 3: 2 s per loop

In [212]: %timeit (x - y).mean()
100 loops, best of 3: 14.7 ms per loop
```

```python
In [213]: import numba as nb

In [214]: numba_mean_distance = nb.jit(mean_distance)
```

```python
@nb.jit
def mean_distance(x, y):
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count
```

```python
In [215]: %timeit numba_mean_distance(x, y)
100 loops, best of 3: 10.3 ms per loop
```

```python
from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def mean_distance(x, y):
    return (x - y).mean()
```

### Creating Custom numpy.ufunc Objects with Numba

```python
from numba import vectorize

@vectorize
def nb_add(x, y):
    return x + y
```

```python
In [13]: x = np.arange(10)

In [14]: nb_add(x, x)
Out[14]: array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.])

In [15]: nb_add.accumulate(x, 0)
Out[15]: array([  0.,   1.,   3.,   6.,  10.,  15.,  21.,  28.,  36.,  45.])
```

## Advanced Array Input and Output

### Memory-Mapped Files

In [161]:
mmap = np.memmap('mymmap', dtype='float64', mode='w+',
                 shape=(10000, 10000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [162]:
section = mmap[:5]

In [163]:
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap

In [164]:
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

memmap([[ 0.7584, -0.6605,  0.8626, ...,  0.6046, -0.6212,  2.0542],
        [-1.2113, -1.0375,  0.7093, ..., -1.4117, -0.1719, -0.8957],
        [-0.1419, -0.3375,  0.4329, ...,  1.2914, -0.752 , -0.44  ],
        ...,
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])

In [166]:
%xdel mmap
!rm mymmap

NameError: name 'mmap' is not defined


'rm' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


In [177]:
arr=np.array([1,2,3,4,5])
np.
arr

array([1, 2, 3, 4, 5])

### HDF5 and Other Array Storage Options

## Performance Tips

### The Importance of Contiguous Memory

In [None]:
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous

In [None]:
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)

In [None]:
arr_f.copy('C').flags

In [None]:
arr_c[:50].flags.contiguous
arr_c[:, :50].flags

In [None]:
%xdel arr_c
%xdel arr_f

In [None]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS