为什么使用 numpy & pandas?

运算速度快：numpy 和 pandas 的底层核心都是用 C 语言（及 Cython）实现的；pandas 又是基于 numpy 构建的，在其之上提供了带标签的数据结构（Series、DataFrame）。
消耗资源少：数据存放在连续的同类型内存块中，并采用向量化的矩阵运算，比 python 自带的字典或者列表更省内存，也快很多。

# Numpy

In [1]:
import numpy as np
# 为了方便使用 numpy 采用np简写

常用属性

In [2]:
array = np.array([[1, 2, 3], [2, 3, 4]])
# Number of dimensions (axes) of the array
print('number of dim:', array.ndim)

# Shape as (rows, columns)
print('shape :', array.shape)

# Total number of elements
print('size:', array.size)


number of dim: 2
shape : (2, 3)
size: 6


指定数据 dtype

In [3]:
# Build the same three values under each dtype in turn and print the dtype
# NumPy reports. After the loop `a` holds the float64 array, exactly as it
# would after four separate assignments.
for requested in (np.int32, np.int64, np.float32, np.float64):
    a = np.array([2, 23, 4], dtype=requested)
    print(a.dtype)

int32
int64
float32
float64


其他创建数组的方法

In [12]:
# All-zeros array
a = np.zeros((3, 4))  # every element is 0; 3 rows, 4 columns
print(a)

# All-ones array (float by default, integer when dtype=int is given)
a = np.ones((3, 4))
print(a)
a = np.ones((3, 4), dtype=int)
print(a)

# Uninitialized array: np.empty only allocates memory, so the contents are
# whatever happened to be there -- not guaranteed to be zero or near zero.
a = np.empty((3, 4))
print(a)

# Evenly stepped values with arange
a = np.arange(10, 20, 2)  # start 10, stop 20 (exclusive), step 2
print(a)

# Evenly spaced samples with linspace: 20 points from 1 to 10, both included
a = np.linspace(1, 10, 20)
print(a)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
[10 12 14 16 18]
[ 1.          1.47368421  1.94736842  2.42105263  2.89473684  3.36842105
  3.84210526  4.31578947  4.78947368  5.26315789  5.73684211  6.21052632
  6.68421053  7.15789474  7.63157895  8.10526316  8.57894737  9.05263158
  9.52631579 10.        ]


In [4]:
# Use reshape to change the shape of the data. reshape returns the reshaped
# array rather than modifying `a` in place, so the result is assigned back.
a = np.arange(12)
print(a)
a = a.reshape((3, 4))
print(a)

[ 0  1  2  3  4  5  6  7  8  9 10 11]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


Numpy 基础运算

In [16]:
a = np.array([10, 20, 30, 40])
b = np.arange(4)
print(a)
print(b)

[10 20 30 40]
[0 1 2 3]


In [15]:
a - b

array([10, 19, 28, 37])

In [17]:
a + b

array([10, 21, 32, 43])

In [18]:
b ** 2

array([0, 1, 4, 9])

In [19]:
10 * np.sin(a)

array([-5.44021111,  9.12945251, -9.88031624,  7.4511316 ])

In [20]:
# Element-wise comparison: yields a boolean array that is True where the
# condition holds and False elsewhere.
b < 3

array([ True,  True,  True, False])

In [5]:
a = np.array([[1, 1], [0, 1]])
b = np.arange(4).reshape((2, 2))
print(a)
print(b)

[[1 1]
 [0 1]]
[[0 1]
 [2 3]]


In [28]:
np.dot(a, b)

array([[2, 4],
       [2, 3]])

In [29]:
a.dot(b)

array([[2, 4],
       [2, 3]])

In [3]:
# No seed is set, so every execution draws different values. Outputs recorded
# for later cells may therefore come from a different run than the array
# printed by this cell.
a = np.random.random((2, 4))
print(a)

[[0.96770168 0.54498583 0.30134859 0.34799256]
 [0.25361587 0.3486152  0.67038025 0.14866291]]


In [36]:
np.sum(a)

4.390487377810432

In [37]:
np.min(a)

0.18224247994699339

In [38]:
np.max(a)

0.9641062966541291

In [42]:
# axis=1 reduces along each row (one result per row);
# axis=0 reduces along each column (one result per column).
print("sum =", np.sum(a, axis=1))

print("min =", np.min(a, axis=0))

print("max =", np.max(a, axis=1))


sum = [1.95393622 2.43655116]
min = [0.18224248 0.58294975 0.52635262 0.29623387]
max = [0.66239137 0.9641063 ]


In [4]:
a = np.arange(2, 14).reshape((3, 4))
print(a)

[[ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]


In [5]:
# Indices of the smallest and largest elements, counted as positions in the
# flattened array (argmax follows in the next cell)
np.argmin(a)

0

In [6]:
np.argmax(a)

11

In [7]:
np.mean(a)

7.5

In [8]:
a.mean()

7.5

In [9]:
np.average(a)

7.5

In [10]:
np.median(a)

7.5

In [11]:
# Cumulative sum; with no axis given it runs over the flattened array
np.cumsum(a)

array([ 2,  5,  9, 14, 20, 27, 35, 44, 54, 65, 77, 90])

In [12]:
# Differences between adjacent elements along the last axis (within each row)
np.diff(a)

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])

In [13]:
np.nonzero(a)
# Returns the coordinates of all non-zero elements split per axis: one array
# of row indices and one array of column indices.

(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64),
 array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], dtype=int64))

In [14]:
np.sort(a)

array([[ 2,  3,  4,  5],
       [ 6,  7,  8,  9],
       [10, 11, 12, 13]])

In [15]:
np.transpose(a)

array([[ 2,  6, 10],
       [ 3,  7, 11],
       [ 4,  8, 12],
       [ 5,  9, 13]])

In [16]:
a.T

array([[ 2,  6, 10],
       [ 3,  7, 11],
       [ 4,  8, 12],
       [ 5,  9, 13]])

In [17]:
np.clip(a, 5, 9)
# Call form: np.clip(Array, Array_min, Array_max)
# Array is the array to be clipped.
# Elements smaller than the minimum are replaced by the minimum and elements
# larger than the maximum by the maximum; values in between are unchanged.

array([[5, 5, 5, 5],
       [6, 7, 8, 9],
       [9, 9, 9, 9]])

索引

In [18]:
a = np.arange(3, 15)
print(a)
print(a[3])

[ 3  4  5  6  7  8  9 10 11 12 13 14]
6


In [21]:
# reshape returns the reshaped array instead of changing `a` in place, so the
# result has to be assigned back. Without the assignment `a` stays 1-D, and
# the prints below would show the flat array and the scalar a[2].
a = a.reshape((3, 4))
print(a)
print(a[2])

[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]
[11 12 13 14]


In [22]:
a[1][1]

8

In [23]:
a[1, 1]

8

In [24]:
a[1, 1:3]

array([8, 9])

In [25]:
for row in a:
    print(row)

[3 4 5 6]
[ 7  8  9 10]
[11 12 13 14]


In [27]:
for column in a.T:
    print(column)

[ 3  7 11]
[ 4  8 12]
[ 5  9 13]
[ 6 10 14]


In [28]:
a.flatten()

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [32]:
for item in a.flat:
    print(item)

3
4
5
6
7
8
9
10
11
12
13
14


合并

In [61]:
A = np.array([1, 1, 1])
B = np.array([2, 2, 2])

# vstack places the two 1-D arrays on top of each other as rows
print(np.vstack((A, B)))  # vertical stack

[[1 1 1]
 [2 2 2]]


In [34]:
np.hstack((A, B))

array([1, 1, 1, 2, 2, 2])

In [35]:
# Insert a new leading axis: the 1-D array becomes a single-row 2-D array
A[np.newaxis, :]

array([[1, 1, 1]])

In [36]:
# Insert a new trailing axis: the 1-D array becomes a single-column 2-D array
A[:, np.newaxis]

array([[1],
       [1],
       [1]])

In [62]:
np.concatenate((A, B, B, A), axis=0)

array([1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1])

In [63]:
# A and B are 1-D, so they have no axis 1: this call raises AxisError
np.concatenate((A, B, B, A), axis=1)

AxisError: axis 1 is out of bounds for array of dimension 1

In [65]:
a = np.arange(12).reshape((3, 4))
print(a)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [68]:
np.split(a, 3, axis=0)


[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]

In [69]:
np.split(a, 2, axis=1)

[array([[0, 1],
        [4, 5],
        [8, 9]]),
 array([[ 2,  3],
        [ 6,  7],
        [10, 11]])]

In [71]:
# 4 columns cannot be divided into 3 equal sections, so np.split raises
# ValueError
np.split(a, 3, axis=1)

ValueError: array split does not result in an equal division

In [72]:
# array_split accepts a section count that does not divide the axis evenly
# and returns sections of unequal size instead of raising
np.array_split(a, 3, axis=1)

[array([[0, 1],
        [4, 5],
        [8, 9]]),
 array([[ 2],
        [ 6],
        [10]]),
 array([[ 3],
        [ 7],
        [11]])]

In [73]:
np.vsplit(a, 3)

[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]

In [74]:
np.hsplit(a, 2)

[array([[0, 1],
        [4, 5],
        [8, 9]]),
 array([[ 2,  3],
        [ 6,  7],
        [10, 11]])]

拷贝

In [3]:
a = np.arange(4)
print(a)

# Plain assignment never copies: b, c and d are all bound to the very same
# array object as `a`.
b = c = d = a

[0 1 2 3]


In [4]:
a[0] = 11
print(b)

[11  1  2  3]


In [5]:
print(b is a)
print(c is a)
print(d is a)

True
True
True


In [6]:
d[1:3] = [22, 33]
print(a)
print(b)
print(c)

[11 22 33  3]
[11 22 33  3]
[11 22 33  3]


In [7]:
# copy() gives `b` its own data, independent of `a`: writing to `b` below
# leaves `a` untouched
b = a.copy()

b[1:3] = [33, 44]
print(b)
print(a)

[11 33 44  3]
[11 22 33  3]
