# NumPy 数据分析

## 1. 导入numpy， 并查看版本

In [1]:
import numpy as np
print(np.__version__)

1.20.1


## 2. 创建一维数组

In [2]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

## 3. 创建布尔型数组

In [4]:
arr_bool1 = np.full((3,), True, dtype=bool)
arr_bool1

array([ True,  True,  True])

In [5]:
arr_bool2 = np.full((3, 3), True, dtype=bool)
arr_bool2

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

## 4. 从一维数组中提取满足指定条件的元素

In [6]:
arr4 = np.arange(10)
arr4

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [9]:
arr4[arr4 % 2  == 1]

array([1, 3, 5, 7, 9])

## 5. 用另一个值替换numpy数组中满足条件的元素项

In [11]:
arr5 = np.arange(10)
arr5

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [13]:
arr5[arr5 % 2 == 0] = -1
arr5

array([-1,  1, -1,  3, -1,  5, -1,  7, -1,  9])

## 6. 不影响原始数组的情况下替换满足条件的元素项

In [14]:
arr6 = np.arange(10)
arr6

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
np.where(arr6 % 2 == 1, -1, arr6)

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

## 7. 改变数组形状

In [46]:
arr7 = np.arange(10)
arr7

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [47]:
arr7.reshape(2, -1)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

## 8. 如何垂直/水平叠加两个数组

In [48]:
arr8_a = np.arange(10).reshape(2, -1)
arr8_b = np.repeat(1, 10).reshape(2, -1)

In [51]:
print(arr8_a)
print(arr8_b)

[[0 1 2 3 4]
 [5 6 7 8 9]]
[[1 1 1 1 1]
 [1 1 1 1 1]]


In [63]:
# method 1
# axis 连接轴：0 表示沿行方向（垂直）拼接，1 表示沿列方向（水平）拼接
np.concatenate([arr8_a, arr8_b], axis=0)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [64]:
np.concatenate([arr8_a, arr8_b], axis=1)

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [59]:
# method 2
# vstack 垂直方向（按行，axis=0）连接两个数组
# hstack 水平方向（按列，axis=1）连接两个数组
# dstack 沿深度方向（axis=2）叠加两个数组
np.vstack([arr8_a, arr8_b])

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [65]:
np.hstack([arr8_a, arr8_b])

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [66]:
np.dstack([arr8_a, arr8_b])

array([[[0, 1],
        [1, 1],
        [2, 1],
        [3, 1],
        [4, 1]],

       [[5, 1],
        [6, 1],
        [7, 1],
        [8, 1],
        [9, 1]]])

In [57]:
# method 3
np.r_[arr8_a, arr8_b]

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [67]:
np.c_[arr8_a, arr8_b]

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

## 10. 在无硬编码的情况下生成numpy中的自定义序列

In [71]:
arr10 = np.array([1, 2, 3])
# repeat  重复数组的元素
# tile 通过重复np对象来构建新np对象
# unique 找出数组的唯一元素
np.r_[np.repeat(arr10, 3), np.tile(arr10, 3)]


array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

## 11. 获取两个数组的公共项（交集）

In [73]:
arr11_a = np.array([1,2,3,2,3,4,3,4,5,6])
arr11_b = np.array([7,2,10,2,7,4,9,4,9,8])
np.intersect1d(arr11_a, arr11_b)

array([2, 4])

In [76]:
arr11_c = np.array([[1,2],[3,4]])
arr11_d = np.array([[5,2],[6,4]])
np.intersect1d(arr11_c, arr11_d)

array([2, 4])

## 12. 从一个数组中删除存在于另一个数组中的项（差集）

In [77]:
arr12_a = np.array([1,2,3,4,5])
arr12_b = np.array([5,6,7,8,9])

np.setdiff1d(arr12_a, arr12_b)

array([1, 2, 3, 4])

## 13. 获取两个数组中匹配的位置

In [78]:
arr13_a = np.array([1,2,3,2,3,4,3,4,5,6])
arr13_b = np.array([7,2,10,2,7,4,9,4,9,8])
np.where(arr13_a == arr13_b)

(array([1, 3, 5, 7]),)

## 14. 从Numpy中提取指定范围的数字

In [79]:
# 获取5到10之间（含5和10）的所有元素
arr14 = np.array([2, 6, 1, 9, 10, 3, 27])

In [83]:
# method 1
index = np.where((arr14 >= 5) & (arr14 <= 10))
arr14[index]

array([ 6,  9, 10])

In [86]:
# method 2
index = np.where(np.logical_and(arr14>=5, arr14<=10))
arr14[index]

array([ 6,  9, 10])

In [88]:
# method 3
arr14[(arr14 >= 5) & (arr14 <= 10)]

array([ 6,  9, 10])

## 15. 让处理标量的python函数能够在numpy数组上工作

In [91]:
# np.vectorize 广义函数类
# 定义一个矢量化函数，该函数将对象或numpy数组的嵌套序列作为输入，并返回单个numpy数组或numpy数组的元组。
# 像python map函数一样，矢量化函数在输入数组的连续元组上评估pyfunc，除了它使用numpy的广播规则。
def maxx(x, y):
    """Return the larger of ``x`` and ``y``; ``x`` is returned when they compare equal."""
    return x if x >= y else y

pair_max = np.vectorize(maxx, otypes=[float])

arr15_a = np.array([5, 7, 9, 8, 6, 4, 5])
arr15_b = np.array([6, 3, 4, 8, 9, 7, 1])

pair_max(arr15_a, arr15_b)


array([6., 7., 9., 8., 9., 7., 5.])

## 16. 交换二维数组中的两列

In [105]:
arr16 = np.arange(10, 19).reshape(3,3)
arr16

array([[10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [106]:
arr16[:, [1, 0, 2]]  # 交换第1列和第2列 

array([[11, 10, 12],
       [14, 13, 15],
       [17, 16, 18]])

In [107]:
arr16[:, [0, 2, 1]]  # 交换第2列和第3列

array([[10, 12, 11],
       [13, 15, 14],
       [16, 18, 17]])

In [108]:
arr16[:, [2, 1, 0]]  # 交换第1列和第3列

array([[12, 11, 10],
       [15, 14, 13],
       [18, 17, 16]])

## 17. 交换二维数组中的两行

In [104]:
arr17 = np.arange(9).reshape(3,3)
arr17

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [110]:
arr17[[1, 0, 2], :]  # 交换第1行和第2行

array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

In [111]:
arr17[[0, 2, 1], :]  # 交换第2行和第3行

array([[0, 1, 2],
       [6, 7, 8],
       [3, 4, 5]])

In [112]:
arr17[[2, 1, 0], :]  # 交换第1行和第3行

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

## 18. 反转二维数组中的行

In [115]:
arr18 = np.arange(9).reshape(3,3)
arr18

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [116]:
arr18[::-1]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

## 19. 反转二维数组中的列

In [118]:
arr19 = np.arange(9).reshape(3,3)
arr19

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [119]:
arr19[:,::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

## 20. 创建包含5到10之间随机浮点数的二维数组

In [123]:
# Method 1
# 随机整数添加随机小数
rand_arr20 = np.random.randint(low=5, high=10, size=(5, 3)) + np.random.random((5,3))
rand_arr20

array([[8.42936439, 6.95873396, 6.56842923],
       [7.32991554, 9.67971839, 6.64496701],
       [7.76162523, 9.56736363, 7.50222729],
       [5.32725493, 9.18017167, 6.58540576],
       [5.64678535, 7.12242919, 9.50290056]])

In [125]:
rand_arr20 = np.random.uniform(5,10, size=(5,3))
rand_arr20

array([[7.04438943, 8.84979436, 9.06615184],
       [5.02935   , 6.11242243, 8.16736952],
       [8.24722023, 5.81363036, 7.38389007],
       [9.69221995, 5.861946  , 5.33927911],
       [9.36727262, 7.40124541, 8.39565837]])

## 21. 在Numpy数组中只打印小数点后三位

In [128]:
rand_arr21 = np.random.random((5,3))
np.set_printoptions(precision=3)
rand_arr21

array([[0.643, 0.548, 0.388],
       [0.419, 0.476, 0.866],
       [0.19 , 0.919, 0.277],
       [0.581, 0.488, 0.615],
       [0.001, 0.701, 0.896]])

## 22. 通过e式科学记数法（如1e10）来打印一个numpy数组

In [130]:
np.random.seed(100)
rand_arr22 = np.random.random([3,3])/1e3
rand_arr22

array([[5.434e-04, 2.784e-04, 4.245e-04],
       [8.448e-04, 4.719e-06, 1.216e-04],
       [6.707e-04, 8.259e-04, 1.367e-04]])

## 23. 限制numpy数组输出中打印的元素个数

In [133]:
np.set_printoptions(threshold=6)
arr23 = np.arange(20)
arr23

array([ 0,  1,  2, ..., 17, 18, 19])

## 24. 打印完整的numpy数组而不截断

In [137]:
import sys
np.set_printoptions(threshold=sys.maxsize)
arr23

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

## 25. 导入包含数字和文本的数据集，并保持文本在numpy数组中完好无损

In [138]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
iris[:3]


array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

## 26. 从1维元组数组中提取特定列

In [143]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=str)
print(iris_1d.shape)
print(iris_1d)


(150, 5)
[['5.1' '3.5' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.0' '1.4' '0.2' 'Iris-setosa']
 ['4.7' '3.2' '1.3' '0.2' 'Iris-setosa']
 ['4.6' '3.1' '1.5' '0.2' 'Iris-setosa']
 ['5.0' '3.6' '1.4' '0.2' 'Iris-setosa']
 ['5.4' '3.9' '1.7' '0.4' 'Iris-setosa']
 ['4.6' '3.4' '1.4' '0.3' 'Iris-setosa']
 ['5.0' '3.4' '1.5' '0.2' 'Iris-setosa']
 ['4.4' '2.9' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.1' '1.5' '0.1' 'Iris-setosa']
 ['5.4' '3.7' '1.5' '0.2' 'Iris-setosa']
 ['4.8' '3.4' '1.6' '0.2' 'Iris-setosa']
 ['4.8' '3.0' '1.4' '0.1' 'Iris-setosa']
 ['4.3' '3.0' '1.1' '0.1' 'Iris-setosa']
 ['5.8' '4.0' '1.2' '0.2' 'Iris-setosa']
 ['5.7' '4.4' '1.5' '0.4' 'Iris-setosa']
 ['5.4' '3.9' '1.3' '0.4' 'Iris-setosa']
 ['5.1' '3.5' '1.4' '0.3' 'Iris-setosa']
 ['5.7' '3.8' '1.7' '0.3' 'Iris-setosa']
 ['5.1' '3.8' '1.5' '0.3' 'Iris-setosa']
 ['5.4' '3.4' '1.7' '0.2' 'Iris-setosa']
 ['5.1' '3.7' '1.5' '0.4' 'Iris-setosa']
 ['4.6' '3.6' '1.0' '0.2' 'Iris-setosa']
 ['5.1' '3.3' '1.7' '0.5' 'Iris-setosa']
 ['4.8'

In [144]:
species = np.array([row[4] for row in iris_1d])
species

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versic

## 27. 将1维元组数组转换为2维numpy数组

In [147]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=str)
iris_1d


array([['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
       ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
       ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
       ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
       ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'],
       ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'],
       ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'],
       ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'],
       ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'],
       ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'],
       ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'],
       ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'],
       ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'],
       ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'],
       ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'],
       ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'],
       ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'],
       ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'],
       ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'],
       ['5.1

In [151]:
# Method 1 用for循环取每一行的前四列
iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
iris_2d[:4]

array([['5.1', '3.5', '1.4', '0.2'],
       ['4.9', '3.0', '1.4', '0.2'],
       ['4.7', '3.2', '1.3', '0.2'],
       ['4.6', '3.1', '1.5', '0.2']], dtype='<U3')

In [155]:
# Method 2 导入数据时通过usecols只导入前4列
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[:4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])