In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Fixed seed reused by the cells that re-seed NumPy's global RNG.
SEED = 20201125
np.random.seed(SEED)
# SimHei lets matplotlib render Chinese labels; turning off unicode_minus
# keeps the minus sign displayable.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# 次序统计
## 计算最小值

In [2]:
# 5x5 sample matrix; the trailing 1 is the global minimum.
x = np.array([
    [11, 12, 13, 14, 15],
    [16, 17, 18, 19, 20],
    [21, 22, 23, 24, 25],
    [26, 27, 28, 29, 30],
    [31, 32, 33, 34, 1],
])
y = x.min()          # smallest element of the whole array
print(y)
y = x.min(axis=0)    # minimum of each column
print(y)
y = x.min(axis=1)    # minimum of each row
print(y)

1
[11 12 13 14  1]
[11 16 21 26  1]


## 计算最大值

In [3]:
y = x.max()          # largest element of the whole array
print(y)
y = x.max(axis=0)    # maximum of each column
print(y)
y = x.max(axis=1)    # maximum of each row
print(y)

34
[31 32 33 34 30]
[15 20 25 30 34]


## 计算极差

In [4]:
# Re-seed so the sample matrix is the same on every run of this cell.
np.random.seed(SEED)
x = np.random.randint(low=0, high=20, size=(4, 5))  # 4x5 integers in [0, 20)
print(x)

[[ 3 13 15  6 11]
 [13  8 17 18  4]
 [19  1 14 17  3]
 [13 17  4 14  7]]


In [5]:
# Range (max - min): whole array, then each column (axis=0), then each row (axis=1).
print(*(np.ptp(x, axis=ax) for ax in (None, 0, 1)), sep="\n")

18
[16 16 13 12  8]
[12 14 18 13]


## 计算分位数

In [6]:
# Re-seed so the sample matrix is the same on every run of this cell.
np.random.seed(SEED)
x = np.random.randint(low=0, high=20, size=(4, 5))  # 4x5 integers in [0, 20)
print(x)

[[ 3 13 15  6 11]
 [13  8 17 18  4]
 [19  1 14 17  3]
 [13 17  4 14  7]]


In [7]:
# 25th and 50th percentiles: flattened array, then each column (axis=0),
# then each row (axis=1).
print(*(np.percentile(x, [25, 50], axis=ax) for ax in (None, 0, 1)), sep="\n")

[ 5.5 13. ]
[[10.5   6.25 11.5  12.    3.75]
 [13.   10.5  14.5  15.5   5.5 ]]
[[ 6.  8.  3.  7.]
 [11. 13. 14. 13.]]


# 均值与方差
## 计算中位数

In [8]:
# Re-seed, draw the 4x5 sample, then show it together with the median of
# all its elements.
np.random.seed(SEED)
x = np.random.randint(low=0, high=20, size=(4, 5))
print(x, np.median(x), sep="\n")

[[ 3 13 15  6 11]
 [13  8 17 18  4]
 [19  1 14 17  3]
 [13 17  4 14  7]]
13.0


In [9]:
print(np.median(x,axis=0)) # median of each column

[13.  10.5 14.5 15.5  5.5]


In [10]:
print(np.median(x,axis=1)) # median of each row (axis=1 reduces across the columns)

[11. 13. 14. 13.]


## 平均值

In [11]:
# No re-seed here: the draw continues the global RNG stream, so the values
# depend on the cells executed before this one.
x = np.random.randint(low=0, high=20, size=(4, 5))
print(x, x.mean(), sep="\n")

[[ 8 19  6 14  0]
 [ 1  1  9  9 12]
 [13 15  1 11  7]
 [ 3  8  5  8  9]]
7.95


In [12]:
print(x.mean(axis=0))  # mean of each column

[ 6.25 10.75  5.25 10.5   7.  ]


In [13]:
print(x.mean(axis=1))  # mean of each row

[9.4 6.4 9.4 6.6]


## 加权平均值

In [14]:
# No re-seed: values continue the global RNG stream.
x = np.random.randint(low=0, high=20, size=(4, 5))
print(x)
# Without weights np.average is the plain mean: overall, per column, per row.
print(*(np.average(x, axis=ax) for ax in (None, 0, 1)), sep="\n")

[[ 2  8  2 14 18]
 [ 4  3  2  2  8]
 [ 6 18  5 15  1]
 [ 5 19 12 16 11]]
8.55
[ 4.25 12.    5.25 11.75  9.5 ]
[ 8.8  3.8  9.  12.6]


In [15]:
# Integer weights in [0, 10), one per element of x.
w = np.random.randint(low=0, high=10, size=(4, 5))
print(w)
# Weighted mean sum(w * x) / sum(w): overall, per column (axis=0), per row (axis=1).
print(*(np.average(x, weights=w, axis=ax) for ax in (None, 0, 1)), sep="\n")

[[4 1 2 9 5]
 [4 5 0 5 4]
 [3 8 5 9 3]
 [5 1 3 0 1]]
9.415584415584416
[ 4.1875     12.4         6.5        11.7826087  10.46153846]
[11.23809524  4.05555556 11.60714286  9.1       ]


## 计算方差

In [16]:
np.random.seed(SEED)
x = np.random.randint(low=10, high=30, size=(5, 5))  # 5x5 integers in [10, 30)
print(x)
print(x.mean())
# Population variance (ddof=0) from NumPy ...
y = x.var()
print(y)
# ... and the same quantity by hand: mean of squared deviations.
y = ((x - x.mean()) ** 2).mean()
print(y)

[[13 23 25 16 21]
 [23 18 27 28 14]
 [29 11 24 27 13]
 [23 27 14 24 17]
 [18 29 16 24 10]]
20.56
34.6464
34.6464


In [17]:
# Sample variance: ddof=1 divides by (n - 1) instead of n.
y = x.var(ddof=1)
print(y)
# Hand-computed equivalent: sum of squared deviations over (n - 1).
y = ((x - x.mean()) ** 2).sum() / (x.size - 1)
print(y)
print(x.size)  # n, the total number of elements

36.09
36.09
25


In [18]:
# Population variance of each column (axis=0), then of each row (axis=1).
print(x.var(axis=0), x.var(axis=1), sep="\n")

[28.96 42.24 26.96 17.76 14.  ]
[19.84 28.4  54.56 22.8  43.04]


## 计算标准差

In [19]:
np.random.seed(SEED)
x = np.random.randint(low=10, high=30, size=(5, 5))  # 5x5 integers in [10, 30)
print(x)
# Population standard deviation from NumPy ...
y = x.std()
print(y)
# ... which equals the square root of the population variance.
y = np.sqrt(x.var())
print(y)

[[13 23 25 16 21]
 [23 18 27 28 14]
 [29 11 24 27 13]
 [23 27 14 24 17]
 [18 29 16 24 10]]
5.886119264846746
5.886119264846746


In [22]:
# Population standard deviation of each column (axis=0), then of each row (axis=1).
print(x.std(axis=0), x.std(axis=1), sep="\n")

[5.38144962 6.49923072 5.19230199 4.2142615  3.74165739]
[4.45421149 5.32916504 7.38647413 4.77493455 6.56048779]


## 计算协方差矩阵

In [20]:
x = [1, 2, 3, 4, 6]
y = [0, 2, 5, 6, 7]
# A single 1-D sample gives its sample variance (divides by n - 1);
# two samples give the 2x2 covariance matrix.
print(np.cov(x), np.cov(y), np.cov(x, y), sep="\n")

3.7
8.5
[[3.7  5.25]
 [5.25 8.5 ]]


In [21]:
# Cross-check the variance of x: population form (divide by n) via NumPy and
# by hand, then sample form (divide by n - 1) via NumPy and by hand.
print(np.var(x))  # population variance, ddof=0
print(np.mean((x-np.mean(x))**2))  # same: mean of squared deviations
print(np.var(x,ddof=1))  # sample variance, matches np.cov(x)
print(np.sum((x-np.mean(x))**2)/(len(x)-1))  # same: squared deviations over n - 1

2.96
2.96
3.7
3.7


In [22]:
# Cross-check the variance of y: population form (divide by n) via NumPy and
# by hand, then sample form (divide by n - 1) via NumPy and by hand.
print(np.var(y))  # population variance, ddof=0
print(np.mean((y-np.mean(y))**2))  # same: mean of squared deviations
print(np.var(y,ddof=1))  # sample variance, matches np.cov(y)
print(np.sum((y-np.mean(y))**2)/(len(y)-1))  # same: squared deviations over n - 1

6.8
6.8
8.5
8.5


In [23]:
x = [1,2,3,4,6]
y = [0,2,5,6,7]
# Sample covariance by hand: dot product of the two deviation vectors
# divided by (n - 1); this is the off-diagonal entry of np.cov(x, y).
print(np.dot( x-np.mean(x),y-np.mean(y) ) /(len(x)-1))

5.25


In [24]:
x = [1,2,3,4,6]
y = [0,2,5,6,7]
# Population covariance via E[xy] - E[x]E[y]. It divides by n rather than
# n - 1, so it is smaller than the np.cov value by a factor of (n - 1) / n.
print(np.dot(x,y)/len(x)-np.mean(x)*np.mean(y))

4.199999999999999


## 计算相关系数

In [25]:
np.random.seed(SEED)
# Draw a 2x4 integer array in [0, 20) and unpack its two rows into x and y.
x,y = np.random.randint(0,20,size=(2,4))
print(x)
print(y)
# Pearson correlation matrix; the off-diagonal entries are corr(x, y).
z = np.corrcoef(x,y)
print(z)
# Manual check: a is the covariance with a 1/n factor, b and c are the
# standard deviations with the same 1/n factor, so the factors cancel in a/(b*c).
a = np.dot(x-np.mean(x),y-np.mean(y))/len(x)
b = np.sqrt(np.dot(x-np.mean(x),x-np.mean(x))/(len(x)))
c = np.sqrt(np.dot(y-np.mean(y),y-np.mean(y))/(len(y)))
print(a/(b*c))

[ 3 13 15  6]
[11 13  8 17]
[[ 1.         -0.45481243]
 [-0.45481243  1.        ]]
-0.4548124285518578


In [26]:
# Same correlation without any 1/n normalisation: the dot product of the
# deviation vectors divided by the product of their Euclidean norms.
a = np.dot(x-np.mean(x),y-np.mean(y))
b = np.sqrt(np.dot(x-np.mean(x),x-np.mean(x)))
c = np.sqrt(np.dot(y-np.mean(y),y-np.mean(y)))
a/b/c

-0.4548124285518577

## 直方图

In [27]:
# Monotonically increasing bin edges and the values to place into them.
bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0])
x = np.array([0.2, 6.4, 3.0, 1.6])
# right=False (the default): index k means bins[k-1] <= value < bins[k].
inds = np.digitize(x, bins, right=False)
print(inds)

[1 4 3 2]


In [28]:
# np.digitize with its default right=False assigns index k when
# bins[k-1] <= value < bins[k], so the upper bound is strict ("<", not "<=").
for value, k in zip(x, inds):
    # k == 0 or k == bins.size means the value lies outside the edges; report
    # an open-ended interval instead of wrapping around or raising IndexError.
    lower = bins[k - 1] if k > 0 else -np.inf
    upper = bins[k] if k < bins.size else np.inf
    print(lower, "<=", value, "<", upper)

0.0 <= 0.2 < 1.0
4.0 <= 6.4 < 10.0
2.5 <= 3.0 < 4.0
1.0 <= 1.6 < 2.5


# 练习题

In [29]:
# Linear regression exercise data.
# Each row of X starts with a constant 1 (presumably the intercept term)
# followed by two feature values; y holds one target per row.
X = [
    [1, 6, 2],
    [1, 8, 1],
    [1, 10, 0],
    [1, 14, 2],
    [1, 18, 0],
]
y = [[7], [9], [13], [17.5], [18]]

In [30]:
# Show the design matrix and the targets, one per line.
print(X, y, sep="\n")

[[1, 6, 2], [1, 8, 1], [1, 10, 0], [1, 14, 2], [1, 18, 0]]
[[7], [9], [13], [17.5], [18]]


In [35]:
# Normal-equation solution: solve (X^T X) beta = X^T y.
# np.linalg.solve is used rather than forming the explicit inverse, which is
# slower and numerically less accurate for the same result.
print(np.linalg.solve(np.dot(np.transpose(X), X), np.dot(np.transpose(X), y)))

[[1.1875    ]
 [1.01041667]
 [0.39583333]]


In [37]:
# Least-squares solution; element [0] of the returned tuple is the coefficient array.
# rcond=None selects the current default cutoff explicitly and avoids the
# FutureWarning that NumPy versions before 2.0 emit when rcond is omitted.
np.linalg.lstsq(X, y, rcond=None)[0]

  


array([[1.1875    ],
       [1.01041667],
       [0.39583333]])

In [39]:
# Exercise: find the maximum of every row of a 2-D NumPy array.
# The 5x3 input holds integers in [1, 10); it is not re-seeded, so the values
# depend on the global RNG state.
a = np.random.randint(low=1, high=10, size=[5, 3])
print(a)
a.max(axis=1)


[[2 2 6]
 [5 2 8]
 [4 9 6]
 [9 3 5]
 [9 3 9]]


array([6, 8, 9, 9, 9])