In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
data1 = [1,2,3]

In [3]:
data1

[1, 2, 3]

In [4]:
data2 = np.array([1,2,3])

In [5]:
data2

array([1, 2, 3])

In [6]:
data3 = pd.Series([1,2,3])

In [7]:
data3

0    1
1    2
2    3
dtype: int64

In [8]:
# Create a set of numeric data drawn from a normal distribution
# (mean 0, standard deviation 10, 10 samples)
# NOTE(review): no random seed is set in the cells shown here, so the values
# presumably differ on every run — confirm whether a seed should be fixed
data4 = np.random.normal(0,10,size=10)

In [9]:
data4

array([-12.40560225,  -8.70345216,  -4.97941255,  -1.94967252,
         7.28696093,  -7.06381858,  -2.51574862,   5.10898802,
        13.24926772,  -8.8636106 ])

In [10]:
# Create a set of uniformly distributed categorical data
# (the integers stand for the categories)
data5 = np.random.randint(0,10,size=10)

In [11]:
data5

array([1, 9, 8, 0, 6, 6, 9, 6, 2, 9])

### 集中趋势

#### 均值
$$\mu=\frac{\displaystyle\sum\limits_{i=1}^N{X_i}}{N}$$

In [12]:
def do_mean(nums):
    """Return the arithmetic mean of ``nums``.

    The elements are added with the built-in ``sum`` and the total is
    divided by ``len(nums)``, so ``nums`` must be a sized collection of
    numbers.
    """
    total = sum(nums)
    count = len(nums)
    return total / count

In [13]:
do_mean(data1)

2.0

In [14]:
np.mean(data1)

2.0

#### 中位数

In [15]:
def do_median(nums):
    """Return the median of ``nums``.

    The values are sorted into a new list (the input is not modified).
    With an odd number of values the middle one is returned as-is; with an
    even number the two middle values are averaged.
    """
    size = len(nums)
    ordered = sorted(nums)
    mid = size // 2

    if size % 2 == 0:
        # Even count: average the two values around the centre.
        return (ordered[mid - 1] + ordered[mid]) / 2
    # Odd count: the single central value.
    return ordered[mid]

In [16]:
do_median(data5)

6.0

In [17]:
np.median(data5)

6.0

#### 众数

In [18]:
data5

array([1, 9, 8, 0, 6, 6, 9, 6, 2, 9])

In [19]:
np.bincount(data5)

array([1, 1, 1, 0, 0, 0, 3, 0, 1, 3], dtype=int64)

In [20]:
# # 我们可以看到x中最大的数为7，因此bin的数量为8，那么它的索引值为0->7
# x = np.array([0, 1, 1, 3, 2, 1, 7]) 

# # 索引0出现了1次，索引1出现了3次......索引5出现了0次......
# np.bincount(x)
# #因此，输出结果为：array([1, 3, 1, 1, 0, 0, 0, 1])


# # 我们可以看到x中最大的数为7，因此bin的数量为8，那么它的索引值为0->7
# x = np.array([7, 6, 2, 1, 4])

# # 索引0出现了0次，索引1出现了1次......索引5出现了0次......
# np.bincount(x)
# #输出结果为：array([0, 1, 1, 0, 1, 0, 1, 1])

In [21]:
# Count how many times each integer value occurs in the original data;
# counts[i] is the number of occurrences of the value i
counts = np.bincount(data5)

# Index of the largest count, i.e. the most frequent value
# (np.argmax returns the first such index, so on a tie the smallest value wins)
mode_val = np.argmax(counts)

# Show the mode
print(mode_val)

6


In [22]:
# The same computation using scipy
stats.mode(data5)

ModeResult(mode=array([6]), count=array([3]))

In [23]:
# Extract the mode as a single value. Depending on the SciPy version,
# ModeResult.mode is either a one-element array or already a scalar, so
# normalise it to 1-D before indexing instead of relying on [0][0].
np.atleast_1d(stats.mode(data5).mode)[0]

6

### 离散程度

#### 极差
$$R=\max{(X)}-\min{(X)}$$

In [24]:
def do_ptp(nums):
    """Return the range (peak-to-peak) of ``nums``: largest value minus smallest."""
    largest = max(nums)
    smallest = min(nums)
    return largest - smallest

In [25]:
do_ptp(data4)

25.65486996126144

In [26]:
np.ptp(data4)

25.65486996126144

#### 方差
$$\sigma^2=\frac{\displaystyle\sum\limits_{i=1}^N (X_i-\mu)^2}{N}$$

In [27]:
def do_var(nums):
    """Return the population variance of ``nums``.

    This is the mean of the squared deviations from ``do_mean(nums)``;
    the sum of squares is divided by ``len(nums)`` (N, not N - 1).
    """
    center = do_mean(nums)
    squared_total = 0
    for value in nums:
        deviation = value - center
        squared_total += deviation ** 2
    return squared_total / len(nums)

In [28]:
do_var(data4)

60.43652801408882

In [29]:
np.var(data4)

60.43652801408882

#### 标准差
$$\sigma =\sqrt{\sigma^2}$$

In [30]:
def do_std(nums):
    """Return the population standard deviation of ``nums``.

    Computed as the square root of ``do_var(nums)``.
    """
    variance = do_var(nums)
    return variance ** 0.5

In [31]:
do_std(data4)

7.774093388562349

In [32]:
np.std(data4)

7.774093388562349

#### 变异系数
$$CV=\frac{\sigma}{\mu}$$

In [34]:
def do_cv(nums):
    """Return the coefficient of variation of ``nums``.

    This is ``do_std(nums)`` divided by ``do_mean(nums)``. The mean is used
    with its sign, so data with a negative mean gives a negative result.
    """
    spread = do_std(nums)
    center = do_mean(nums)
    return spread / center

In [35]:
do_cv(data4)

-3.7310692308360403

In [36]:
np.std(data4) / np.mean(data4)

-3.7310692308360407

### 偏差程度

#### Z-分数
$$Z_i=\frac{X_i-\mu}{\sigma}$$

In [37]:
# Z-score of the first value in data4
(data4[0] -np.mean(data4)) / np.std(data4) 

-1.3277422418733251

In [38]:
(data4 -np.mean(data4)) / np.std(data4) 

array([-1.32774224, -0.85152593, -0.37249392,  0.0172287 ,  1.20535869,
       -0.64061599, -0.055587  ,  0.92520089,  1.97230429, -0.87212749])

In [39]:
(data3-np.mean(data3)) / np.std(data3)

0   -1.224745
1    0.000000
2    1.224745
dtype: float64

### 相关程度

#### 协方差
$$Cov(X,Y)=\frac{\displaystyle\sum\limits_{i=1}^N (X_i-\bar{X})(Y_i-\bar{Y})}{N}$$

In [40]:
data_new = np.array([data4,data5])

In [41]:
data_new

array([[-12.40560225,  -8.70345216,  -4.97941255,  -1.94967252,
          7.28696093,  -7.06381858,  -2.51574862,   5.10898802,
         13.24926772,  -8.8636106 ],
       [  1.        ,   9.        ,   8.        ,   0.        ,
          6.        ,   6.        ,   9.        ,   6.        ,
          2.        ,   9.        ]])

In [42]:
def do_cov(num_matrix):
    """Return the population covariance matrix of the rows of ``num_matrix``.

    Each row of ``num_matrix`` is treated as one variable and each position
    within a row as one observation. Entry ``[m][n]`` of the result is the
    sum of ``(x - mean_m) * (y - mean_n)`` over the paired observations of
    rows ``m`` and ``n``, divided by the length of row ``m`` (N, not N - 1).
    The diagonal therefore holds ``do_var`` of each row.

    Parameters
    ----------
    num_matrix : sequence of numeric sequences
        One row per variable. Rows are paired with ``zip``, so rows of
        unequal length are silently truncated to the shorter one.

    Returns
    -------
    numpy.ndarray
        Square array with ``len(num_matrix)`` rows and columns.
    """
    length = len(num_matrix)

    # Centre every row once up front. A row's mean does not depend on which
    # (m, n) pair is being evaluated, so recomputing it for every element
    # inside the innermost loop only repeats work.
    centered = []
    for row in num_matrix:
        row_mean = do_mean(row)
        centered.append([value - row_mean for value in row])

    res = [[0 for _ in range(length)] for _ in range(length)]
    for m in range(length):
        for n in range(length):
            products = [x * y for x, y in zip(centered[m], centered[n])]
            res[m][n] = sum(products) / len(num_matrix[m])

    return np.array(res)

In [43]:
do_cov(data_new)

array([[60.43652801, -5.7812724 ],
       [-5.7812724 , 10.64      ]])

In [44]:
# Compute the covariance between the two variables data4 and data5
# bias=1 makes the result divide by N (the default divides by N-1)
# The result is a matrix: the entry in row i, column j is the covariance of
# variable i and variable j, and the diagonal holds the variances
np.cov(data_new,bias=1)

array([[60.43652801, -5.7812724 ],
       [-5.7812724 , 10.64      ]])

#### 相关系数
$$r(X,Y)=\frac{Cov(X,Y)}{\sigma_X\sigma_Y}$$

In [46]:
def do_corr(num_matrix):
    """Return the correlation-coefficient matrix of the rows of ``num_matrix``.

    Entry ``[m][n]`` is the covariance of rows ``m`` and ``n`` (from
    ``do_cov``) divided by the product of their standard deviations (from
    ``do_std``), following r(X, Y) = Cov(X, Y) / (sigma_X * sigma_Y).

    Parameters
    ----------
    num_matrix : sequence of numeric sequences
        One row per variable. A constant row has a standard deviation of
        zero, so its entries involve a division by zero.

    Returns
    -------
    numpy.ndarray
        The array produced by ``do_cov``, rescaled in place.
    """
    length = len(num_matrix)
    res = do_cov(num_matrix)

    # One standard deviation per row, computed once. The denominator is the
    # product of the two standard deviations; the previous form nested the
    # second do_std call inside the first one's argument, which only gave
    # the right number for NumPy rows and failed for plain lists.
    stds = [do_std(row) for row in num_matrix]

    for m in range(length):
        for n in range(length):
            res[m][n] = res[m][n] / (stds[m] * stds[n])
    return res

In [47]:
do_corr(data_new)

array([[ 1.        , -0.22798321],
       [-0.22798321,  1.        ]])

In [48]:
np.corrcoef(data_new)

array([[ 1.        , -0.22798321],
       [-0.22798321,  1.        ]])