In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.datasets import load_iris
SEED = 20201201
import pandas as pd
np.random.seed(SEED)
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号

**1. 导入鸢尾属植物数据集**

In [2]:
iris = load_iris()

In [3]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [6]:
print(iris.data)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [23]:
cols_list = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
df_iris = pd.DataFrame(iris.data, columns=cols_list)

In [24]:
df_iris

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [25]:
df_iris['SepalLength']

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: SepalLength, Length: 150, dtype: float64

**2. 求出鸢尾属植物萼片长度的平均值、中位数和标准差（第1列，sepallength）**

In [26]:
# 均值
print(np.mean(df_iris['SepalLength']))
# 中位数
print(np.median(df_iris['SepalLength']))
# 标准差
print(np.std(df_iris['SepalLength']))

5.843333333333335
5.8
0.8253012917851409


**3. 创建一种标准化形式的鸢尾属植物萼片长度，其值正好介于0和1之间，这样最小值为0，最大值为1（第1列，sepallength）**

In [27]:
# 标准化
minv = np.amin(df_iris['SepalLength'])
print(minv)
maxv = np.amax(df_iris['SepalLength'])
print(maxv)
df_iris['SepalLength_std'] = (df_iris['SepalLength']-minv)/(maxv-minv)

4.3
7.9


In [28]:
df_iris

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,SepalLength_std
0,5.1,3.5,1.4,0.2,0.222222
1,4.9,3.0,1.4,0.2,0.166667
2,4.7,3.2,1.3,0.2,0.111111
3,4.6,3.1,1.5,0.2,0.083333
4,5.0,3.6,1.4,0.2,0.194444
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0.666667
146,6.3,2.5,5.0,1.9,0.555556
147,6.5,3.0,5.2,2.0,0.611111
148,6.2,3.4,5.4,2.3,0.527778


In [29]:
df_iris['SepalLength_std']

0      0.222222
1      0.166667
2      0.111111
3      0.083333
4      0.194444
         ...   
145    0.666667
146    0.555556
147    0.611111
148    0.527778
149    0.444444
Name: SepalLength_std, Length: 150, dtype: float64

**4. 找到鸢尾属植物萼片长度的第5和第95百分位数（第1列，sepallength）**

In [30]:
x = np.percentile(df_iris['SepalLength'],[5,95])

In [31]:
print(x)

[4.6   7.255]


**5.把iris_data数据集中的20个随机位置修改为np.nan值**

In [63]:
x_size,y_size =   150,4
na_x,na_y = np.random.randint(x_size,size=20),np.random.randint(y_size,size=20)
for i in range(20):
    df_iris.loc[na_x[i]][na_y[i]] = np.nan
df_iris.head(10)

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,SepalLength_std
0,,3.5,1.4,0.2,0.222222
1,,,1.4,,0.166667
2,4.7,,1.3,0.2,0.111111
3,,,1.5,0.2,0.083333
4,,,1.4,0.2,0.194444
5,,,1.7,0.4,0.305556
6,4.6,,1.4,0.3,0.083333
7,5.0,3.4,1.5,0.2,0.194444
8,,,1.4,0.2,0.027778
9,4.9,3.1,1.5,0.1,0.166667


**6. 在iris_data的sepallength中查找缺失值的个数和位置（第1列）**

In [88]:
df_iris

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,SepalLength_std
0,,3.5,1.4,0.2,0.222222
1,,,1.4,,0.166667
2,4.7,,1.3,0.2,0.111111
3,,,1.5,0.2,0.083333
4,,,1.4,0.2,0.194444
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0.666667
146,6.3,2.5,5.0,1.9,0.555556
147,6.5,3.0,5.2,2.0,0.611111
148,6.2,3.4,5.4,2.3,0.527778


In [86]:
# 用numpy里的isnan做
x = np.isnan(df_iris)['SepalLength']
print(np.sum(x))

12


In [92]:
# 用pandas里的isna()
df_iris.isna()['SepalLength']

0       True
1       True
2      False
3       True
4       True
       ...  
145    False
146    False
147    False
148    False
149     True
Name: SepalLength, Length: 150, dtype: bool

**7.筛选具有 sepallength（第1列）< 5.0 并且 petallength（第3列）> 1.5 的 iris_data行**

In [94]:
index = np.where(np.logical_and(df_iris['SepalLength']<5.0,df_iris['PetalLength']>1.5))
print(df_iris.loc[index])

     SepalLength  SepalWidth  PetalLength  PetalWidth  SepalLength_std
11           4.8         3.4          1.6         0.2         0.138889
24           4.8         3.4          1.9         0.2         0.138889
29           4.7         3.2          1.6         0.2         0.111111
30           4.8         3.1          1.6         NaN         0.138889
57           4.9         2.4          3.3         1.0         0.166667
106          4.9         2.5          4.5         1.7         0.166667


In [100]:
# 用pandas的方法做
print(df_iris[(df_iris['SepalLength']<5.0) & (df_iris['PetalLength']>1.5)])

     SepalLength  SepalWidth  PetalLength  PetalWidth  SepalLength_std
11           4.8         3.4          1.6         0.2         0.138889
24           4.8         3.4          1.9         0.2         0.138889
29           4.7         3.2          1.6         0.2         0.111111
30           4.8         3.1          1.6         NaN         0.138889
57           4.9         2.4          3.3         1.0         0.166667
106          4.9         2.5          4.5         1.7         0.166667


**8. 选择没有任何 nan 值的 iris_data行**

In [101]:
print(df_iris[np.sum(df_iris.isna(),axis=1)==0])

     SepalLength  SepalWidth  PetalLength  PetalWidth  SepalLength_std
7            5.0         3.4          1.5         0.2         0.194444
9            4.9         3.1          1.5         0.1         0.166667
10           5.4         3.7          1.5         0.2         0.305556
11           4.8         3.4          1.6         0.2         0.138889
12           4.8         3.0          1.4         0.1         0.138889
..           ...         ...          ...         ...              ...
144          6.7         3.3          5.7         2.5         0.666667
145          6.7         3.0          5.2         2.3         0.666667
146          6.3         2.5          5.0         1.9         0.555556
147          6.5         3.0          5.2         2.0         0.611111
148          6.2         3.4          5.4         2.3         0.527778

[125 rows x 5 columns]


**9.计算 iris_data 中sepalLength（第1列）和petalLength（第3列）之间的相关系数。**

In [104]:
cols_list = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
df_iris = pd.DataFrame(iris.data, columns=cols_list)

In [105]:
x = np.corrcoef(df_iris['SepalLength'],df_iris['PetalLength'])

In [106]:
print(x)

[[1.         0.87175378]
 [0.87175378 1.        ]]


**10. 找出iris_data是否有任何缺失值**

In [111]:
df_iris[np.sum(df_iris.isna(),axis=1)>=1]

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth


**11. 在numpy数组中将所有出现的nan替换为0**

In [112]:
df_iris[df_iris.isna()]= 0

**12. 找出鸢尾属植物物种中的唯一值和唯一值出现的数量**

In [115]:
num,cnt = np.unique(df_iris,return_counts=True)
print(num[cnt==1])

[0.5 0.6 7.  7.1 7.3 7.4 7.6 7.9]


**13. 将 iris_data 的花瓣长度（第3列）以形成分类变量的形式显示。定义：Less than 3 -->
'small'；3-5 --> 'medium'；'>=5 --> 'large'。**

In [120]:
def map_fun(x):
    if x<=3:
        return 'small';
    elif x<5:
        return 'medium';
    else:
        return 'large';
df_iris['PetalLength'].apply(lambda x:map_fun(x))

0      small
1      small
2      small
3      small
4      small
       ...  
145    large
146    large
147    large
148    large
149    large
Name: PetalLength, Length: 150, dtype: object

**14. 在 iris_data 中创建一个新列，其中 volume 是 (pi x petallength x sepallength ^ 2）/ 3 。**

In [122]:
df_iris['volume'] = (np.pi*df_iris['PetalLength']*df_iris['SepalLength']**2)/3
df_iris

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,volume
0,5.1,3.5,1.4,0.2,38.132652
1,4.9,3.0,1.4,0.2,35.200498
2,4.7,3.2,1.3,0.2,30.072372
3,4.6,3.1,1.5,0.2,33.238050
4,5.0,3.6,1.4,0.2,36.651914
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,244.445230
146,6.3,2.5,5.0,1.9,207.816354
147,6.5,3.0,5.2,2.0,230.069302
148,6.2,3.4,5.4,2.3,217.373079


**15. 随机抽鸢尾属植物的种类，使得Iris-setosa的数量是Iris-versicolor和Iris-virginica数量的两
倍。**

In [124]:
species = np.array(['Iris‐setosa', 'Iris‐versicolor', 'Iris‐virginica'])
species_sample  = np.random.choice(species,10000,p=[0.5,0.25,0.25])
print(np.unique(species_sample,return_counts=True))

(array(['Iris‐setosa', 'Iris‐versicolor', 'Iris‐virginica'], dtype='<U15'), array([5025, 2561, 2414], dtype=int64))


**16. 根据 sepallength 列对数据集进行排序**

In [125]:
index = np.argsort(df_iris['SepalLength'])
print(index)

0       13
1       42
2       38
3        8
4       41
      ... 
145    122
146    118
147    117
148    135
149    131
Name: SepalLength, Length: 150, dtype: int64


In [126]:
df_iris.loc[index]

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,volume
13,4.3,3.0,1.1,0.1,21.298951
42,4.4,3.2,1.3,0.2,26.355868
38,4.4,3.0,1.3,0.2,26.355868
8,4.4,2.9,1.4,0.2,28.383242
41,4.5,2.3,1.3,0.3,27.567476
...,...,...,...,...,...
122,7.7,2.8,6.7,2.0,415.991897
118,7.7,2.6,6.9,2.3,428.409565
117,7.7,3.8,6.7,2.2,415.991897
135,7.7,3.0,6.1,2.3,378.738891


**17. 在鸢尾属植物数据集中找到最常见的花瓣长度值（第3列）**

In [127]:
vals, counts = np.unique(df_iris['PetalLength'],return_counts=True)
# 花瓣长度值
print(vals[np.argmax(counts)])
# 出现次数
print(np.amax(counts))

1.4
13


**18. 在鸢尾花数据集的 petalwidth（第4列）中查找第一次出现的值大于1.0的位置**

In [129]:
index = np.where(df_iris['PetalWidth']>1.0)
print(index)
print(index[0][0])

(array([ 50,  51,  52,  53,  54,  55,  56,  58,  59,  61,  63,  64,  65,
        66,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  80,
        82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  94,  95,
        96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
       109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
       122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
       148, 149], dtype=int64),)
50
