## 1.4 Series 的常用方法


In [6]:
import numpy as np
import pandas as pd
from pandas.io.pytables import dropna_doc

arr = np.array([10,20,30,30,30,40,40,50,60,60,60,60,70,90,None])
s = pd.Series(arr, name="example_value")
print(s.index)
print(s.values)
print(s.name)
print(s.dtype)
print(s.size)
print(s.shape)

RangeIndex(start=0, stop=15, step=1)
[10 20 30 30 30 40 40 50 60 60 60 60 70 90 None]
example_value
object
15
(15,)


#### 1. 基本统计方法，用于数值的最大/最小/平均/中位数/众数等计算

In [11]:
print(f"Total size(include None) of s: {s.size}, total values(None not included) of s: {s.count()}")
print(f"Max of s: {s.max()}, Min of s: {s.min()}")
print(f"Sum of s: {s.sum()}, mean of s: {s.mean()}")
print(f"Median of s: {s.median()}")
print(f"Mode of s: {s.mode().values}")  # 众数可能有多个
print(f"Standard deviation of s: {s.std()}, Variance of s: {s.var()}")

# describe 方法一次性返回多个统计值
print(f"Describe of s:\n{s.describe()}")

Total size(include None) of s: 15, total values(None not included) of s: 14
Max of s: 90, Min of s: 10
Sum of s: 650, mean of s: 46.42857142857143
Median of s: 45.0
Mode of s: [60]
Standard deviation of s: 21.699749327564152, Variance of s: 470.87912087912093
Describe of s:
count     14
unique     8
top       60
freq       4
Name: example_value, dtype: int64


#### 2. 值判断 & 值计数方法，主要解决“出现了哪些不同的值，分别出现了几次，值是否唯一”

##### 2.1 .unique(): 去重，返回一个包含唯一值的ndarray, 默认包含NaN

In [14]:
u = s.unique()
print(u)

[10 20 30 40 50 60 70 90 None]


##### 2.2 .nunique(): 计算唯一值的数量，默认不包含NaN，可选参数 dropna 控制是否包含 NaN

In [16]:
n_unique_include_na = s.nunique()
print(n_unique_include_na)
# 包含nan的情况
n_unique = s.nunique(dropna=False)
print(n_unique)

8
9


##### 2.3 .value_counts(): 计算每个值出现的次数，返回一个新的Series，索引是原Series中的值，值为出现次数，默认是不包含nan，可通过参数控制

In [19]:
v_count = s.value_counts()
print(v_count)
v_count_include_na = s.value_counts(dropna=False)
print(v_count_include_na)
# 计算每个值出现的频率
v_count_include_frec = s.value_counts(dropna=False, normalize=True)
print(v_count_include_frec)

example_value
60    4
30    3
40    2
10    1
20    1
50    1
70    1
90    1
Name: count, dtype: int64
example_value
60      4
30      3
40      2
10      1
20      1
50      1
70      1
90      1
None    1
Name: count, dtype: int64
example_value
60      0.266667
30      0.200000
40      0.133333
10      0.066667
20      0.066667
50      0.066667
70      0.066667
90      0.066667
None    0.066667
Name: proportion, dtype: float64


#### 3. 处理Nan的方法

##### 3.1 缺失值判断，返回Bool Series，可以用于Bool索引过滤

In [21]:
f_isna = s.isna()
print(f_isna)
f_notna = s.notna()
print(f_notna)

s_isna = s[f_isna]
print(s_isna)
s_notna = s[f_notna]
print(s_notna)


0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14     True
Name: example_value, dtype: bool
0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14    False
Name: example_value, dtype: bool
14    None
Name: example_value, dtype: object
0     10
1     20
2     30
3     30
4     30
5     40
6     40
7     50
8     60
9     60
10    60
11    60
12    70
13    90
Name: example_value, dtype: object


##### 3.2 丢弃缺失值，返回一个新的Series

In [22]:
s_notna = s.dropna()
print(s_notna)

0     10
1     20
2     30
3     30
4     30
5     40
6     40
7     50
8     60
9     60
10    60
11    60
12    70
13    90
Name: example_value, dtype: object


##### 3.3 缺失值填充，返回一个新的Series

In [25]:
s_filled = s.fillna(0)
print(s_filled)
s_ffilled = s.ffill()
print(s_ffilled)
s_bfilled = s.bfill() # 由于最后一个值是NaN，无法向后填充，仍然是NaN
print(s_bfilled)

0     10
1     20
2     30
3     30
4     30
5     40
6     40
7     50
8     60
9     60
10    60
11    60
12    70
13    90
14     0
Name: example_value, dtype: int64
0     10
1     20
2     30
3     30
4     30
5     40
6     40
7     50
8     60
9     60
10    60
11    60
12    70
13    90
14    90
Name: example_value, dtype: int64
0     10.0
1     20.0
2     30.0
3     30.0
4     30.0
5     40.0
6     40.0
7     50.0
8     60.0
9     60.0
10    60.0
11    60.0
12    70.0
13    90.0
14     NaN
Name: example_value, dtype: float64


  s_filled = s.fillna(0)
  s_ffilled = s.ffill()
  s_bfilled = s.bfill() # 由于最后一个值是NaN，无法向后填充，仍然是NaN


#### 4. 排序方法

##### 4.1 按值排序，返回一个新的Series

In [27]:
s_sorted_value = s.sort_values()
print(s_sorted_value)
s_sorted_value_desc = s.sort_values(ascending=False)
print(s_sorted_value_desc)

0       10
1       20
2       30
3       30
4       30
5       40
6       40
7       50
8       60
9       60
10      60
11      60
12      70
13      90
14    None
Name: example_value, dtype: object
13      90
12      70
8       60
9       60
10      60
11      60
7       50
5       40
6       40
2       30
3       30
4       30
1       20
0       10
14    None
Name: example_value, dtype: object


##### 4.2 按照索引排序，返回一个新的Series

In [29]:
s_sorted_index = s.sort_index()
print(s_sorted_index)
s_sorted_index_desc = s.sort_index(ascending=False)
print(s_sorted_index_desc)

0       10
1       20
2       30
3       30
4       30
5       40
6       40
7       50
8       60
9       60
10      60
11      60
12      70
13      90
14    None
Name: example_value, dtype: object
14    None
13      90
12      70
11      60
10      60
9       60
8       60
7       50
6       40
5       40
4       30
3       30
2       30
1       20
0       10
Name: example_value, dtype: object


##### 4.3 排名，针对重复值有不同的处理方式，返回不包含重复值的Series排名, index与原Series一致，value为排名

In [30]:
s_rank = s.rank(method='average') # 针对重复值，取平均排名
print(s_rank)
s_rank_min = s.rank(method='min') # 针对重复值，取最小排名
print(s_rank_min)
s_rank_max = s.rank(method='max') # 针对重复值，取最大排名
print(s_rank_max)


0      1.0
1      2.0
2      4.0
3      4.0
4      4.0
5      6.5
6      6.5
7      8.0
8     10.5
9     10.5
10    10.5
11    10.5
12    13.0
13    14.0
14     NaN
Name: example_value, dtype: float64
0      1.0
1      2.0
2      3.0
3      3.0
4      3.0
5      6.0
6      6.0
7      8.0
8      9.0
9      9.0
10     9.0
11     9.0
12    13.0
13    14.0
14     NaN
Name: example_value, dtype: float64
0      1.0
1      2.0
2      5.0
3      5.0
4      5.0
5      7.0
6      7.0
7      8.0
8     12.0
9     12.0
10    12.0
11    12.0
12    13.0
13    14.0
14     NaN
Name: example_value, dtype: float64
