# Pandas的函数应用

In [3]:
import numpy as np
import pandas as pd

## 1.apply 和 applymap

In [8]:
# 1.1 NumPy functions can be applied to a DataFrame directly.
# NOTE: no random seed is set in the visible cells, so these values change on every run.
df = pd.DataFrame(np.random.randn(5,4))
df

Unnamed: 0,0,1,2,3
0,0.237529,-0.542464,0.354658,-1.182168
1,0.481535,-0.642374,2.160597,-0.551493
2,1.301553,-0.187608,0.50256,-0.938298
3,-0.519396,1.525015,0.619676,1.467363
4,-0.217324,0.748628,0.192864,1.147797


In [9]:
# np.abs is applied element-wise to every value of df.
np.abs(df)

Unnamed: 0,0,1,2,3
0,0.237529,0.542464,0.354658,1.182168
1,0.481535,0.642374,2.160597,0.551493
2,1.301553,0.187608,0.50256,0.938298
3,0.519396,1.525015,0.619676,1.467363
4,0.217324,0.748628,0.192864,1.147797


In [10]:
# 1.2 Use apply to run a function over each column or each row.
def f(x):
    """Return the largest value of the column/row handed over by apply."""
    return x.max()

# Mind the axis: the default axis=0 passes each column to f.
df.apply(f)

0    1.301553
1    1.525015
2    2.160597
3    1.467363
dtype: float64

In [11]:
# axis=1 passes each row to f, giving one maximum per row.
df.apply(f,axis=1)

0    0.354658
1    2.160597
2    1.301553
3    1.525015
4    1.147797
dtype: float64

In [12]:
# 1.3 Apply a function to every individual element of the DataFrame.
def f2(x):
    """Format a single number as a string with two decimal places."""
    return '%.2f' % x

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use map when this pandas version provides it, otherwise fall back to applymap.
df.map(f2) if hasattr(pd.DataFrame, 'map') else df.applymap(f2)

Unnamed: 0,0,1,2,3
0,0.24,-0.54,0.35,-1.18
1,0.48,-0.64,2.16,-0.55
2,1.3,-0.19,0.5,-0.94
3,-0.52,1.53,0.62,1.47
4,-0.22,0.75,0.19,1.15


## 2.排序

### 2.1 索引排序

In [13]:
# A Series whose index labels are deliberately not in alphabetical order.
s1 = pd.Series(data=np.arange(4), index=['d', 'b', 'c', 'a'])
s1

d    0
b    1
c    2
a    3
dtype: int32

In [14]:
s1.sort_index()  # ascending order by default

a    3
b    1
c    2
d    0
dtype: int32

In [15]:
s1.sort_index(ascending = False)  # descending order

d    0
c    2
b    1
a    3
dtype: int32

In [17]:
# 4x3 frame holding 0..11, with unsorted row and column labels.
pd1 = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['b', 'd', 'c', 'a'],
    columns=['B', 'C', 'A'],
)
pd1

Unnamed: 0,B,C,A
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [19]:
# Sort the rows by their index labels.
pd1.sort_index()

Unnamed: 0,B,C,A
a,9,10,11
b,0,1,2
c,6,7,8
d,3,4,5


In [20]:
# Sort the columns by their labels (axis=1).
pd1.sort_index(axis=1)

Unnamed: 0,A,B,C
b,2,0,1
d,5,3,4
c,8,6,7
a,11,9,10


### 2.2 按值排序

In [23]:
# An integer Series cannot hold NaN. Cast to float64 explicitly instead of
# relying on silent upcasting during assignment (deprecated since pandas 2.1).
s1 = s1.astype('float64')
s1['a'] = np.nan
s1

d    0.0
b    1.0
c    2.0
a    NaN
dtype: float64

In [24]:
s1.sort_values()  # sort by value; missing values are placed last by default

d    0.0
b    1.0
c    2.0
a    NaN
dtype: float64

In [25]:
s1.sort_values(ascending=False)  # descending by value; the NaN entry still comes last

c    2.0
b    1.0
d    0.0
a    NaN
dtype: float64

In [26]:
# Show pd1 again before sorting it by values.
pd1

Unnamed: 0,B,C,A
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [30]:
pd1.sort_values(by=['A','B'])  # sort the rows by the listed columns

Unnamed: 0,B,C,A
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [31]:
# Small integer frame used for the value-sorting examples.
pd2 = pd.DataFrame({
    'a': [3, 7, 9, 0],
    'b': [1, -1, 4, 8],
    'c': [0, 6, -3, 2],
})
pd2

Unnamed: 0,a,b,c
0,3,1,0
1,7,-1,6
2,9,4,-3
3,0,8,2


In [32]:
pd2.sort_values(by='b')  # sort the rows by column b

Unnamed: 0,a,b,c
1,7,-1,6
0,3,1,0
2,9,4,-3
3,0,8,2


In [35]:
pd2.sort_values(by=['a','c'],ascending=False)  # sort by several columns, descending

Unnamed: 0,a,b,c
2,9,4,-3
1,7,-1,6
0,3,1,0
3,0,8,2


## 3.唯一值、值计数和成员资格

In [39]:
# Series with repeated index labels: two 'a' followed by five 'c'.
s1 = pd.Series([2, 6, 8, 9, 8, 3, 6], index=['a'] * 2 + ['c'] * 5)
s1

a    2
a    6
c    8
c    9
c    8
c    3
c    6
dtype: int64

In [37]:
# Get the unique values of a Series.
s2=s1.unique()   # returns an array
s2

array([2, 6, 8, 9, 3], dtype=int64)

In [40]:
# Whether all index labels are distinct (False here because the labels repeat).
s1.index.is_unique

False

In [41]:
# Rebuild s1 from the same values, this time with the default integer index.
s1 = pd.Series(data=[2, 6, 8, 9, 8, 3, 6])
s1

0    2
1    6
2    8
3    9
4    8
5    3
6    6
dtype: int64

In [42]:
# Count how many times each value occurs in the Series.
s1.value_counts()   # returns a Series

6    2
8    2
3    1
2    1
9    1
dtype: int64

In [43]:
# isin: test whether each element is among the given values; returns booleans.
s1.isin([8])  # mark the positions where s1 equals 8

0    False
1    False
2     True
3    False
4     True
5    False
6    False
dtype: bool

In [44]:
# Test against several values at once.
s1.isin([8,2])  # True where the element is 8 or 2

0     True
1    False
2     True
3    False
4     True
5    False
6    False
dtype: bool

In [45]:
# Integer frame used to demonstrate DataFrame.isin.
data = pd.DataFrame({
    'a': [3, 7, 9, 0],
    'b': [1, -1, 4, 8],
    'c': [0, 6, -3, 2],
})
data

Unnamed: 0,a,b,c
0,3,1,0
1,7,-1,6
2,9,4,-3
3,0,8,2


In [46]:
# True wherever a cell equals 2 or 4.
data.isin([2,4])

Unnamed: 0,a,b,c
0,False,False,False
1,False,False,False
2,False,True,False
3,False,False,True


## 4.处理缺失数据

In [48]:
# One row of random numbers plus three fixed rows, two of which contain NaN.
# NOTE: no random seed is set in the visible cells, so the first row changes on every run.
df3 = pd.DataFrame([np.random.randn(3), [1., 2., np.nan],
                    [np.nan, 4., np.nan], [1., 2., 3.]])
df3

Unnamed: 0,0,1,2
0,-0.053502,-0.83705,-0.303284
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


In [49]:
# 1. Detect missing values with isnull(): True marks a missing cell.
df3.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,True
2,True,False,True
3,False,False,False


In [50]:
# 2. Discard missing data with dropna().
df3.dropna()  # drops rows by default

Unnamed: 0,0,1,2
0,-0.053502,-0.83705,-0.303284
3,1.0,2.0,3.0


In [51]:
# axis=1 drops every column that contains a missing value.
df3.dropna(axis=1)

Unnamed: 0,1
0,-0.83705
1,2.0
2,4.0
3,2.0


In [53]:
# 3. Fill missing values (here every NaN is replaced by -100.0).
df3.fillna(-100.)

Unnamed: 0,0,1,2
0,-0.053502,-0.83705,-0.303284
1,1.0,2.0,-100.0
2,-100.0,4.0,-100.0
3,1.0,2.0,3.0
