## 常用操作

In [23]:
import pandas as pd
import numpy as np

In [2]:
# Toy expense table: 12 rows, one per (month, category) combination.
# NOTE(review): 'Marth' is presumably a misspelling of 'March'; the literal is
# kept unchanged so the frame still matches the outputs rendered in this notebook.
example = pd.DataFrame({
    'Amount': [
        74., 235., 175., 100.,
        115., 245., 180., 90.,
        88., 129., 273., 300.,
    ],
    # The same four categories repeat once per month.
    'Category': ['Transportation', 'Grocery', 'Household', 'Entertainment'] * 3,
    # Four consecutive rows per month.
    'Month': ['January'] * 4 + ['February'] * 4 + ['Marth'] * 4,
})

In [3]:
example  # last expression of the cell: renders the DataFrame

Unnamed: 0,Amount,Category,Month
0,74.0,Transportation,January
1,235.0,Grocery,January
2,175.0,Household,January
3,100.0,Entertainment,January
4,115.0,Transportation,February
5,245.0,Grocery,February
6,180.0,Household,February
7,90.0,Entertainment,February
8,88.0,Transportation,Marth
9,129.0,Grocery,Marth


##### 1. 排序操作

In [5]:
# Sort on several keys: 'Month' is listed first in `by`, so rows are ordered by
# Month ascending, and rows within the same month by Amount descending.
example.sort_values(
    by=['Month', 'Amount'],
    ascending=[True, False],
)

Unnamed: 0,Amount,Category,Month
5,245.0,Grocery,February
6,180.0,Household,February
4,115.0,Transportation,February
7,90.0,Entertainment,February
1,235.0,Grocery,January
2,175.0,Household,January
3,100.0,Entertainment,January
0,74.0,Transportation,January
11,300.0,Entertainment,Marth
10,273.0,Household,Marth


In [9]:
# Small integer frame that contains repeated rows and repeated 'k1' values.
data = pd.DataFrame(
    {
        'k1': [1, 2, 3, 5, 5, 4, 6, 2, 2],
        'k2': [12, 8, 11, 4, 4, 52, 30, 18, 18],
    }
)
data

Unnamed: 0,k1,k2
0,1,12
1,2,8
2,3,11
3,5,4
4,5,4
5,4,52
6,6,30
7,2,18
8,2,18


In [10]:
# Order the rows by 'k2', smallest value first (ascending is the default).
data.sort_values(by='k2', ascending=True)

Unnamed: 0,k1,k2
3,5,4
4,5,4
1,2,8
2,3,11
0,1,12
7,2,18
8,2,18
6,6,30
5,4,52


In [12]:
# Remove rows that are identical to an earlier row in every column;
# the first occurrence of each row is kept (keep='first' is the default).
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,1,12
1,2,8
2,3,11
3,5,4
5,4,52
6,6,30
7,2,18


In [16]:
# Deduplicate on 'k1' only: a row is removed when its k1 value has already
# appeared in an earlier row, whatever its k2 value is.
data.drop_duplicates(subset=['k1'], keep='first')

Unnamed: 0,k1,k2
0,1,12
1,2,8
2,3,11
3,5,4
5,4,52
6,6,30


In [17]:
# Frame with a string label column 'k2' whose values are 'A1', 'A2' and 'B2'.
data2 = pd.DataFrame(
    {
        'k1': [1, 5, 3, 16, 19],
        'k2': ['A1', 'A2', 'A1', 'B2', 'A2'],
    }
)
data2

Unnamed: 0,k1,k2
0,1,A1
1,5,A2
2,3,A1
3,16,B2
4,19,A2


##### 如果希望将 A1 和 A2 都归为 A 类（其他取值不做映射，结果为空值）

In [19]:
def map(series):
    """Collapse the 'k2' labels 'A1' and 'A2' of one row into the group 'A'.

    Parameters
    ----------
    series : mapping-like
        A single row; only ``series['k2']`` is read.

    Returns
    -------
    str or None
        'A' when ``series['k2']`` equals 'A1' or 'A2', otherwise None.

    Notes
    -----
    The name shadows the built-in ``map``. It is left unchanged because the
    name is this function's public interface.
    """
    label = series['k2']
    if label in ('A1', 'A2'):
        return 'A'
    # Any other label is deliberately left unmapped.
    return None

In [22]:
# Call the mapper once per row (axis='columns' is the same as axis=1) and
# store the results in a new 'k2_map' column of data2.
data2['k2_map'] = data2.apply(map, axis='columns')
data2

Unnamed: 0,k1,k2,k2_map
0,1,A1,A
1,5,A2,A
2,3,A1,A
3,16,B2,
4,19,A2,A


In [24]:
# Two columns of five standard-normal draws each.
# A seeded Generator makes the values identical on every run; the previous
# unseeded np.random.randn calls produced different numbers on each execution,
# so outputs rendered from an earlier run need a re-run to match.
rng = np.random.default_rng(42)
df = pd.DataFrame({'t1': rng.standard_normal(5), 't2': rng.standard_normal(5)})
df

Unnamed: 0,t1,t2
0,0.909904,-0.394096
1,0.943165,0.480281
2,-1.616222,2.153146
3,-0.403424,-1.16006
4,0.850074,1.280206


In [34]:
# Build a new frame with an extra column 'ration' holding t1 / t2 per row.
# NOTE(review): 'ration' is presumably meant to be 'ratio'; the name is kept
# because later cells refer to the column by this name.
df2 = df.assign(ration=lambda frame: frame['t1'] / frame['t2'])

In [35]:
df2  # display the frame that carries the added 'ration' column

Unnamed: 0,t1,t2,ration
0,0.909904,-0.394096,-2.30884
1,0.943165,0.480281,1.963779
2,-1.616222,2.153146,-0.750633
3,-0.403424,-1.16006,0.347762
4,0.850074,1.280206,0.664013


In [37]:
# Return a frame without the 'ration' column; columns='ration' is equivalent
# to passing the label with axis=1.
df2.drop(columns='ration')

Unnamed: 0,t1,t2
0,0.909904,-0.394096
1,0.943165,0.480281
2,-1.616222,2.153146
3,-0.403424,-1.16006
4,0.850074,1.280206


##### 将数据按指定区间切分

In [45]:
# Ages to be grouped into the intervals (10, 40], (40, 60] and (60, 90].
age = pd.Series([18, 9, 6, 20, 26, 67, 52, 38, 40, 81, 36, 74, 88])
bins = [10, 40, 60, 90]
# right=True (the default) makes every interval include its right edge, so 40
# lands in (10, 40]. Values outside all intervals (here 9 and 6) become NaN.
res = pd.cut(age, bins=bins, right=True)
res

0     (10.0, 40.0]
1              NaN
2              NaN
3     (10.0, 40.0]
4     (10.0, 40.0]
5     (60.0, 90.0]
6     (40.0, 60.0]
7     (10.0, 40.0]
8     (10.0, 40.0]
9     (60.0, 90.0]
10    (10.0, 40.0]
11    (60.0, 90.0]
12    (60.0, 90.0]
dtype: category
Categories (3, interval[int64, right]): [(10, 40] < (40, 60] < (60, 90]]

#### 区间为左开右闭，不落在任何区间内的数据（此处为 9 和 6，均不大于 10）会被设置为 NaN

In [46]:
# Count how many values fall into each interval.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in pandas >= 2.1.
res.value_counts()

(10, 40]    6
(60, 90]    4
(40, 60]    1
dtype: int64

In [49]:
# Give each interval a name; the number of labels must equal the number of intervals.
w = pd.cut(age, bins, labels=['youth', 'mid', 'old'])
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in pandas >= 2.1.
w.value_counts()

youth    6
old      4
mid      1
dtype: int64

In [51]:
res.isnull()      # isnull() flags, element by element, whether the value is NaN; DataFrame offers the same method

0     False
1      True
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
dtype: bool

In [58]:
# Float series whose first entry is missing.
qw = pd.Series([np.nan, 1.0, 2.0, 3.0])
qw

0    NaN
1    1.0
2    2.0
3    3.0
dtype: float64

In [61]:
# Replace every NaN with one fixed value.
qw.fillna(value=5)

0    5.0
1    1.0
2    2.0
3    3.0
dtype: float64