# 分组级运算和转换

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Demo frame: two categorical key columns followed by two columns of
# standard-normal noise. The generator is not seeded, so the numeric values
# differ on every run.
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.standard_normal(5),
    data2=np.random.standard_normal(5),
))

df

Unnamed: 0,data1,data2,key1,key2
0,1.317238,-0.976134,a,one
1,3.108932,-0.280863,a,two
2,-0.334978,0.917459,b,one
3,0.198655,-0.665597,b,two
4,-0.877245,-0.076443,a,one


In [3]:
# Per-group means of the numeric columns, with every column renamed to
# 'mean_<col>'. `key2` holds strings: older pandas silently dropped such
# "nuisance" columns from mean(), but pandas >= 2.0 raises TypeError, so the
# aggregation is restricted to numeric columns explicitly.
k1_means = df.groupby('key1').mean(numeric_only=True).add_prefix('mean_')

k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.182975,-0.44448
b,-0.068161,0.125931


In [4]:
# Attach each row's group means by matching the 'key1' column of `df`
# against the index of `k1_means`.
df.merge(k1_means, left_on='key1', right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,1.317238,-0.976134,a,one,1.182975,-0.44448
1,3.108932,-0.280863,a,two,1.182975,-0.44448
4,-0.877245,-0.076443,a,one,1.182975,-0.44448
2,-0.334978,0.917459,b,one,-0.068161,0.125931
3,0.198655,-0.665597,b,two,-0.068161,0.125931


In [5]:
# Alternating 'one'/'two' labels, five entries in total.
key = ['one', 'two'] * 2 + ['one']

key

['one', 'two', 'one', 'two', 'one']

## 示例：用特定于分组的值填充缺失值

In [3]:
# Six standard-normal draws on the default 0..5 integer index (unseeded).
s = pd.Series(data=np.random.standard_normal(6))

s

0   -0.724027
1   -1.248180
2    2.136260
3    0.253648
4   -0.376440
5   -0.636931
dtype: float64

In [5]:
# Blank out every second entry (positions 0, 2, 4, ...) in place so there is
# something to fill in.
s.iloc[::2] = np.nan

s

0         NaN
1   -1.248180
2         NaN
3    0.253648
4         NaN
5   -0.636931
dtype: float64

In [6]:
# Fill the gaps with the overall mean; Series.mean() skips NaN by default,
# so only the remaining values contribute. Returns a new Series.
s.fillna(value=s.mean())

0   -0.543821
1   -1.248180
2   -0.543821
3    0.253648
4   -0.543821
5   -0.636931
dtype: float64

In [7]:
# Index labels for the example: listed so the first four entries line up with
# the 'East' group and the last four with the 'West' group defined below.
states = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]

states

['Ohio',
 'New York',
 'Vermont',
 'Florida',
 'Oregon',
 'Nevada',
 'California',
 'Idaho']

In [8]:
# One region label per state, in the same order as `states`:
# four 'East' entries followed by four 'West' entries.
group_key = [region for region in ('East', 'West') for _ in range(4)]

group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [9]:
# Eight standard-normal draws labelled by state name (unseeded).
data = pd.Series(data=np.random.standard_normal(8), index=states)

data

Ohio          0.684346
New York      0.529430
Vermont      -0.420308
Florida       0.344085
Oregon        0.025553
Nevada        0.519027
California   -0.626964
Idaho        -0.671533
dtype: float64

In [10]:
# Knock out three states in place so each region has at least one missing
# value to fill.
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan

data

Ohio          0.684346
New York      0.529430
Vermont            NaN
Florida       0.344085
Oregon        0.025553
Nevada             NaN
California   -0.626964
Idaho              NaN
dtype: float64

In [11]:
# Mean per region; `group_key` is aligned with `data` by position and NaN
# entries are skipped.
data.groupby(by=group_key).mean()

East    0.519287
West   -0.300706
dtype: float64

In [12]:
def fill_mean(g):
    """Return group `g` with its NaN entries replaced by the group's own mean."""
    return g.fillna(g.mean())


# group_keys=False keeps the flat state-name index. The filled result is
# like-indexed with the input, and with the pandas >= 2.0 default
# (group_keys=True) apply() would prepend the region label and return a
# (region, state) MultiIndex instead.
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio          0.684346
New York      0.529430
Vermont       0.519287
Florida       0.344085
Oregon        0.025553
Nevada       -0.300706
California   -0.626964
Idaho        -0.300706
dtype: float64

In [13]:
# Predefined replacement value for each region.
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Return group `g` with NaNs replaced by the preset value for its region.

    The region label is read from `g.name`, which apply() sets on each group;
    a label missing from `fill_values` raises KeyError.
    """
    return g.fillna(fill_values[g.name])


# group_keys=False keeps the flat state-name index; with the pandas >= 2.0
# default (group_keys=True) apply() would prepend the region label to this
# like-indexed result.
data.groupby(group_key, group_keys=False).apply(fill_func)

Ohio          0.684346
New York      0.529430
Vermont       0.500000
Florida       0.344085
Oregon        0.025553
Nevada       -1.000000
California   -0.626964
Idaho        -1.000000
dtype: float64

## 示例：随机采样和排列

In [14]:
# Suits: Hearts (H), Spades (S), Clubs (C), Diamonds (D).
suits = ['H', 'S', 'C', 'D']

# Values within one suit: ace = 1, 2..10 at face value, J/Q/K = 10 each;
# repeated once per suit so the values line up with `cards` below.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
base_names = ['A'] + list(range(2, 11)) + ['J', 'Q', 'K']

# Card labels are rank followed by suit letter ('AH', '10S', ...), grouped
# suit by suit. Iterating over `suits` (rather than repeating the literal
# list) keeps the labels and `card_val` in sync if the suits ever change.
cards = [str(name) + suit for suit in suits for name in base_names]

deck = pd.Series(card_val, index=cards)

In [16]:
# The first 13 cards, i.e. the complete hearts suit.
deck.head(13)

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
QH     10
KH     10
dtype: int64

In [17]:
def draw(deck, n=5):
    """Return `n` entries of `deck` picked at random without replacement.

    A random permutation of the positions 0..len(deck)-1 is generated and its
    first `n` positions are taken, so no position is selected twice. If `n`
    exceeds len(deck) the slice simply yields the whole deck in shuffled order.
    """
    shuffled_positions = np.random.permutation(len(deck))
    chosen = shuffled_positions[:n]
    return deck.take(chosen)

In [18]:
# Deal one random five-card hand from the full deck.
draw(deck, n=5)

KD    10
AH     1
6C     6
KH    10
KS    10
dtype: int64

In [19]:
def get_suit(card):
    """Return the suit letter, i.e. the last character of a card label."""
    return card[-1]


# Group the cards by suit (the function is applied to each index label) and
# draw two random cards from every suit.
deck.groupby(get_suit).apply(draw, n=2)

C  10C    10
   8C      8
D  JD     10
   5D      5
H  2H      2
   4H      4
S  10S    10
   KS     10
dtype: int64

In [20]:
# Same per-suit draw, but group_keys=False leaves the suit level out of the
# result index so only the card labels remain.
deck.groupby(by=get_suit, group_keys=False).apply(draw, n=2)

KC    10
QC    10
3D     3
8D     8
QH    10
2H     2
2S     2
6S     6
dtype: int64