# Manipulación de DataFrames

## Aggregate

Agregar datos por columnas con funciones estándar de Python.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Toy dataset: three group labels (A, B, C), each appearing twice, plus a
# deterministic counter column and a column of random integers in [0, 100).
df = pd.DataFrame(
    dict(
        key=['A', 'B', 'C'] * 2,
        value1=range(6),
        value2=np.random.randint(low=0, high=100, size=6),
    )
)

In [4]:
# Show the DataFrame; the notebook renders the value of a cell's last expression.
df

Unnamed: 0,key,value1,value2
0,A,0,62
1,B,1,0
2,C,2,7
3,A,3,15
4,B,4,16
5,C,5,51


In [5]:
# Apply several aggregations to every value column of each 'key' group.
# The aggregations are given by name rather than as the builtin `min` or
# NumPy callables: pandas >= 2.1 emits a FutureWarning for those callables
# and will stop mapping them to its own optimized group-wise methods.
# The resulting column labels (min, median, mean, max) are unchanged.
df.groupby('key').agg(['min', 'median', 'mean', 'max'])

Unnamed: 0_level_0,value1,value1,value1,value1,value2,value2,value2,value2
Unnamed: 0_level_1,min,median,mean,max,min,median,mean,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,0,1.5,1.5,3,15,38.5,38.5,62
B,1,2.5,2.5,4,0,8.0,8.0,16
C,2,3.5,3.5,5,7,29.0,29.0,51


In [11]:
# Per-column aggregation: the dict maps each column to the reduction applied
# to it within every 'key' group. String names are used instead of the
# builtins `min`/`max`, which pandas >= 2.1 deprecates as agg arguments.
df.groupby('key').agg({
    'value1': 'min',
    'value2': 'max',
})

Unnamed: 0_level_0,value1,value2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,62
B,1,16
C,2,51


## Filter

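Filtrar grupos completos: se conservan solo las filas de los grupos para los que una función booleana, evaluada sobre cada grupo, devuelve verdadero.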

In [14]:
# Standard deviation of each value column, computed separately per 'key' group.
df.groupby(by='key').std()

Unnamed: 0_level_0,value1,value2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,33.234019
B,2.12132,11.313708
C,2.12132,31.112698


In [16]:
def filter_func(x, threshold=20):
    """Tell whether a group's ``value2`` column is spread out enough to keep.

    Meant to be used as the predicate of ``DataFrameGroupBy.filter``, which
    calls it once per group and keeps the group's rows when the result is
    true. Extra keyword arguments given to ``filter`` are forwarded, e.g.
    ``df.groupby("key").filter(filter_func, threshold=30)``.

    Args:
        x: The sub-DataFrame of one group; must contain a ``value2`` column.
        threshold: Cut-off for the standard deviation. Defaults to 20, the
            value previously hard-coded in the function.

    Returns:
        A boolean that is true when the standard deviation of
        ``x['value2']`` is strictly greater than ``threshold``.
    """
    return x['value2'].std() > threshold

In [19]:
# Show, one after another: the raw data, the per-group standard deviations,
# and the rows that survive filtering with `filter_func`.
display(
    df,
    df.groupby("key").std(),
    df.groupby("key").filter(filter_func),
)

Unnamed: 0,key,value1,value2
0,A,0,62
1,B,1,0
2,C,2,7
3,A,3,15
4,B,4,16
5,C,5,51


Unnamed: 0_level_0,value1,value2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,33.234019
B,2.12132,11.313708
C,2.12132,31.112698


Unnamed: 0,key,value1,value2
0,A,0,62
2,C,2,7
3,A,3,15
5,C,5,51


## Transform

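Transforma las columnas aplicando una función a cada grupo; el resultado conserva el mismo índice y la misma forma que los datos originales.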

In [20]:
# Center every value column on its own group mean; the result keeps one row
# per original row.
df.groupby('key').transform(
    lambda col: col - col.mean()
)

Unnamed: 0,value1,value2
0,-1.5,23.5
1,-1.5,-8.0
2,-1.5,-22.0
3,1.5,-23.5
4,1.5,8.0
5,1.5,22.0


## Apply

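Aplica una función arbitraria a cada grupo. La función recibe el sub-DataFrame completo del grupo, por lo que puede combinar varias columnas.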

In [26]:
def norm_by_col2(x):
    """Return a copy of *x* with ``value1`` divided by the sum of ``value2``.

    Designed for ``DataFrameGroupBy.apply``: each group is normalized by the
    total of its own ``value2`` column. The input frame is left untouched;
    mutating the object that ``apply`` passes in is discouraged by pandas and
    can raise chained-assignment warnings.

    Args:
        x: DataFrame (typically one group) with ``value1`` and ``value2``
            columns.

    Returns:
        A new DataFrame with the same index and column order as *x*, where
        ``value1`` holds ``x['value1'] / x['value2'].sum()``.
    """
    total = x['value2'].sum()
    # `assign` builds a new frame, so the caller's data is never modified.
    return x.assign(value1=x['value1'] / total)

In [27]:
# Normalize `value1` within each group using `norm_by_col2`.
# `group_keys=False` keeps the original row index in the result: since
# pandas 2.0 the default (`group_keys=True`) would otherwise prepend the
# group label as an extra index level, even though the function returns
# frames indexed like their input.
df.groupby('key', group_keys=False).apply(norm_by_col2)

Unnamed: 0,key,value1,value2
0,A,0.0,62
1,B,0.0625,0
2,C,0.034483,7
3,A,0.038961,15
4,B,0.25,16
5,C,0.086207,51
