In [1]:
import numpy as np
import pandas as pd

In [3]:
# Seed NumPy's global RNG so dataset_1 / dataset_2 hold the same values on
# every Restart & Run All (re-run the notebook once to refresh saved outputs).
np.random.seed(42)

# Toy frame used throughout: two string key columns plus two columns of
# standard-normal noise, five rows in total.
df = pd.DataFrame(
    {
        'key_1': ['x', 'x', 'y', 'y', 'z'],
        'key_2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
        'dataset_1': np.random.randn(5),
        'dataset_2': np.random.randn(5),
    }
)

df

Unnamed: 0,dataset_1,dataset_2,key_1,key_2
0,-1.500595,0.191367,x,alpha
1,1.731362,-0.570561,x,beta
2,0.201571,-0.440455,y,alpha
3,0.407063,-1.30913,y,beta
4,0.669476,0.038749,z,alpha


In [9]:
"""
Agrupar o df pelos dados de key_1
"""

group1 = df.groupby(['key_1'])
"""
Não conseguimos ver o que é group1 pq 
temos que realizar operações sobre ele
"""
group1

<pandas.core.groupby.DataFrameGroupBy object at 0x7f0f1fccc128>

In [11]:
# Per-group average of the numeric columns. numeric_only=True skips the string
# column key_2; without it pandas >= 2.0 raises TypeError trying to average it.
group1.mean(numeric_only=True)

Unnamed: 0_level_0,dataset_1,dataset_2
key_1,Unnamed: 1_level_1,Unnamed: 2_level_1
x,0.115384,-0.189597
y,0.304317,-0.874793
z,0.669476,0.038749


In [19]:
# Two label arrays with five entries each, the same length as df.
cities = np.array('SP DF DF SP SP'.split())
month = np.array('JAN FEB JAN FEB JAN'.split())

In [22]:
"""
Vamos realizar um groupby com os arrays acima e o 'df'. 
"""

df.groupby([cities,month]).mean()

Unnamed: 0,Unnamed: 1,dataset_1,dataset_2
DF,FEB,1.731362,-0.570561
DF,JAN,0.201571,-0.440455
SP,FEB,0.407063,-1.30913
SP,JAN,-0.415559,0.115058


In [27]:
# Show the full frame again for reference.
df

Unnamed: 0,dataset_1,dataset_2,key_1,key_2
0,-1.500595,0.191367,x,alpha
1,1.731362,-0.570561,x,beta
2,0.201571,-0.440455,y,alpha
3,0.407063,-1.30913,y,beta
4,0.669476,0.038749,z,alpha


In [24]:
"""
Podemos passar os nomes das colunas como argumento pro groupby.
Ou uma lista de colunas. 
"""

df.groupby('key_1').mean()



Unnamed: 0_level_0,dataset_1,dataset_2
key_1,Unnamed: 1_level_1,Unnamed: 2_level_1
x,0.115384,-0.189597
y,0.304317,-0.874793
z,0.669476,0.038749


In [28]:
"""
Lista de colunas
"""

df.groupby(['key_1','key_2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset_1,dataset_2
key_1,key_2,Unnamed: 2_level_1,Unnamed: 3_level_1
x,alpha,-1.500595,0.191367
x,beta,1.731362,-0.570561
y,alpha,0.201571,-0.440455
y,beta,0.407063,-1.30913
z,alpha,0.669476,0.038749


In [30]:
"""
Outros metodos de Groupby
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html
"""

df.groupby(['key_1','key_2']).size()

key_1  key_2
x      alpha    1
       beta     1
y      alpha    1
       beta     1
z      alpha    1
dtype: int64

In [34]:
"""
Podemos percorrer o retorno de um groupby. 

Note que nao precisou de colocar um .method (sum,agg,mean,etc) 
depois do groupby.
"""

for name,group in df.groupby('key_1'):
    
    """
    name é o valor da key_1 nessa iteracao
    group são os outros valores agrupados/relacionados com a chave key_1
    """
    
    print("This is the {} group".format(name))
    print(group,'\n')

This is the x group
   dataset_1  dataset_2 key_1  key_2
0  -1.500595   0.191367     x  alpha
1   1.731362  -0.570561     x   beta 

This is the y group
   dataset_1  dataset_2 key_1  key_2
2   0.201571  -0.440455     y  alpha
3   0.407063  -1.309130     y   beta 

This is the z group
   dataset_1  dataset_2 key_1  key_2
4   0.669476   0.038749     z  alpha 



In [36]:
"""
Iterando sobre multiplas keys. 
"""
for (key1,key2), group in df.groupby(['key_1','key_2']):
    print('Key1 = {}, Key2 = {}'.format(key1,key2))
    print(group,'\n')

Key1 = x, Key2 = alpha
   dataset_1  dataset_2 key_1  key_2
0  -1.500595   0.191367     x  alpha 

Key1 = x, Key2 = beta
   dataset_1  dataset_2 key_1 key_2
1   1.731362  -0.570561     x  beta 

Key1 = y, Key2 = alpha
   dataset_1  dataset_2 key_1  key_2
2   0.201571  -0.440455     y  alpha 

Key1 = y, Key2 = beta
   dataset_1  dataset_2 key_1 key_2
3   0.407063   -1.30913     y  beta 

Key1 = z, Key2 = alpha
   dataset_1  dataset_2 key_1  key_2
4   0.669476   0.038749     z  alpha 



In [40]:
"""
Pode-se criar um dicionario de dados agrupados. 
"""

g_dict = dict(list(df.groupby('key_1')))
g_dict

{'x':    dataset_1  dataset_2 key_1  key_2
 0  -1.500595   0.191367     x  alpha
 1   1.731362  -0.570561     x   beta,
 'y':    dataset_1  dataset_2 key_1  key_2
 2   0.201571  -0.440455     y  alpha
 3   0.407063  -1.309130     y   beta,
 'z':    dataset_1  dataset_2 key_1  key_2
 4   0.669476   0.038749     z  alpha}

In [42]:
# Look up the sub-frame stored under the key 'x'.
g_dict['x']

Unnamed: 0,dataset_1,dataset_2,key_1,key_2
0,-1.500595,0.191367,x,alpha
1,1.731362,-0.570561,x,beta


In [45]:
"""
Podemos agrupar o df de diversas formas. 

Abaixo é um exemplo de groupby usando os types que estão no DataFrame
"""
g_axis_dict = dict(list(df.groupby(df.dtypes,axis=1)))
g_axis_dict

{dtype('float64'):    dataset_1  dataset_2
 0  -1.500595   0.191367
 1   1.731362  -0.570561
 2   0.201571  -0.440455
 3   0.407063  -1.309130
 4   0.669476   0.038749, dtype('O'):   key_1  key_2
 0     x  alpha
 1     x   beta
 2     y  alpha
 3     y   beta
 4     z  alpha}

In [52]:
"""
Agrupar os valores do 'df' com base em key_1,key_2 e nos valores da coluna 
dataset_2
"""

dataset2_group = df.groupby(['key_1','key_2'])['dataset_2']

"""
Aqui podemos realizar as diversas operações permitidas no DataFrame
"""
dataset2_group.size()

key_1  key_2
x      alpha    1
       beta     1
y      alpha    1
       beta     1
z      alpha    1
Name: dataset_2, dtype: int64

In [49]:
# Show the full frame once more.
df

Unnamed: 0,dataset_1,dataset_2,key_1,key_2
0,-1.500595,0.191367,x,alpha
1,1.731362,-0.570561,x,beta
2,0.201571,-0.440455,y,alpha
3,0.407063,-1.30913,y,beta
4,0.669476,0.038749,z,alpha
