## Lecture 19 - Grouped operations

**1. Grouped operations**

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
# Sample data: two grouping keys (both contain missing values) and two
# numeric columns filled with standard-normal draws.
# The draws come from a seeded Generator so that Restart & Run All
# reproduces exactly the same frame (and therefore the same group means).
rng = np.random.default_rng(seed=19)
df = pd.DataFrame({"key1" : ["a", "a", None, "b", "b", "a", None],
                   "key2" : pd.Series([1, 2, 1, 2, 1, None, 1]),
                   "data1" : rng.standard_normal(7),
                   "data2" : rng.standard_normal(7)})
display(df)

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,1.690881,0.881848
1,a,2.0,-0.176521,0.680173
2,,1.0,-0.620245,0.030918
3,b,2.0,0.815891,1.968712
4,b,1.0,-0.354707,1.167105
5,a,,0.380403,0.695112
6,,1.0,-1.132238,-0.178689


In [3]:
# Boolean indexing (slow way): build one mask per key value and average
# the data1 entries of the rows that match it.
for key in ("a", "b"):
    in_group = df["key1"] == key
    print(f'{key}: {df[in_group].data1.mean()}')

a: 0.6315875871986562
b: 0.23059221464487695


In [4]:
# Show the sample frame again as a reference for the grouped results below
display(df)

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,1.690881,0.881848
1,a,2.0,-0.176521,0.680173
2,,1.0,-0.620245,0.030918
3,b,2.0,0.815891,1.968712
4,b,1.0,-0.354707,1.167105
5,a,,0.380403,0.695112
6,,1.0,-1.132238,-0.178689


In [5]:
# Split the data1 column into groups keyed by the values of key1.
# No aggregation happens here; this only builds the groupby object.
grouped = df["data1"].groupby(by=[df["key1"]])
# Show which class represents a grouped Series
type(grouped)

pandas.core.groupby.generic.SeriesGroupBy

In [6]:
# Aggregate every key1 group to its mean, then print the result followed
# by its class (one per line)
d = grouped.mean()
print(d, type(d), sep="\n")

key1
a    0.631588
b    0.230592
Name: data1, dtype: float64
<class 'pandas.core.series.Series'>


In [7]:
# What type of object did we get?
# Aggregating the grouped Series with mean() gives back a plain Series
# (see the output below).
type(grouped.mean())

pandas.core.series.Series

In [8]:
# Group on the (key1, key2) pair and average data1 within each combination
d = df.groupby(["key1", "key2"])["data1"].mean()
display(d)

key1  key2
a     1.0     1.690881
      2.0    -0.176521
b     1.0    -0.354707
      2.0     0.815891
Name: data1, dtype: float64

In [9]:
# Hierarchical indexing: grouping on two keys produced a two-level
# MultiIndex whose levels are named after the grouping columns
display(d.index)

MultiIndex([('a', 1.0),
            ('a', 2.0),
            ('b', 1.0),
            ('b', 2.0)],
           names=['key1', 'key2'])

In [10]:
# Accessing elements in a multi-index:
# the outer label alone selects the sub-Series for that key1 value ...
outer_a = d["a"]
print(outer_a,"\n----")
# ... and a key2 label on that sub-Series then picks out a single value
print(outer_a[1])

key2
1.0    1.690881
2.0   -0.176521
Name: data1, dtype: float64 
----
1.690881362052117


**2. Hierarchical indexing**

In [11]:
# Series with a two-level index built from two parallel label lists
# (outer letters, inner integers). The uniform draws come from a seeded
# Generator so the displayed values are identical on every re-run.
data = pd.Series(np.random.default_rng(seed=19).uniform(size=9),
                 index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
display(data)


a  1    0.567229
   2    0.062451
   3    0.653097
b  1    0.360288
   3    0.923109
c  1    0.080607
   2    0.740872
d  2    0.365103
   3    0.086064
dtype: float64

In [12]:
# The index is a MultiIndex of (outer, inner) label tuples
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [13]:
# Partial indexing: an outer-level label returns every row stored under it,
# indexed by the remaining inner level
display(data["a"])

1    0.567229
2    0.062451
3    0.653097
dtype: float64

In [14]:
# Accessing data: label slice over the outer level (both endpoints included)
data.loc["a":"b"]

a  1    0.567229
   2    0.062451
   3    0.653097
b  1    0.360288
   3    0.923109
dtype: float64

In [15]:
# Accessing data: a list of outer-level labels selects exactly those groups
data.loc[["b","d"]]

b  1    0.360288
   3    0.923109
d  2    0.365103
   3    0.086064
dtype: float64

In [16]:
# Inner indexing: keep all outer labels and select inner label 2.
# Outer labels without an inner label 2 ("b" here) are absent from the result.
data.loc[:, 2]

a    0.062451
c    0.740872
d    0.365103
dtype: float64

In [17]:
# Unstack multi-index into a DF: the inner-level labels become the columns;
# (outer, inner) combinations missing from the Series show up as NaN
data.unstack()

Unnamed: 0,1,2,3
a,0.567229,0.062451,0.653097
b,0.360288,,0.923109
c,0.080607,0.740872,
d,,0.365103,0.086064


In [18]:
# Reshape into a multi-index: stack() moves the columns back into an inner
# index level. In the output below the NaN cells introduced by unstack()
# are gone again, so the original 9 rows come back.
data.unstack().stack()

a  1    0.567229
   2    0.062451
   3    0.653097
b  1    0.360288
   3    0.923109
c  1    0.080607
   2    0.740872
d  2    0.365103
   3    0.086064
dtype: float64

In [19]:
display(df)
# Unstacking our earlier df: mean of data1 per (key1, key2) pair, pivoted
# so that key1 labels the rows and key2 labels the columns
mean_by_keys = df.groupby(["key1", "key2"])["data1"].mean()
display(mean_by_keys.unstack())

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,1.690881,0.881848
1,a,2.0,-0.176521,0.680173
2,,1.0,-0.620245,0.030918
3,b,2.0,0.815891,1.968712
4,b,1.0,-0.354707,1.167105
5,a,,0.380403,0.695112
6,,1.0,-1.132238,-0.178689


key2,1.0,2.0
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.690881,-0.176521
b,-0.354707,0.815891


In [20]:
display(data)
# Gotcha: a bare integer in [] on this MultiIndex-ed Series is NOT an
# inner-level lookup. It is treated as a row position, so the value shown
# is the third row, ("a", 3). That implicit positional fallback is
# deprecated in recent pandas, so the positional access is spelled out
# with .iloc. To select inner label 2 instead, use data.loc[:, 2].
display(data.loc["a":"d"].iloc[2])

a  1    0.567229
   2    0.062451
   3    0.653097
b  1    0.360288
   3    0.923109
c  1    0.080607
   2    0.740872
d  2    0.365103
   3    0.086064
dtype: float64

0.6530965072339584