In [1]:
#Data Aggregation and Group Operations
#In this chapter, we will learn how to:
# *Split an object into pieces using one or more keys
# *Calculate group summary statistics, like count, mean, or standard deviation
# *Apply within-group transformations or other manipulations, like normalization, linear regression, etc.
# *Compute pivot tables and cross-tabulations
# *Perform quantile analysis and other statistical group analysis

In [2]:
# NumPy: an open-source library for efficient numerical operations on large arrays of data.
import numpy as np
# pandas: built around the DataFrame, a structure laid out like a table or spreadsheet.
import pandas as pd

In [3]:
# Small example frame: two grouping keys (both containing missing entries,
# key2 stored as nullable Int64) plus two columns of standard-normal noise.
df = pd.DataFrame(dict(
    key1=["a", "a", None, "b", "b", "a", None],
    key2=pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
    data1=np.random.standard_normal(7),
    data2=np.random.standard_normal(7),
))
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-1.16942,0.181476
1,a,2.0,2.335429,-0.832264
2,,1.0,-0.265397,-0.383448
3,b,2.0,0.818855,0.892703
4,b,1.0,1.383261,0.577347
5,a,,-1.411944,0.602524
6,,1.0,1.658565,0.150352


In [4]:
# Compute the mean of the data1 column, using the labels in key1 as group keys.
# groupby() returns a grouped object; mean() then aggregates each group.
grouped = df["data1"].groupby(df["key1"])
grouped.mean()

key1
a   -0.081978
b    1.101058
Name: data1, dtype: float64

In [5]:
# Group data1 by two keys at once; the means are indexed by (key1, key2) pairs.
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()

In [6]:
# Pivot the inner index level (key2) into columns.
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.16942,2.335429
b,1.383261,0.818855


In [7]:
# Grouping the whole DataFrame by key1 averages every remaining column, not just one.
df.groupby("key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,-0.081978,-0.016088
b,1.5,1.101058,0.735025


In [8]:
# numeric_only=True is required here: key1 holds strings, which mean() cannot aggregate.
# Remove the argument to see the resulting error.
df.groupby("key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.401752,0.131432
2,1.577142,0.03022


In [9]:
# Group the DataFrame by both key columns and average the remaining columns.
df.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-1.16942,0.181476
a,2,2.335429,-0.832264
b,1,1.383261,0.577347
b,2,0.818855,0.892703


In [10]:
# size() reports the number of rows in each (key1, key2) group.
df.groupby(["key1", "key2"]).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [11]:
# By default, missing values in a group key are excluded from the result.
# Passing dropna=False keeps them as their own NaN group.
df.groupby("key1", dropna=False).size()

key1
a      3
b      2
NaN    2
dtype: int64

In [12]:
# Iterating over a GroupBy yields (group name, sub-DataFrame) pairs;
# print each name followed by the rows that belong to it.
for name, group in df.groupby("key1"):
    print(name, group, sep="\n")

a
  key1  key2     data1     data2
0    a     1 -1.169420  0.181476
1    a     2  2.335429 -0.832264
5    a  <NA> -1.411944  0.602524
b
  key1  key2     data1     data2
3    b     2  0.818855  0.892703
4    b     1  1.383261  0.577347


In [13]:
# Materialize the groups as a dict so a single piece can be looked up by its key.
pieces = dict(list(df.groupby("key1")))
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,0.818855,0.892703
4,b,1,1.383261,0.577347


In [14]:
# Indexing a grouped object selects the columns to aggregate.
# Passing a list ([["data2"]]) keeps the result a DataFrame.
df.groupby(["key1", "key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.181476
a,2,-0.832264
b,1,0.577347
b,2,0.892703


In [15]:
# Grouping with Dictionaries and Series
# 5x5 frame of standard-normal values, labelled by person (rows) and letter (columns).
people = pd.DataFrame(
    np.random.standard_normal((5, 5)),
    index=["Joe", "Steve", "Wanda", "Jill", "Trey"],
    columns=list("abcde"),
)
# Blank out two cells in Wanda's row so the example contains a few NA values.
people.loc["Wanda", ["b", "c"]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.770331,0.813436,0.549662,0.623746,-0.246034
Steve,-1.479849,0.115496,0.056105,-0.480154,1.650148
Wanda,1.0949,,,0.061029,0.252436
Jill,-2.158607,-0.584336,0.022208,-0.288802,-1.387548
Trey,0.277859,-0.66353,0.291919,0.043135,0.197305


In [16]:
# Column label -> colour group. "f" matches no column of `people`;
# it is included to show that unused keys are allowed.
mapping = dict(a="red", b="red", c="blue", d="blue", e="red", f="orange")

In [17]:
# Sum the columns of `people` by colour group, as defined in `mapping`.
# groupby(..., axis="columns") is deprecated since pandas 2.1, so group the rows
# of the transposed frame instead and transpose the sums back.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,1.173408,-0.202929
Steve,-0.424049,0.285795
Wanda,0.061029,1.347336
Jill,-0.266594,-4.130491
Trey,0.335054,-0.188365


In [18]:
# 10.2 Data Aggregation
# An aggregation is any data transformation that produces scalar values from arrays,
# for example mean, count, min, and sum.
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-1.16942,0.181476
1,a,2.0,2.335429,-0.832264
2,,1.0,-0.265397,-0.383448
3,b,2.0,0.818855,0.892703
4,b,1.0,1.383261,0.577347
5,a,,-1.411944,0.602524
6,,1.0,1.658565,0.150352


In [19]:
# Regroup the whole frame by key1, then pick the two smallest data1 values in each group.
grouped = df.groupby("key1")
grouped["data1"].nsmallest(2)

key1   
a     5   -1.411944
      0   -1.169420
b     3    0.818855
      4    1.383261
Name: data1, dtype: float64

In [20]:
# You can also aggregate with your own functions.
def peak_to_peak(arr):
    """Return the spread of `arr`: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# agg() applies the custom function to every non-key column of each group.
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,3.747374,1.434788
b,1,0.564406,0.315357


In [21]:
# Column-Wise and Multiple Function Application
# First load the tips dataset from its CSV file.
tips = pd.read_csv("examples/tips.csv")
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [22]:
# Add a tip_pct column: the tip as a fraction of the total bill.
tips["tip_pct"] = tips["tip"] / tips["total_bill"]
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [23]:
# Group the tips by day and smoker status, then select the tip_pct column.
grouped = tips.groupby(["day", "smoker"])
grouped_pct = grouped["tip_pct"]
# An aggregation can be requested by name.
grouped_pct.agg("mean")

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [24]:
# A list of functions (names or callables) gives one output column per function,
# each named after its function.
grouped_pct.agg(["mean", "std", peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [25]:
# (name, function) tuples set the output column names explicitly.
# The standard deviation is requested by name: passing np.std to agg() is currently
# redirected to the pandas groupby std, and that redirection is deprecated, so the
# callable form would change results in a future pandas version.
grouped_pct.agg([("average", "mean"), ("stdev", "std")])

Unnamed: 0_level_0,Unnamed: 1_level_0,average,stdev
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [26]:
# Apply the same list of aggregations to several columns at once;
# the result has hierarchical columns (column name, then function name).
functions = ["count", "mean", "max"]
result = grouped[["tip_pct", "total_bill"]].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [27]:
# To apply different functions to different columns, pass a dict that maps
# column name -> function (or list of functions).
# Aggregations are given by name ("max") rather than as NumPy callables (np.max):
# agg() currently swaps such callables for the pandas method, and that is deprecated.
# Note: only the last expression of a cell is displayed, so the first call's result
# is computed but not shown.
grouped.agg({"tip" : "max", "size" : "sum"})
grouped.agg({"tip_pct" : ["min", "max", "mean", "std"],
             "size" : "sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


In [28]:
# Filling Missing Values with Group-Specific Values
# Six standard-normal draws, with every other entry (positions 0, 2, 4) set to NaN.
s = pd.Series(np.random.standard_normal(6))
s.iloc[::2] = np.nan
s

0         NaN
1   -1.273821
2         NaN
3   -1.064013
4         NaN
5    1.456697
dtype: float64

In [29]:
# Replace each missing entry with the mean of the non-missing values.
s.fillna(s.mean())

0   -0.293712
1   -1.273821
2   -0.293712
3   -1.064013
4   -0.293712
5    1.456697
dtype: float64

In [30]:
# Two categories of four rows each, with random data values and random weights in [0, 1).
df = pd.DataFrame(dict(
    category=["a"] * 4 + ["b"] * 4,
    data=np.random.standard_normal(8),
    weights=np.random.uniform(size=8),
))
df

Unnamed: 0,category,data,weights
0,a,-0.434482,0.19565
1,a,0.585765,0.159649
2,a,-0.499014,0.009022
3,a,-0.156974,0.875609
4,b,-0.472012,0.133679
5,b,1.105003,0.744115
6,b,0.062111,0.509137
7,b,0.841692,0.135081


In [31]:
# Group the rows by category.
grouped = df.groupby("category")
def get_wavg(group):
    """Return the average of group["data"] weighted by group["weights"]."""
    values = group["data"]
    weights = group["weights"]
    return np.average(values, weights=weights)

# Weighted average of data per category.
# Only the columns get_wavg reads are selected: letting apply() operate on the
# grouping column as well is deprecated in recent pandas versions.
grouped[["data", "weights"]].apply(get_wavg)

category
a   -0.107619
b    0.594260
dtype: float64

In [32]:
# Twelve rows: the keys a, b, c repeated four times, with the values 0.0 to 11.0.
df = pd.DataFrame(dict(key=list("abc") * 4,
                       value=np.arange(12, dtype=float)))
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [33]:
# Group the value column by key and compute each group's mean.
g = df.groupby('key')['value']
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [34]:
def times_two(group):
    """Return `group` with every value doubled."""
    doubled = group * 2
    return doubled
# transform() returns a result with the same index as the input,
# rather than one row per group.
g.transform(times_two)

0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64