# Grouping

Pandas `groupby` works with many reducing functions, including: `count`, `sum`, `min`, `max`, `std`, etc.

In [1]:
import pandas as pd
import numpy as np

medalists = pd.read_csv('./data/all_medalist.csv')
print(medalists.shape)
medalists.head()

(29216, 10)


Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [2]:
# count the medals awarded at each edition of the games
medalists.groupby('Edition')['Medal'].count()

Edition
1896     151
1900     512
1904     470
1908     804
1912     885
1920    1298
1924     884
1928     710
1932     615
1936     875
1948     814
1952     889
1956     885
1960     882
1964    1010
1968    1031
1972    1185
1976    1305
1980    1387
1984    1459
1988    1546
1992    1705
1996    1859
2000    2015
2004    1998
2008    2042
Name: Medal, dtype: int64

In [3]:
# medal counts per edition, gender and medal type (first 10 rows only)
medalists.groupby(['Edition', 'Gender', 'Medal'])['Medal'].count().iloc[:10]

Edition  Gender  Medal 
1896     Men     Bronze     40
                 Gold       64
                 Silver     47
1900     Men     Bronze    137
                 Gold      175
                 Silver    189
         Women   Bronze      5
                 Gold        3
                 Silver      3
1904     Men     Bronze    121
Name: Medal, dtype: int64

In [4]:
# medal counts per sport, gender and medal type (first 12 rows only)
medalists.groupby(['Sport', 'Gender', 'Medal'])['Medal'].count().iloc[:12]

Sport     Gender  Medal 
Aquatics  Men     Bronze    754
                  Gold      778
                  Silver    757
          Women   Bronze    502
                  Gold      524
                  Silver    513
Archery   Men     Bronze     51
                  Gold       83
                  Silver     72
          Women   Bronze     31
                  Gold       35
                  Silver     33
Name: Medal, dtype: int64

In [5]:
stocks = pd.read_csv('./data/ticker.csv')
stocks

Unnamed: 0,Ticker,Date,High,Low,Open,Close,Volume,Adj Close
0,AAPL,2000-01-03,4.017857,3.631696,3.745536,3.997768,133949200.0,2.665724
1,AMZN,2000-01-04,3.950893,3.613839,3.866071,3.660714,128094400.0,2.440975
2,AIG,2000-01-05,3.948661,3.678571,3.705357,3.714286,194580400.0,2.476697
3,MSFT,2000-01-06,3.821429,3.392857,3.790179,3.392857,191993200.0,2.262367
4,UBR,2000-01-07,3.607143,3.410714,3.446429,3.553571,115183600.0,2.369532
5,GEC,2000-01-10,3.651786,3.383929,3.642857,3.491071,126266000.0,2.327857
6,GAP,2000-01-03,16.160431,15.599305,15.823756,15.711531,10635000.0,6.698944
7,FAB,2000-01-04,15.599305,15.150405,15.459024,15.26263,10734600.0,6.507545
8,ALPH,2000-01-05,15.402911,15.066236,15.066236,15.234573,11722500.0,6.495581
9,ZIX,2000-01-06,15.823756,15.178461,15.26263,15.767643,17479500.0,6.72287


In [6]:
# average High, Low and Volume over all rows sharing the same Date
stocks.groupby('Date')[['High', 'Low', 'Volume']].agg('mean')

Unnamed: 0_level_0,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-03,10.089144,9.615501,72292100.0
2000-01-04,9.775099,9.382122,69414500.0
2000-01-05,9.675786,9.372404,103151450.0
2000-01-06,9.822592,9.285659,104736350.0
2000-01-07,3.607143,3.410714,115183600.0
2000-01-10,3.651786,3.383929,126266000.0


In [7]:
gapminder = pd.read_csv('./data/gapminder_tidy.csv')
gapminder.head()

Unnamed: 0,Country,Year,fertility,life,population,child_mortality,gdp,region
0,Afghanistan,1964,7.671,33.639,10474903.0,339.7,1182.0,South Asia
1,Afghanistan,1965,7.671,34.152,10697983.0,334.1,1182.0,South Asia
2,Afghanistan,1966,7.671,34.662,10927724.0,328.7,1168.0,South Asia
3,Afghanistan,1967,7.671,35.17,11163656.0,323.3,1173.0,South Asia
4,Afghanistan,1968,7.671,35.674,11411022.0,318.1,1187.0,South Asia


In [8]:
# sum the population column within each (region, Country) pair
gapminder.groupby(['region', 'Country'])['population'].sum()

region              Country              
America             Antigua and Barbuda      3.592734e+06
                    Argentina                1.591689e+09
                    Aruba                    3.759294e+06
                    Bahamas                  1.242157e+07
                    Barbados                 1.282605e+07
                    Belize                   9.862290e+06
                    Bolivia                  3.346750e+08
                    Brazil                   7.187255e+09
                    Canada                   1.360364e+09
                    Chile                    6.511178e+08
                    Colombia                 1.628126e+09
                    Costa Rica               1.529606e+08
                    Cuba                     5.086253e+08
                    Dominican Rep.           3.502753e+08
                    Ecuador                  4.950485e+08
                    El Salvador              2.537686e+08
                    French Gui

In [9]:
life = pd.read_csv('./data/life_expectancy.csv', index_col='Country')
life.head()

Unnamed: 0_level_0,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,33.639,34.152,34.662,35.17,35.674,36.172,36.663,37.143,37.614,38.075,...,56.583,57.071,57.582,58.102,58.618,59.124,59.612,60.079,60.524,60.947
Albania,65.475,65.863,66.122,66.316,66.5,66.702,66.948,67.251,67.595,67.966,...,75.725,75.949,76.124,76.278,76.433,76.598,76.78,76.979,77.185,77.392
Algeria,47.953,48.389,48.806,49.205,49.592,49.976,50.366,50.767,51.195,51.67,...,69.682,69.854,70.02,70.18,70.332,70.477,70.615,70.747,70.874,71.0
Angola,34.604,35.007,35.41,35.816,36.222,36.627,37.032,37.439,37.846,38.247,...,48.036,48.572,49.041,49.471,49.882,50.286,50.689,51.094,51.498,51.899
Antigua and Barbuda,63.775,64.149,64.511,64.865,65.213,65.558,65.898,66.232,66.558,66.875,...,74.355,74.544,74.729,74.91,75.087,75.263,75.437,75.61,75.783,75.954


In [10]:
regions = pd.read_csv('./data/regions.csv', index_col='Country')
regions.head()

Unnamed: 0_level_0,region
Country,Unnamed: 1_level_1
Afghanistan,South Asia
Albania,Europe & Central Asia
Algeria,Middle East & North Africa
Angola,Sub-Saharan Africa
Antigua and Barbuda,America


Because both DataFrames are indexed by country name, we can use the region information to group the countries in the life expectancy DataFrame and compute the mean value for 2010.

In [11]:
# mean 2010 life expectancy per region; rows are matched to regions by country index
life['2010'].groupby(regions['region']).mean()

region
America                       74.037350
East Asia & Pacific           73.405750
Europe & Central Asia         75.656387
Middle East & North Africa    72.805333
South Asia                    68.189750
Sub-Saharan Africa            57.575080
Name: 2010, dtype: float64

We can execute multiple aggregations simultaneously by employing the `agg` function and passing it a list or tuple of the string labels for the particular methods to be executed. The `agg` function can also be passed custom or library functions (custom functions need to take a Series and return a single aggregated value).

In [12]:
# Select several columns from a groupby with a *list* of labels (double brackets);
# indexing with a bare tuple of keys is deprecated and rejected by newer pandas.
life.groupby(regions['region'])[['2010', '2011']].agg(['min', 'max', 'mean', 'std'])

Unnamed: 0_level_0,2010,2010,2010,2010,2011,2011,2011,2011
Unnamed: 0_level_1,min,max,mean,std,min,max,mean,std
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
America,45.0,80.986,74.03735,5.891268,60.0,81.157,74.615275,4.256718
East Asia & Pacific,61.994,83.092,73.40575,5.953203,62.15,83.255,73.628875,5.915894
Europe & Central Asia,65.0,82.192,75.656387,5.013072,65.147,82.336,75.813836,5.026381
Middle East & North Africa,60.307,81.295,72.805333,5.098697,60.819,81.473,72.994333,5.04559
South Asia,59.612,76.779,68.18975,5.259232,60.079,77.188,68.53325,5.229722
Sub-Saharan Africa,44.846,78.879,57.57508,7.881809,45.11,79.141,57.99406,7.868391


In [13]:
def data_range(series):
    """Return the difference between the largest and smallest value of ``series``."""
    lowest, highest = series.min(), series.max()
    return highest - lowest

In [14]:
# Use a list of labels to pick the columns; tuple-style selection on a groupby
# is deprecated and rejected by newer pandas.
life.groupby(regions['region'])[['2010', '2011']].agg(data_range)

Unnamed: 0_level_0,2010,2011
region,Unnamed: 1_level_1,Unnamed: 2_level_1
America,35.986,21.157
East Asia & Pacific,21.098,21.105
Europe & Central Asia,17.192,17.189
Middle East & North Africa,20.988,20.654
South Asia,17.167,17.109
Sub-Saharan Africa,34.033,34.031


The `agg` function supports applying a specific aggregation/function to a specific column by employing a Python dict that maps column names to aggregations.

In [15]:
my_dict = {'2010': 'mean', '2011': 'min', '2012': data_range}

In [16]:
# Select every column that my_dict refers to ('2012' included) with a list of labels.
# The previous selection omitted '2012' and used the deprecated tuple-style indexing.
life.groupby(regions['region'])[['2010', '2011', '2012']].agg(my_dict)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,2010,2011,2012
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
America,74.03735,60.0,18.577
East Asia & Pacific,73.40575,62.15,21.13
Europe & Central Asia,75.656387,65.147,17.172
Middle East & North Africa,72.805333,60.819,20.322
South Asia,68.18975,60.079,17.04
Sub-Saharan Africa,57.57508,45.11,34.059


In [17]:
titanic = pd.read_csv('./data/train.csv', index_col='PassengerId')

aggregated = titanic.groupby('Pclass')[['Age', 'Fare']].agg(('max', 'median'))
aggregated

Unnamed: 0_level_0,Age,Age,Fare,Fare
Unnamed: 0_level_1,max,median,max,median
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,80.0,37.0,512.3292,60.2875
2,70.0,29.0,73.5,14.25
3,74.0,24.0,69.55,8.05


In [18]:
# Maximum age within each passenger class
print(aggregated[('Age', 'max')])

# Median fare within each passenger class
print(aggregated[('Fare', 'median')])

Pclass
1    80.0
2    70.0
3    74.0
Name: (Age, max), dtype: float64
Pclass
1    60.2875
2    14.2500
3     8.0500
Name: (Fare, median), dtype: float64


If you have a DataFrame with a multi-level row index, the individual levels can be used to perform the groupby. This allows advanced aggregation techniques to be applied along one or more levels in the index and across one or more columns.

Using the **Gapminder** dataset, create a DataFrame with a multi-level index built from the columns `Year`, `region` and `Country`. Next you'll group the DataFrame by the `Year` and `region` levels. Finally, apply a dictionary aggregation to compute the total population, spread of per capita GDP values and average child mortality rate.

In [19]:
gapminder = pd.read_csv(
    './data/gapminder_tidy.csv', 
    index_col=['Year', 'region', 'Country']
).sort_index()

gapminder.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fertility,life,population,child_mortality,gdp
Year,region,Country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964,America,Antigua and Barbuda,4.25,63.775,58653.0,72.78,5008.0
1964,America,Argentina,3.068,65.388,21966478.0,57.43,8227.0
1964,America,Aruba,4.059,67.113,57031.0,,5505.0
1964,America,Bahamas,4.22,64.189,133709.0,48.56,18160.0
1964,America,Barbados,4.094,62.819,234455.0,64.7,5681.0


In [20]:
# Group gapminder by 'Year' and 'region': by_year_region
by_year_region = gapminder.groupby(level=['Year', 'region'])

In [21]:
# Helper used as a custom aggregation: spread
def spread(series):
    """Return the gap between the maximum and the minimum of ``series``."""
    top = series.max()
    bottom = series.min()
    return top - bottom

In [22]:
# Create the dictionary: aggregator
aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread}

# Aggregate by_year_region using the dictionary: aggregated
aggregated = by_year_region.agg(aggregator)

# Print the last 6 entries of aggregated 
aggregated.tail(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,population,child_mortality,gdp
Year,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,America,962908700.0,17.745833,49634.0
2013,East Asia & Pacific,2244209000.0,22.285714,134744.0
2013,Europe & Central Asia,896878800.0,9.831875,86418.0
2013,Middle East & North Africa,403050400.0,20.2215,128676.0
2013,South Asia,1701241000.0,46.2875,11469.0
2013,Sub-Saharan Africa,920599600.0,76.94449,32035.0


Groupby operations can also be performed on transformations of the index values. In the case of a `DatetimeIndex`, we can extract portions of the datetime over which to group.

We'll read in a set of sample sales data from February 2015 and assign the `Date` column as the index. Group the sales data by the day of the week and aggregate the sum of the `Units` column.

Is there a day of the week that is more popular for customers? To find out, we're going to use `.strftime('%a')` to transform the index datetime values to abbreviated days of the week.

In [23]:
sales = pd.read_csv('./data/sales3.csv', index_col='Date', parse_dates=True)
sales.head()

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 08:30:00,Hooli,Software,3
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14


In [24]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 19 entries, 2015-02-02 08:30:00 to 2015-02-26 09:00:00
Data columns (total 3 columns):
Company    19 non-null object
Product    19 non-null object
Units      19 non-null int64
dtypes: int64(1), object(2)
memory usage: 608.0+ bytes


In [25]:
# total Units sold per abbreviated weekday name derived from the datetime index
sales.groupby(sales.index.strftime('%a'))['Units'].sum()

Mon    48
Sat     7
Thu    59
Tue    13
Wed    48
Name: Units, dtype: int64

## Detecting Outliers using zscore

The `zscore` of a value is its distance from the mean of its population in units of standard deviation. It is useful for finding outliers: a value whose z-score lies beyond +/- 3 is generally considered to be an outlier.

In [26]:
def zscore(series):
    """Standardize ``series``: subtract its mean, then divide by ``series.std()``."""
    centre = series.mean()
    scale = series.std()
    return (series - centre) / scale

In [27]:
auto = pd.read_csv('./data/auto-mpg.csv')
auto.head()

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name
0,18.0,8,307.0,130,3504,12.0,70,US,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,US,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,US,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,US,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,US,ford torino


In [28]:
zscore(auto['mpg']).head()

0   -0.697747
1   -1.082115
2   -0.697747
3   -0.953992
4   -0.825870
Name: mpg, dtype: float64

We can see from the 2nd row that the 'buick skylark 320' was more than one standard deviation below the mean for `mpg` for all the cars listed.

To calculate the zscore for `mpg` by year, use `groupby` on `yr` and `transform`, which applies a function to each group and returns a result aligned with the original rows.

In [29]:
# z-score of mpg computed within each model year; result keeps the original row index
auto['mpg'].groupby(auto['yr']).transform(zscore).head()

0    0.058125
1   -0.503753
2    0.058125
3   -0.316460
4   -0.129168
Name: mpg, dtype: float64

Where we have computations that involve more than one column, we use the `apply` method.

In [30]:
# builds a frame holding the z-scored mpg next to the group's yr and name columns
def zscore_with_year_and_name(group):
    """Return a DataFrame with columns ``mpg`` (z-scored), ``year`` and ``name``.

    ``group`` must provide the columns ``mpg``, ``yr`` and ``name``; the z-score
    is computed over the rows of ``group`` only.
    """
    columns = {}
    columns['mpg'] = zscore(group['mpg'])
    columns['year'] = group['yr']
    columns['name'] = group['name']
    return pd.DataFrame(columns)

In [31]:
# group_keys=False keeps the original row index on the combined result; newer pandas
# would otherwise prepend the 'yr' group key as an extra index level.
auto.groupby('yr', group_keys=False).apply(zscore_with_year_and_name).head()

Unnamed: 0,mpg,year,name
0,0.058125,70,chevrolet chevelle malibu
1,-0.503753,70,buick skylark 320
2,0.058125,70,plymouth satellite
3,-0.31646,70,amc rebel sst
4,-0.129168,70,ford torino


Applying zscore to the **gapminder** dataset to discover outliers:

In [32]:
gapminder = pd.read_csv('./data/gapminder_tidy.csv')
print(gapminder.shape)
gapminder.head()

(10111, 8)


Unnamed: 0,Country,Year,fertility,life,population,child_mortality,gdp,region
0,Afghanistan,1964,7.671,33.639,10474903.0,339.7,1182.0,South Asia
1,Afghanistan,1965,7.671,34.152,10697983.0,334.1,1182.0,South Asia
2,Afghanistan,1966,7.671,34.662,10927724.0,328.7,1168.0,South Asia
3,Afghanistan,1967,7.671,35.17,11163656.0,323.3,1173.0,South Asia
4,Afghanistan,1968,7.671,35.674,11411022.0,318.1,1187.0,South Asia


Generate gapminder dataframe for data from 2010.

In [33]:
# Keep the 2010 rows, index them by country and retain only the indicator columns
gapminder_2010 = (
    gapminder
    .loc[gapminder['Year'] == 2010]
    .set_index('Country')
    .loc[:, ['fertility', 'life', 'population', 'child_mortality', 'gdp', 'region']]
)
gapminder_2010.head()

Unnamed: 0_level_0,fertility,life,population,child_mortality,gdp,region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,5.659,59.612,31411743.0,105.0,1637.0,South Asia
Albania,1.741,76.78,3204284.0,16.6,9374.0,Europe & Central Asia
Algeria,2.817,70.615,35468208.0,27.4,12494.0,Middle East & North Africa
Angola,6.218,50.689,19081912.0,182.5,7047.0,Sub-Saharan Africa
Antigua and Barbuda,2.13,75.437,88710.0,9.9,20567.0,America


We want to normalize the Gapminder data in 2010 for life expectancy and fertility by the z-score per region. Using boolean indexing, we will then select the outlier countries: those with an unusually low life expectancy or an unusually high fertility rate for their region (a z-score more than 3 standard deviations from the regional mean).

In [34]:
# z-score life expectancy and fertility within each region
standardized = gapminder_2010.groupby('region')[['life', 'fertility']].transform(zscore)

# Flag rows with very low life expectancy OR very high fertility for their region: outliers
outliers = standardized['life'].lt(-3) | standardized['fertility'].gt(3)

# Keep only the flagged countries: gm_outliers
gm_outliers = gapminder_2010[outliers]

gm_outliers

Unnamed: 0_level_0,fertility,life,population,child_mortality,gdp,region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Guatemala,3.974,71.1,14388929.0,34.5,6849.0,America
Haiti,3.35,45.0,9993247.0,208.8,1518.0,America
Tajikistan,3.78,66.83,6878637.0,52.6,2110.0,Europe & Central Asia
Timor-Leste,6.237,65.952,1124355.0,63.8,1777.0,East Asia & Pacific


## Filling in Missing values using Groupby and Transform

We'll fill in missing `age` values for passengers on the Titanic with the median age for their `sex` and `pclass`. To do this, we'll group by the `sex` and `pclass` columns and transform each group with a custom function to call `.fillna()` and impute the median value.

In [35]:
titanic = pd.read_csv('./data/titanic.csv')

print(titanic.shape)
titanic.head()

(1309, 14)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [36]:
copy = titanic.copy()
print(copy.shape)

(1309, 14)


In [37]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 143.2+ KB


Write a function called impute_median() that fills missing values with the median of a series. 


In [38]:
def impute_median(series):
    """Return a copy of ``series`` with missing values replaced by its median."""
    middle = series.median()
    return series.fillna(middle)

In [39]:
# Group passengers by sex and passenger class: by_sex_class
by_sex_class = titanic.groupby(by=['sex', 'pclass'])

# Replace missing ages with the median age of the passenger's (sex, pclass) group
titanic['age'] = by_sex_class['age'].transform(impute_median)

print(titanic['age'].tail(10))

1299    27.0
1300    15.0
1301    45.5
1302    25.0
1303    25.0
1304    14.5
1305    22.0
1306    26.5
1307    27.0
1308    29.0
Name: age, dtype: float64


In [40]:
copy['age'] = copy['age'].transform(impute_median)
copy['age'].tail(10)

1299    27.0
1300    15.0
1301    45.5
1302    28.0
1303    28.0
1304    14.5
1305    28.0
1306    26.5
1307    27.0
1308    29.0
Name: age, dtype: float64

The `.apply()` method when used on a groupby object performs an arbitrary function on each of the groups. These functions can be aggregations, transformations or more complex workflows. The `.apply()` method will then combine the results in an intelligent way.

In order to analyze economic disparity within regions of the world using the Gapminder data set for 2010 we'll define a function to compute the aggregate spread of per capita GDP in each region and the individual country's `z-score` of the regional per capita GDP. We'll then select three countries - United States, United Kingdom and China - to see a summary of the regional GDP and that country's `z-score` against the regional mean.

In [41]:
def disparity(gr):
    """Return per-row GDP z-scores alongside the group's GDP spread.

    ``gr`` must provide a ``gdp`` column. The result has two columns:
    ``'z(gdp)'`` (each row's z-score within ``gr``) and
    ``'regional spread(gdp)'`` (max minus min of ``gdp``, repeated on every row).
    """
    gdp = gr['gdp']
    # Spread: distance between the largest and smallest gdp in the group
    spread_value = gdp.max() - gdp.min()
    # z-score: distance of each gdp from the group mean, in standard deviations
    z_scores = (gdp - gdp.mean()) / gdp.std()
    return pd.DataFrame({'z(gdp)': z_scores, 'regional spread(gdp)': spread_value})

In [42]:
# Group gapminder_2010 by 'region': regional
# group_keys=False keeps the Country index on the result of .apply(); otherwise newer
# pandas prepends the region as an extra index level and the .loc lookup by country
# name below no longer matches.
regional = gapminder_2010.groupby('region', group_keys=False)

# Apply the disparity function on regional: reg_disp
reg_disp = regional.apply(disparity)

# Print the disparity of 'United States', 'United Kingdom', and 'China'
print(reg_disp.loc[['United States','United Kingdom','China']])

                  z(gdp)  regional spread(gdp)
Country                                       
United States   3.013374               47855.0
United Kingdom  0.572873               89037.0
China          -0.432756               96993.0


## Filtering groupby results

In [43]:
auto = pd.read_csv('./data/auto-mpg.csv')
auto.head()

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name
0,18.0,8,307.0,130,3504,12.0,70,US,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,US,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,US,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,US,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,US,ford torino


In [44]:
# compute the average 'mpg' by yr
auto.groupby('yr')['mpg'].mean()

yr
70    17.689655
71    21.111111
72    18.714286
73    17.100000
74    22.769231
75    20.266667
76    21.573529
77    23.375000
78    24.061111
79    25.093103
80    33.803704
81    30.185714
82    32.000000
Name: mpg, dtype: float64

What if we want the yearly average of cars built only by Chevrolet?

In order to do so, we need to filter the groupby prior to aggregating.

In [47]:
grp_by_yr = auto.groupby('yr')

print(type(grp_by_yr))
print(type(grp_by_yr.groups))
print(grp_by_yr.groups.keys())

<class 'pandas.core.groupby.groupby.DataFrameGroupBy'>
<class 'dict'>
dict_keys([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])


The `.groups` attribute of the `GroupBy` object is a Python dictionary. Its keys are the distinct values of the column split on, `yr`, and its values are the index labels of the rows belonging to each group.

In [49]:
grp_by_yr.groups[70]

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28],
           dtype='int64')

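Although the groupby object itself is not a dict, we can iterate over it: each iteration yields a `(key, group)` pair, where `group` is the DataFrame of rows for that key.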

In [52]:
# each iteration yields the group key and the DataFrame of rows for that key
for yr, rows_group in grp_by_yr:
    mean_mpg = rows_group['mpg'].mean()
    print('Yr {}, mean mpg {:.2f}'.format(yr, mean_mpg))

Yr 70, mean mpg 17.69
Yr 71, mean mpg 21.11
Yr 72, mean mpg 18.71
Yr 73, mean mpg 17.10
Yr 74, mean mpg 22.77
Yr 75, mean mpg 20.27
Yr 76, mean mpg 21.57
Yr 77, mean mpg 23.38
Yr 78, mean mpg 24.06
Yr 79, mean mpg 25.09
Yr 80, mean mpg 33.80
Yr 81, mean mpg 30.19
Yr 82, mean mpg 32.00


We can filter the group within the `for loop` prior to aggregating (computing mean).

In [53]:
for yr, group in grp_by_yr:
    # restrict the year's rows to names containing 'chevrolet' before averaging
    is_chevrolet = group['name'].str.contains('chevrolet')
    filtered_avg = group.loc[is_chevrolet, 'mpg'].mean()
    print('Yr {}, mean mpg {:.2f}'.format(yr, filtered_avg))

Yr 70, mean mpg 15.67
Yr 71, mean mpg 20.25
Yr 72, mean mpg 15.33
Yr 73, mean mpg 14.83
Yr 74, mean mpg 18.67
Yr 75, mean mpg 17.67
Yr 76, mean mpg 23.25
Yr 77, mean mpg 20.25
Yr 78, mean mpg 23.23
Yr 79, mean mpg 21.67
Yr 80, mean mpg 30.05
Yr 81, mean mpg 23.50
Yr 82, mean mpg 29.00


We can construct a pandas series using a dict comprehension.

In [57]:
# map each year to the mean mpg of that year's rows whose name contains 'chevrolet'
chevy_avg_mpg = {}
for yr, group in grp_by_yr:
    chevy_rows = group['name'].str.contains('chevrolet')
    chevy_avg_mpg[yr] = group.loc[chevy_rows, 'mpg'].mean()

pd.Series(chevy_avg_mpg)

70    15.666667
71    20.250000
72    15.333333
73    14.833333
74    18.666667
75    17.666667
76    23.250000
77    20.250000
78    23.233333
79    21.666667
80    30.050000
81    23.500000
82    29.000000
dtype: float64

Finally, we can perform a one to all comparison. 

In [58]:
chevy = auto['name'].str.contains('chevrolet')
auto.groupby(['yr', chevy])['mpg'].mean()

yr  name 
70  False    17.923077
    True     15.666667
71  False    21.260870
    True     20.250000
72  False    19.120000
    True     15.333333
73  False    17.500000
    True     14.833333
74  False    23.304348
    True     18.666667
75  False    20.555556
    True     17.666667
76  False    21.350000
    True     23.250000
77  False    23.895833
    True     20.250000
78  False    24.136364
    True     23.233333
79  False    25.488462
    True     21.666667
80  False    34.104000
    True     30.050000
81  False    30.433333
    True     23.500000
82  False    32.461538
    True     29.000000
Name: mpg, dtype: float64

`True` are cars made by `chevrolet`, `False` are all other manufacturers. We can see from the results that Chevrolets had poorer mpg in every year bar one, 1976.

By using `.apply()`, we can write functions that filter rows within groups. The `.apply()` method will handle the iteration over individual groups and then re-combine them back into a Series or DataFrame.

We'll take the Titanic data set and analyze survival rates from the `C` deck, which contained the most passengers. To do this we'll group the dataset by `sex` and then use the `.apply()` method on a provided user defined function which calculates the mean survival rates on the 'C' deck:

In [59]:
def c_deck_survival(gr):
    """Return the mean of ``survived`` for the rows of ``gr`` on the 'C' deck.

    A row counts as 'C' deck when its ``cabin`` value starts with ``'C'``;
    rows with a missing ``cabin`` are excluded. ``gr`` must provide the
    columns ``cabin`` and ``survived``.
    """
    # na=False marks missing cabins as non-matching directly and yields a proper
    # boolean mask, avoiding the object-dtype intermediate that .fillna(False) produced.
    c_passengers = gr['cabin'].str.startswith('C', na=False)
    return gr.loc[c_passengers, 'survived'].mean()

In [60]:
# Create a groupby object using titanic over the 'sex' column: by_sex
by_sex = titanic.groupby('sex')

# Call by_sex.apply with the function c_deck_survival
c_surv_by_sex = by_sex.apply(c_deck_survival)

# Print the survival rates
print(c_surv_by_sex)

sex
female    0.913043
male      0.312500
dtype: float64


We can use `groupby` with the `.filter()` method to remove whole groups of rows from a DataFrame based on a boolean condition.

Take the February sales data and remove entries from companies that purchased less than or equal to 35 Units in the whole month.

First, we'll identify how many units each company bought for verification. Next we'll use the `.filter()` method after grouping by `Company` to remove all rows belonging to companies whose sum over the `Units` column was less than or equal to 35. Finally, verify that the three companies whose total Units purchased were less than or equal to 35 have been filtered out from the DataFrame.

In [61]:
# Read the CSV file into a DataFrame: sales
sales = pd.read_csv('./data/sales3.csv', index_col='Date', parse_dates=True)

# Group sales by 'Company': by_company
by_company = sales.groupby('Company')

# Compute the sum of the 'Units' of by_company: by_com_sum
by_com_sum = by_company['Units'].sum()
print(by_com_sum)

# Filter 'Units' where the sum is > 35: by_com_filt
by_com_filt = by_company.filter(lambda g: g['Units'].sum() > 35)
print(by_com_filt)

Company
Acme Coporation    34
Hooli              30
Initech            30
Mediacore          45
Streeplex          36
Name: Units, dtype: int64
                       Company   Product  Units
Date                                           
2015-02-02 21:00:00  Mediacore  Hardware      9
2015-02-04 15:30:00  Streeplex  Software     13
2015-02-09 09:00:00  Streeplex   Service     19
2015-02-09 13:00:00  Mediacore  Software      7
2015-02-19 11:00:00  Mediacore  Hardware     16
2015-02-19 16:00:00  Mediacore   Service     10
2015-02-21 05:00:00  Mediacore  Software      3
2015-02-26 09:00:00  Streeplex   Service      4


## Filtering and Grouping with map

Sometimes, we may want to group by a function/transformation of a column. The key here is that the Series is indexed the same way as the DataFrame. We can also mix and match column grouping with Series grouping.

We'll look at survival rates of passengers on the Titanic by `age` and `pclass`. In particular, the goal is to find out what fraction of children under 10 survived in each `pclass`. We'll do this by first creating a boolean array where `True` is passengers under 10 years old and `False` is passengers aged 10 or over. We'll use `.map()` to change these values to strings.

Finally, we'll group by the under 10 series and the `pclass` column and aggregate the `survived` column. The `survived` column has the value `1` if the passenger survived and `0` otherwise. The mean of the `survived` column is the fraction of passengers who lived.

In [62]:
# Label every passenger by whether their age is below 10: under10
under10 = titanic['age'].lt(10).map({True: 'under 10', False: 'over 10'})

# Survival rate for each age label
survived_mean_1 = titanic.groupby(under10)['survived'].mean()
print(survived_mean_1)

# Survival rate for each (age label, pclass) combination
survived_mean_2 = titanic.groupby([under10, 'pclass'])['survived'].mean()
print(survived_mean_2)

age
over 10     0.366748
under 10    0.609756
Name: survived, dtype: float64
age       pclass
over 10   1         0.617555
          2         0.380392
          3         0.238897
under 10  1         0.750000
          2         1.000000
          3         0.446429
Name: survived, dtype: float64
