In [22]:
import pandas as pd
import numpy as np

# Load the country table; the first CSV column is used as the row index.
df = pd.read_csv('large_countries_2015.csv', index_col=0)

# Divide the population column by one million and keep one decimal place.
df['population'] = (df['population'] / 1000000).round(1)


# Mean of the (rescaled) population column over all rows
average_population = df['population'].mean()


# Mean of the population column within each continent, to two decimals
avg_pop_per_continent = df.groupby('continent')['population'].mean().round(2)

In [28]:
# Show the overall mean next to the per-continent means as one tuple.
(average_population, avg_pop_per_continent)

(375.34999999999997,
 continent
 Africa           182.20
 Asia             503.13
 Europe           143.50
 North America    224.40
 South America    207.80
 Name: population, dtype: float64)

In [23]:
# Six ways of telling ``groupby`` how to split the data.  Only the last
# expression of the cell (``g6.groups``) is displayed.

# 1. by column
g1 = df.groupby('continent')
g1.groups

# 2. by an array of equal length (one flag per row of ``df``)
industrialized = np.array([False, True, True, True, False, True, True, False, False, False, True, True])
g2 = df.groupby(industrialized)
g2.groups

# 3. by a Dictionary with keys on the Index
language = {'Bangladesh':'BN', 'Brazil':'PT', 'China':'CN',
            'India':'BN', 'Indonesia':'MS', 'Japan':'JP',
            'Mexico':'ES', 'Nigeria':'NG', 'Pakistan':'UR',
            'Philippines':'PP', 'Russia':'RU', 'United States':'EN'}
g3 = df.groupby(language)
g3.groups

# 4. by a function (called on every index label)
g4 = df.groupby(len)
g4.groups

# 5. a list of the above
g5 = df.groupby(['continent', language, len])
g5.groups

# 6. group along the x-axis
# ``groupby(..., axis=1)`` is deprecated in recent pandas releases; the
# recommended replacement is to group the rows of the transposed frame.
# Grouping the columns of ``frame.transpose()`` is therefore written as
# grouping the rows of ``frame`` itself: the labels handed to ``len`` and the
# resulting ``.groups`` mapping are the same.
g6 = df[['population', 'fertility']].groupby(len)
g6.groups

{5: ['China', 'India', 'Japan'], 6: ['Brazil', 'Mexico', 'Russia'], 7: ['Nigeria'], 8: ['Pakistan'], 9: ['Indonesia'], 10: ['Bangladesh'], 11: ['Philippines'], 13: ['United States']}

In [24]:
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs.
for continent_name, continent_rows in df.groupby('continent'):
    print(continent_name, continent_rows, '\n')

Africa          population  fertility continent
Nigeria       182.2       5.89    Africa 

Asia              population  fertility continent
Bangladesh        161.0       2.12      Asia
China            1376.0       1.57      Asia
India            1311.1       2.43      Asia
Indonesia         257.6       2.28      Asia
Japan             126.6       1.45      Asia
Pakistan          188.9       3.04      Asia
Philippines       100.7       2.98      Asia 

Europe         population  fertility continent
Russia       143.5       1.61    Europe 

North America                population  fertility      continent
Mexico              127.0       2.13  North America
United States       321.8       1.97  North America 

South America         population  fertility      continent
Brazil       207.8       1.78  South America 



In [25]:
# Group once; every aggregation below (and in later cells) reuses ``g``.
g = df.groupby('continent')

# Built-in reductions: location
g.mean()
g.median()
g.quantile(0.9)

# Built-in reductions: spread and extremes
g.std()
g.min()
g.max()

# Built-in reductions: totals and sizes
g.sum()
g.count()

# Summary statistics, for all columns and for a single selected column
g.describe()
g['population'].describe()

# Several aggregations at once, given as a list of function names
g.agg(['count', 'mean', 'std'])

# A (label, function name) pair sets the label of the result column
g.agg([('Total', 'sum')])

# custom aggregation function with parameter
def sum_greater(dataframe, threshold):
    """Sum the rows whose first-column value is greater than ``threshold``.

    Only the first column of ``dataframe`` decides which rows are kept; the
    remaining columns are summed over those same rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter and sum.
    threshold : scalar
        Rows are kept when their first-column value is strictly greater
        than this value.

    Returns
    -------
    pandas.Series
        Column-wise sum of the kept rows (zeros for numeric columns when no
        row qualifies). For a frame without columns the sum is empty.
    """
    if len(dataframe.columns) == 0:
        # No column to filter on: return an empty sum rather than None.
        return dataframe.sum()

    # The first column is the only one used to build the row filter.
    filter_column = dataframe.columns[0]
    keep_rows = dataframe[filter_column] > threshold
    return dataframe[keep_rows].sum()
    
# ``threshold`` is forwarded to ``sum_greater`` as a keyword argument.
g.aggregate(sum_greater, threshold=200)

Unnamed: 0_level_0,population,fertility
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,0.0,0.0
Asia,2944.7,6.28
Europe,0.0,0.0
North America,321.8,1.97
South America,207.8,1.78


In [26]:
# Transformation selected by the name of a built-in reduction
g.transform(func='mean')

# Transformation given as a callable
g.transform(func=len)

# Transformation with your own function
def normalize(array):
    """Return ``array`` with its own mean subtracted from every value."""
    centre = array.mean()
    return array - centre

# Apply the user-defined ``normalize`` within each group.
g.transform(func=normalize)

# apply any function
def first_two(df):
    """Return the first two rows of ``df`` (fewer if it is shorter)."""
    leading_rows = df.head(n=2)
    return leading_rows

# Run ``first_two`` on each group's sub-frame; as the last expression of the
# cell, the combined result is what gets displayed.
top_two_per_continent = g.apply(first_two)
top_two_per_continent

Unnamed: 0_level_0,Unnamed: 1_level_0,population,fertility,continent
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Nigeria,182.2,5.89,Africa
Asia,Bangladesh,161.0,2.12,Asia
Asia,China,1376.0,1.57,Asia
Europe,Russia,143.5,1.61,Europe
North America,Mexico,127.0,2.13,North America
North America,United States,321.8,1.97,North America
South America,Brazil,207.8,1.78,South America
