In [1]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Gapminder sample dataset that ships with plotly express
gapminder = px.data.gapminder()

In [3]:
# Preview the first five rows of the dataset
gapminder.head(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,AFG,4
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,AFG,4
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,AFG,4
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4


In [4]:
# Summarise the columns: names, non-null counts and dtypes
gapminder.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
 6   iso_alpha  1704 non-null   object 
 7   iso_num    1704 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 106.6+ KB


In [5]:
# Total of the gdpPercap column over all 1952 rows, using Python's built-in sum.
# NOTE(review): the column name suggests per-capita values, so this is a sum of
# per-country figures rather than a population-weighted world GDP - confirm intent.
sum(gapminder.loc[gapminder["year"] == 1952, "gdpPercap"])

528989.1985038

In [6]:
# Collect the distinct values of the year column; reused by the loop in the next cell
years = gapminder['year'].unique()
years

array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
       2007])

In [7]:
# Total of gdpPercap for every year, computed with an explicit for loop
for year in years:
    rows_for_year = gapminder.query(f"year == {year}")
    total_for_year = sum(rows_for_year["gdpPercap"])
    print(f"gdp in year {year} is " + str(total_for_year))

gdp in year 1952 is 528989.1985038
gdp in year 1957 is 610515.9849724
gdp in year 1962 is 671065.3525586
gdp in year 1967 is 778678.7326506
gdp in year 1972 is 961351.7597701
gdp in year 1977 is 1038469.6317528
gdp in year 1982 is 1067684.0375232
gdp in year 1987 is 1121930.6709476
gdp in year 1992 is 1158522.4099378
gdp in year 1997 is 1290804.9015605
gdp in year 2002 is 1408334.4677854
gdp in year 2007 is 1658570.1984227


In [8]:
# Total of gdpPercap for every year using an aggregate function.
# The column is selected before aggregating so that only gdpPercap is summed,
# rather than summing every column of the frame and discarding the rest.
# [['gdpPercap']]: double brackets [[ ]] return a DataFrame, whereas single
# brackets [ ] return a Series.
gapminder.groupby("year")[['gdpPercap']].agg("sum")

Unnamed: 0_level_0,gdpPercap
year,Unnamed: 1_level_1
1952,528989.2
1957,610516.0
1962,671065.4
1967,778678.7
1972,961351.8
1977,1038470.0
1982,1067684.0
1987,1121931.0
1992,1158522.0
1997,1290805.0


In [9]:
# Plot the yearly gdpPercap totals as a line chart.
# Select the column first so that only gdpPercap is aggregated.
px.line(gapminder.groupby("year")[['gdpPercap']].agg("sum"))

In [10]:
# Largest gdpPercap value of any country in each year
gapminder.groupby("year")[["gdpPercap"]].max()

Unnamed: 0_level_0,gdpPercap
year,Unnamed: 1_level_1
1952,108382.3529
1957,113523.1329
1962,95458.11176
1967,80894.88326
1972,109347.867
1977,59265.47714
1982,33693.17525
1987,31540.9748
1992,34932.91959
1997,41283.16433


In [11]:
# Plot the yearly maximum of gdpPercap.
# Select the column first so that only gdpPercap is aggregated, matching the
# table computed in the previous cell.
px.line(gapminder.groupby("year")[['gdpPercap']].agg("max"))

In [12]:
# Smallest gdpPercap value of any country in each year
gapminder.groupby("year")[["gdpPercap"]].min()

Unnamed: 0_level_0,gdpPercap
year,Unnamed: 1_level_1
1952,298.846212
1957,335.997115
1962,355.203227
1967,349.0
1972,357.0
1977,371.0
1982,424.0
1987,385.0
1992,347.0
1997,312.188423


In [13]:
# Plot the yearly minimum of gdpPercap
px.line(
    gapminder.groupby("year")[["gdpPercap"]].min()
)

In [14]:
# Ratio of the largest to the smallest gdpPercap in each year.
# Build the grouped selection once and reuse it for both aggregates.
gdp_by_year = gapminder.groupby("year")[['gdpPercap']]
largest_gdps = gdp_by_year.agg("max")    # largest gdpPercap of any country, per year
smallest_gdps = gdp_by_year.agg("min")   # smallest gdpPercap of any country, per year

# The result deliberately does not use the name ratio_of_largest_to_smallest:
# the next cell defines a function with that name, and sharing it would let
# whichever cell ran last silently replace the other object.
largest_to_smallest_gdp_ratio = largest_gdps / smallest_gdps
largest_to_smallest_gdp_ratio

Unnamed: 0_level_0,gdpPercap
year,Unnamed: 1_level_1
1952,362.669321
1957,337.869368
1962,268.742242
1967,231.790496
1972,306.296546
1977,159.745221
1982,79.465036
1987,81.92461
1992,100.671238
1997,132.237973


In [15]:
def ratio_of_largest_to_smallest(s):
    """Return the largest value in ``s`` divided by the smallest value in ``s``.

    ``s`` is any iterable accepted by the built-in ``max`` and ``min``.
    """
    largest = max(s)
    smallest = min(s)
    return largest / smallest

# agg() is not limited to names such as "max" or "min": it also accepts an
# arbitrary function, here the ratio helper defined at the top of this cell.
gapminder.groupby("year")[['gdpPercap']].agg(ratio_of_largest_to_smallest)

Unnamed: 0_level_0,gdpPercap
year,Unnamed: 1_level_1
1952,362.669321
1957,337.869368
1962,268.742242
1967,231.790496
1972,306.296546
1977,159.745221
1982,79.465036
1987,81.92461
1992,100.671238
1997,132.237973


In [16]:
# Mean of lifeExp over all rows of each year
gapminder.groupby("year")[["lifeExp"]].agg("mean")

Unnamed: 0_level_0,lifeExp
year,Unnamed: 1_level_1
1952,49.05762
1957,51.507401
1962,53.609249
1967,55.67829
1972,57.647386
1977,59.570157
1982,61.533197
1987,63.212613
1992,64.160338
1997,65.014676


In [17]:
# Median of gdpPercap over all rows of each continent
gapminder.groupby("continent")[["gdpPercap"]].agg("median")

Unnamed: 0_level_0,gdpPercap
continent,Unnamed: 1_level_1
Africa,1192.138217
Americas,5465.509853
Asia,2646.786844
Europe,12081.749115
Oceania,17983.303955


In [18]:
# Applying several aggregate functions (mean, median, std) to one column at once;
# passing a list to agg() produces one output column per function.
gapminder.groupby('continent')[['gdpPercap']].agg(['mean', 'median', 'std'])

Unnamed: 0_level_0,gdpPercap,gdpPercap,gdpPercap
Unnamed: 0_level_1,mean,median,std
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Africa,2193.754578,1192.138217,2827.929863
Americas,7136.110356,5465.509853,6396.764112
Asia,7902.150428,2646.786844,14045.373112
Europe,14469.475533,12081.749115,9355.213498
Oceania,18621.609223,17983.303955,6358.983321


In [19]:
# Group on a boolean condition instead of a column: rows whose pop exceeds
# 300 million (True) versus all other rows (False), then average lifeExp.
gapminder.groupby(gapminder["pop"].gt(300_000_000))[["lifeExp"]].mean()

Unnamed: 0_level_0,lifeExp
pop,Unnamed: 1_level_1
False,59.491833
True,58.306267


In [20]:
# Multiple grouping, step 1: keep only the rows whose continent is Americas or Asia
filtered_gapminder = gapminder.loc[gapminder["continent"].isin(["Americas", "Asia"])]
filtered_gapminder

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4
1,Afghanistan,Asia,1957,30.332,9240934,820.853030,AFG,4
2,Afghanistan,Asia,1962,31.997,10267083,853.100710,AFG,4
3,Afghanistan,Asia,1967,34.020,11537966,836.197138,AFG,4
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4
...,...,...,...,...,...,...,...,...
1675,"Yemen, Rep.",Asia,1987,52.922,11219340,1971.741538,YEM,887
1676,"Yemen, Rep.",Asia,1992,55.599,13367997,1879.496673,YEM,887
1677,"Yemen, Rep.",Asia,1997,58.020,15826497,2117.484526,YEM,887
1678,"Yemen, Rep.",Asia,2002,60.308,18701257,2234.820827,YEM,887


In [21]:
# Group on two keys at once: mean lifeExp for every (continent, country) pair
filtered_gapminder.groupby(["continent", "country"])[["lifeExp"]].agg("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp
continent,country,Unnamed: 2_level_1
Americas,Argentina,69.060417
Americas,Bolivia,52.504583
Americas,Brazil,62.2395
Americas,Canada,74.90275
Americas,Chile,67.430917
Americas,Colombia,63.89775
Americas,Costa Rica,70.181417
Americas,Cuba,71.045083
Americas,Dominican Republic,61.5545
Americas,Ecuador,62.816833


In [22]:
# Order all rows by gdpPercap, smallest first
gapminder.sort_values(by="gdpPercap")

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
334,"Congo, Dem. Rep.",Africa,2002,44.966,55379852,241.165876,COD,180
335,"Congo, Dem. Rep.",Africa,2007,46.462,64606759,277.551859,COD,180
876,Lesotho,Africa,1952,42.138,748747,298.846212,LSO,426
624,Guinea-Bissau,Africa,1952,32.500,580653,299.850319,GNB,624
333,"Congo, Dem. Rep.",Africa,1997,42.587,47798986,312.188423,COD,180
...,...,...,...,...,...,...,...,...
855,Kuwait,Asia,1967,64.624,575003,80894.883260,KWT,414
854,Kuwait,Asia,1962,60.470,358266,95458.111760,KWT,414
852,Kuwait,Asia,1952,55.565,160000,108382.352900,KWT,414
856,Kuwait,Asia,1972,67.712,841934,109347.867000,KWT,414


In [23]:
def get_first_item(s):
    """Return the element at position 0 of ``s`` (positional, via ``.iloc``)."""
    first = s.iloc[0]
    return first

# After sorting by gdpPercap ascending, the first item of each column within a
# year group comes from that year's lowest-gdpPercap row.
(
    gapminder
    .sort_values(by="gdpPercap")
    .groupby("year")
    .agg(get_first_item)
)

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap,iso_alpha,iso_num
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1952,Lesotho,Africa,42.138,748747,298.846212,LSO,426
1957,Lesotho,Africa,45.047,813338,335.997115,LSO,426
1962,Burundi,Africa,42.045,2961915,355.203227,BDI,108
1967,Myanmar,Asia,49.379,25870271,349.0,MMR,104
1972,Myanmar,Asia,53.07,28466390,357.0,MMR,104
1977,Myanmar,Asia,56.059,31528087,371.0,MMR,104
1982,Myanmar,Asia,58.056,34680442,424.0,MMR,104
1987,Myanmar,Asia,58.339,38028578,385.0,MMR,104
1992,Myanmar,Asia,59.32,40546538,347.0,MMR,104
1997,"Congo, Dem. Rep.",Africa,42.587,47798986,312.188423,COD,180


In [24]:
# Same table as the previous cell, using the built-in first() aggregate
(
    gapminder
    .sort_values(by="gdpPercap")
    .groupby("year")
    .first()
)

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap,iso_alpha,iso_num
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1952,Lesotho,Africa,42.138,748747,298.846212,LSO,426
1957,Lesotho,Africa,45.047,813338,335.997115,LSO,426
1962,Burundi,Africa,42.045,2961915,355.203227,BDI,108
1967,Myanmar,Asia,49.379,25870271,349.0,MMR,104
1972,Myanmar,Asia,53.07,28466390,357.0,MMR,104
1977,Myanmar,Asia,56.059,31528087,371.0,MMR,104
1982,Myanmar,Asia,58.056,34680442,424.0,MMR,104
1987,Myanmar,Asia,58.339,38028578,385.0,MMR,104
1992,Myanmar,Asia,59.32,40546538,347.0,MMR,104
1997,"Congo, Dem. Rep.",Africa,42.587,47798986,312.188423,COD,180


In [31]:
# Continents ranked from highest to lowest mean lifeExp
(
    gapminder
    .groupby("continent")[["lifeExp"]]
    .agg("mean")
    .sort_values(by="lifeExp", ascending=False)
)

Unnamed: 0_level_0,lifeExp
continent,Unnamed: 1_level_1
Oceania,74.326208
Europe,71.903686
Americas,64.658737
Asia,60.064903
Africa,48.86533


In [32]:
# Country whose mean gdpPercap (averaged over all of its rows) is the highest.
# idxmax() returns the index label of the largest value directly, so the whole
# table does not have to be sorted just to read off its first row.
gapminder.groupby("country")["gdpPercap"].mean().idxmax()

'Kuwait'

In [33]:
# Sorting the results of multiple aggregates
# NOTE(review): this rebinds filtered_gapminder, which an earlier cell assigned
# to the Americas/Asia subset; re-running that earlier grouping cell after this
# one will see the aggregated table instead.
filtered_gapminder = (
    gapminder
    .groupby("continent")[["lifeExp"]]
    .agg(["mean", "std", "median"])
)

# The column labels are now pairs; join each pair with a space so that a column
# can be addressed by a single string (e.g. "lifeExp median") when sorting.
filtered_gapminder.columns = list(map(" ".join, filtered_gapminder.columns.to_flat_index()))
filtered_gapminder.sort_values(by="lifeExp median", ascending=False)

Unnamed: 0_level_0,lifeExp mean,lifeExp std,lifeExp median
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Oceania,74.326208,3.795611,73.665
Europe,71.903686,5.433178,72.241
Americas,64.658737,9.345088,67.048
Asia,60.064903,11.864532,61.7915
Africa,48.86533,9.15021,47.792


In [35]:
# Applying the column_range function
def column_range(col):
    '''
    Return the range of ``col``, i.e. its maximum minus its minimum.

    ``col`` must provide ``max()`` and ``min()`` methods. For a pandas
    Series the result is a single number; the groupby call in this cell
    passes a one-column selection per group instead, so there the
    subtraction is carried out per column.
    '''
    highest = col.max()
    lowest = col.min()
    return highest - lowest

# Range of gdpPercap within each continent, widest range first
(
    gapminder
    .groupby("continent")[["gdpPercap"]]
    .apply(column_range)
    .sort_values(by="gdpPercap", ascending=False)
)

Unnamed: 0_level_0,gdpPercap
continent,Unnamed: 1_level_1
Asia,113192.1329
Europe,48383.656975
Americas,41750.015936
Oceania,24395.7718
Africa,21710.045883
