# Introduction
A small analysis about life expectancy, population and GDP per capita using a dataset provided by Gapminder Foundation.

The main goals of this project is use pandas and plotly to do descriptive and exploratory analysis and improve the quality of dates with web scraping.

In [225]:
import pandas as pd
import urllib as urlr
from bs4 import BeautifulSoup
import numpy as np
import plotly.express as px


In [226]:
req = urlr.request.Request('https://www.sport-histoire.fr/en/Geography/Countries_by_alphabetical_order.php', 
                            headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
html = urlr.request.urlopen(req)
bs = BeautifulSoup(html, 'html.parser')

In [227]:
bs.find_all('thead')

[<thead><tr><th>Country</th><th>Capital</th><th>Continent</th></tr></thead>]

In [228]:
paises = bs.find_all('tr')

In [229]:
paises.pop(0)

<tr><th>Country</th><th>Capital</th><th>Continent</th></tr>

In [230]:
nome_pais = []
continente = []
capital = []
for information in paises:
    pais = information.get_text(separator='\n').splitlines()
    nome_pais.append(pais[0])
    capital.append(pais[1])
    continente.append(pais[2])

In [231]:
dados_paises = pd.DataFrame({'pais' : nome_pais, 'continente' : continente, 'capital' : capital})

# Descriptive Analysis 
Knowing the dataset and getting information about the economic and social variables.


In [232]:
df = pd.read_csv('data/Gapminder.csv', sep= ';')

In [233]:
df.head(5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.8,8425333,779.45
1,Afghanistan,Asia,1957,30.33,9240934,820.85
2,Afghanistan,Asia,1962,32.0,10267083,853.1
3,Afghanistan,Asia,1967,34.02,11537966,836.2
4,Afghanistan,Asia,1972,36.09,13079460,739.98


In [234]:
dados_paises.head()

Unnamed: 0,pais,continente,capital
0,Afghanistan,Asia,Kabul
1,Albania,Europe,Tirana
2,Algeria,Africa,Algiers
3,Andorra,Europe,Andorra la Vella
4,Angola,Africa,Luanda


In [235]:
print(df.shape, dados_paises.shape)

(3312, 6) (201, 3)


In [236]:
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [237]:
dados_paises.dtypes

pais          object
continente    object
capital       object
dtype: object

In [238]:
df.isna().sum()

country        0
continent    301
year           0
lifeExp        0
pop            0
gdpPercap      0
dtype: int64

As we can see, the dataset has empty values in continent variable, because of that we are going to use the Web Scraping, to improve this variable.

In [239]:
df = df.merge(dados_paises, how = 'left', left_on = 'country', right_on = 'pais')

In [240]:
df = df.drop('pais', axis = 1)

In [241]:
conditions = [
            (pd.isna(df['continent'])),
            (pd.notna(df['continent']))
        ]

choices = [df['continente'], df['continent']]

df['continente'] = np.select(conditions, choices)

In [242]:
df.isna().sum()

country         0
continent     301
year            0
lifeExp         0
pop             0
gdpPercap       0
continente     81
capital       260
dtype: int64

The dataset from the web does not have all countries considered by Gapminder, so we have a few islands without their respective continents.

In [243]:
df.query('continente != continente')['country'].value_counts()

Hong Kong, China         12
Reunion                  12
Sao Tome and Principe    12
French Polynesia          9
New Caledonia             9
Aruba                     8
Micronesia, Fed. Sts.     8
Netherlands Antilles      8
French Guiana             1
Guadeloupe                1
Martinique                1
Name: country, dtype: int64

In [244]:
df['continente'] = df['continente'].str.replace('Americas', 'America')

In [245]:
pd.options.display.float_format = '{:20,.2f}'.format
df.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,3312.0,3312.0,3312.0,3312.0
mean,1980.3,65.25,31614890.82,11317.12
std,16.93,11.77,104119342.89,11369.14
min,1950.0,23.6,59412.0,241.17
25%,1967.0,58.34,2678572.0,2514.63
50%,1982.0,69.61,7557218.5,7838.51
75%,1996.0,73.66,19585221.75,17357.88
max,2007.0,82.67,1318683096.0,113523.13


## Life expectancy

In [246]:
df.sort_values('lifeExp', ascending = False, ignore_index = True).head(5)[['country', 'year', 'lifeExp', 'continente']]

Unnamed: 0,country,year,lifeExp,continente
0,Japan,2006,82.67,Asia
1,Japan,2007,82.6,Asia
2,Japan,2004,82.36,Asia
3,Japan,2005,82.27,Asia
4,"Hong Kong, China",2007,82.21,


In [247]:
df.sort_values('lifeExp', ascending = False, ignore_index = True).tail(5)[['country', 'year', 'lifeExp', 'continente']]

Unnamed: 0,country,year,lifeExp,continente
3307,Sierra Leone,1952,30.33,Africa
3308,Angola,1952,30.02,Africa
3309,Gambia,1952,30.0,Africa
3310,Afghanistan,1952,28.8,Asia
3311,Rwanda,1992,23.6,Africa


In [248]:
(df.groupby(by = ['country'], as_index = False).min().
    sort_values('lifeExp', ascending = False, ignore_index = True).tail(5)
    [['country', 'year', 'lifeExp', 'continente']])

Unnamed: 0,country,year,lifeExp,continente
182,Sierra Leone,1952,30.33,Africa
183,Angola,1952,30.02,Africa
184,Gambia,1952,30.0,Africa
185,Afghanistan,1952,28.8,Asia
186,Rwanda,1952,23.6,Africa


In [249]:
(df.groupby(by = ['country'], as_index = False).min().
    sort_values('lifeExp', ascending = False, ignore_index = True).head(5)
    [['country', 'year', 'lifeExp', 'continente']])

Unnamed: 0,country,year,lifeExp,continente
0,Martinique,2002,78.78,
1,Guadeloupe,2002,78.41,
2,French Guiana,2002,75.14,
3,Norway,1950,71.6,Europe
4,Netherlands,1950,71.45,Europe


In [250]:
(df.groupby(by = ['country'], as_index = False).max().
    sort_values('lifeExp', ascending = False, ignore_index = True).tail(5)
    [['country', 'year', 'lifeExp', 'continente']])

Unnamed: 0,country,year,lifeExp,continente
182,Rwanda,2007,46.24,Africa
183,Liberia,2007,46.03,Africa
184,Afghanistan,2007,43.83,Asia
185,Angola,2007,42.73,Africa
186,Sierra Leone,2007,42.57,Africa


In [251]:
(df.groupby(by = ['country'], as_index = False).max().
    sort_values('lifeExp', ascending = False, ignore_index = True).head(5)
    [['country', 'year', 'lifeExp', 'continente']])

Unnamed: 0,country,year,lifeExp,continente
0,Japan,2007,82.67,Asia
1,"Hong Kong, China",2007,82.21,
2,Iceland,2007,81.76,Europe
3,Switzerland,2007,81.74,Europe
4,Australia,2007,81.23,Oceania


The disparity is cruel and notable, most of the countries with good life expectancy are in Asia and Europe, but with good heterogeneity. 

Otherwise, the worst countries in this variable are always in Africa, the only exception is Afghanistan, which is in every list of worst life expectancy. 

In [252]:
(df.groupby(by = ['continente'], as_index = False).max().
    sort_values('lifeExp', ascending = False, ignore_index = True)
    [['country', 'year', 'lifeExp', 'continente']])


Dropping invalid columns in DataFrameGroupBy.max is deprecated. In a future version, a TypeError will be raised. Before calling .max, select only columns which should be valid for the function.



Unnamed: 0,country,year,lifeExp,continente
0,"Yemen, Rep.",2007,82.67,Asia
1,United Kingdom,2007,81.76,Europe
2,Vanuatu,2007,81.23,Oceania
3,Venezuela,2007,80.65,America
4,Zimbabwe,2007,73.95,Africa
5,Ukraine,2007,72.96,FSU


In [253]:
(df.groupby(by = ['continente'], as_index = False).min().
    sort_values('lifeExp', ascending = False, ignore_index = True)
    [['country', 'year', 'lifeExp', 'continente']])


Dropping invalid columns in DataFrameGroupBy.min is deprecated. In a future version, a TypeError will be raised. Before calling .min, select only columns which should be valid for the function.



Unnamed: 0,country,year,lifeExp,continente
0,Belarus,1950,57.3,FSU
1,Albania,1950,43.59,Europe
2,Australia,1950,42.52,Oceania
3,Argentina,1950,37.58,America
4,Afghanistan,1950,28.8,Asia
5,Algeria,1950,23.6,Africa


## GDP per capita 

In [254]:
df.sort_values('gdpPercap', ascending = False, ignore_index = True).head(5)[['country', 'year', 'gdpPercap', 'continente']]

Unnamed: 0,country,year,gdpPercap,continente
0,Kuwait,1957,113523.13,Asia
1,Kuwait,1972,109347.87,Asia
2,Kuwait,1952,108382.35,Asia
3,Kuwait,1962,95458.11,Asia
4,Qatar,2007,82010.98,Asia


In [255]:
df.sort_values('gdpPercap', ascending = False, ignore_index = True).tail(5)[['country', 'year', 'gdpPercap', 'continente']]

Unnamed: 0,country,year,gdpPercap,continente
3307,"Congo, Dem. Rep.",1997,312.19,Africa
3308,Guinea-Bissau,1952,299.85,Africa
3309,Lesotho,1952,298.85,Africa
3310,"Congo, Dem. Rep.",2007,277.55,Africa
3311,"Congo, Dem. Rep.",2002,241.17,Africa


As we can see, a low variaty of countries in high and low values of GDP per capita, in high GDP only Kuwait and Qatar from Asia, anda in low only countries from Africa, but the years of low and high are heterogeneity.

In [256]:
(df.groupby(by = ['country'], as_index = False).max().
    sort_values('gdpPercap', ascending = False, ignore_index = True).tail(5)
    [['country', 'year', 'gdpPercap', 'continente']])

Unnamed: 0,country,year,gdpPercap,continente
182,Liberia,2007,803.01,Africa
183,Zimbabwe,2007,799.36,Africa
184,Malawi,2007,759.35,Africa
185,Ethiopia,2007,690.81,Africa
186,Burundi,2007,631.7,Africa


In [257]:
(df.groupby(by = ['country'], as_index = False).max().
    sort_values('gdpPercap', ascending = False, ignore_index = True).head(5)
    [['country', 'year', 'gdpPercap', 'continente']])

Unnamed: 0,country,year,gdpPercap,continente
0,Kuwait,2007,113523.13,Asia
1,Qatar,2007,82010.98,Asia
2,Brunei,2007,72555.23,Asia
3,Luxembourg,2005,70014.0,Europe
4,"Macao, China",2007,54589.82,Asia


In [258]:
(df.groupby(by = ['continente'], as_index = False).mean().
    sort_values('gdpPercap', ascending = False, ignore_index = True)
    [['continente', 'gdpPercap']])

Unnamed: 0,continente,gdpPercap
0,Europe,16551.18
1,Oceania,13242.92
2,America,10527.31
3,Asia,9762.59
4,FSU,7903.07
5,Africa,2138.09


# Population

In [259]:
df.sort_values('pop', ascending = False, ignore_index = True).head(5)[['country', 'year', 'pop', 'continente']]

Unnamed: 0,country,year,pop,continente
0,China,2007,1318683096,Asia
1,China,2002,1280400000,Asia
2,China,1997,1230075000,Asia
3,China,1992,1164970000,Asia
4,India,2007,1110396331,Asia


In [260]:
df.sort_values('pop', ascending = False, ignore_index = True).tail(5)[['country', 'year', 'pop', 'continente']]

Unnamed: 0,country,year,pop,continente
3307,Sao Tome and Principe,1957,61325,
3308,"Micronesia, Fed. Sts.",1972,60427,
3309,Sao Tome and Principe,1952,60011,
3310,Aruba,1972,59461,
3311,Aruba,1977,59412,


In [261]:
(df.groupby(by = ['country'], as_index = False).max().
    sort_values('pop', ascending = False, ignore_index = True).head(5)
    [['country', 'year', 'pop', 'continente']])

Unnamed: 0,country,year,pop,continente
0,China,2007,1318683096,Asia
1,India,2007,1110396331,Asia
2,United States,2007,301139947,America
3,Indonesia,2007,223547000,Asia
4,Brazil,2007,190010647,America


In [262]:
(df.groupby(by = ['continente'], as_index = False).mean().
    sort_values('pop', ascending = False, ignore_index = True)
    [['continente', 'pop']])

Unnamed: 0,continente,pop
0,Asia,94000364.33
1,America,41070439.06
2,FSU,34950663.25
3,Europe,15315943.58
4,Africa,10097526.49
5,Oceania,6275883.32


# Graphics

In [263]:
fig = px.line(df.groupby(by = ['year'], as_index = False).sum(), x="year", y="pop", title='Population in the world across the years', 
                labels = {'pop' : 'Population', 'year' : 'Year'})
fig.show()

Weird graphic.

In [264]:
fig = px.line((df.groupby(by = ['continente', 'year'], as_index = False).mean().
                sort_values('year', ascending = False, ignore_index = True)
                [['year', 'lifeExp', 'continente']]), 
            x="year", y="lifeExp", color = 'continente', title='Life Expectancy in Continents', 
            labels = {'lifeExp' : 'Life Expectancy', 'year' : 'Year', 'continente' : 'Continent'})
fig.show()

In [265]:
fig = px.line((df.groupby(by = ['continente', 'year'], as_index = False).mean().
                sort_values('year', ascending = False, ignore_index = True)
                [['year', 'gdpPercap', 'continente']]), 
            x="year", y="gdpPercap", color = 'continente', title='GDP per capita in Continents',
            labels = {'gdpPercap' : 'GDP per capita', 'year' : 'Year', 'continente' : 'Continent'})
fig.show()

In [266]:
countries = ['Afghanistan', 'Argentina', 'Venezuela', 'Japan', 'Hong Kong, China', 'Algeria', 'Sierra Leone', 'Rwanda']
fig = px.line((df.groupby(by = ['country', 'year'], as_index = False).mean().
                sort_values('year', ascending = False, ignore_index = True)
                [['year', 'lifeExp', 'country']]).query("country in @countries", engine = 'python'), 
            x="year", y="lifeExp", color = 'country', title='Life expectancy in some countries', 
            labels = {'lifeExp' : 'Life Expectancy', 'year' : 'Year', 'country' : 'Countries'})
fig.show()

In [267]:
countries = ['Kuwait', 'Congo, Dem. Rep.', 'Venezuela', 'Korea, Rep.', 'Hong Kong, China', 'Sierra Leone', 'Brazil', 'Qatar']
fig = px.line((df.groupby(by = ['country', 'year'], as_index = False).mean().
                sort_values('year', ascending = False, ignore_index = True)
                [['year', 'gdpPercap', 'country']]).query("country in @countries", engine = 'python'), 
            x="year", y="gdpPercap", color = 'country', title='Population in the world across years')
fig.show()

In [268]:
countries = ['Kuwait', 'Venezuela', 'Korea, Rep.', 'Qatar']
fig = px.line(df.query("country in @countries", engine = 'python'), x="lifeExp", y="gdpPercap", color="country", text="year", 
            labels = {'lifeExp' : 'Life Expectancy', 'gdpPercap' : 'GDP per Capita', 'country' : 'Countries'})
fig.update_traces(textposition="bottom right")
fig.show()

In [269]:
countries = ['Sierra Leone', 'Japan', 'Brazil', 'United States']
fig = px.line(df.query("country in @countries", engine = 'python'), x="lifeExp", y="gdpPercap", color="country", text="year", 
            labels = {'lifeExp' : 'Life Expectancy', 'gdpPercap' : 'GDP per Capita', 'country' : 'Countries'})
fig.update_traces(textposition="bottom right")
fig.show()