In [134]:
# This file is responsible for scraping data from multiple sources
# into a single dataset that will be used to visualize data and conduct analysis

# The product from this file is the "dataset.csv" file in the repository

In [95]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [96]:
# Countries' life expectancies (United Nations Development Programme 2019)

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_life_expectancy'

# Bound the wait and fail loudly on a non-2xx response, so that an error page
# is never handed to the HTML parser as if it were the article.
request = requests.get(url, timeout=30)
request.raise_for_status()
soup = BeautifulSoup(request.text, 'html.parser')

tables = soup.find_all('table', class_ = 'wikitable')
# pandas >= 2.1 deprecates passing literal HTML to read_html, so wrap it in StringIO.
# Index 3 selects the fourth 'wikitable' found on the page; it is position-dependent
# and must be re-checked if the page layout changes.
lifeExp = pd.read_html(StringIO(str(tables)))[3]

# Drop the top level of the multi-level column header, then rename the
# country column to 'Country', the key the later merges join on.
lifeExp = lifeExp.droplevel(0, axis=1)
lifeExp = lifeExp.rename(columns={'Countries and regions': 'Country'})

In [121]:
# Preview the first five rows of the life-expectancy table.
lifeExp.head(n=5)

Unnamed: 0,Country,All,M,F,Gender life gap
0,Hong Kong,84.9,82.0,87.7,5.7
1,Japan,84.6,81.5,87.7,6.2
2,Italy,84.0,81.9,86.1,4.2
3,Switzerland,83.8,81.9,85.6,3.7
4,Singapore,83.6,81.5,85.7,4.2


In [98]:
# Countries and the continents / regions they are located in

url = 'https://statisticstimes.com/geography/countries-by-continents.php'

# Bound the wait and fail loudly on a non-2xx response, so that an error page
# is never handed to the HTML parser as if it were the data page.
request = requests.get(url, timeout=30)
request.raise_for_status()
soup = BeautifulSoup(request.text, 'html.parser')

tables = soup.find_all('table', class_ = 'display')
# pandas >= 2.1 deprecates passing literal HTML to read_html, so wrap it in StringIO.
# Index 1 selects the second 'display' table found on the page; it is position-dependent
# and must be re-checked if the page layout changes.
continents = pd.read_html(StringIO(str(tables)))[1]

# Rename the country column to 'Country' (the key the later merges join on)
# and drop the identifier / secondary-region columns that are not used downstream.
continents = continents.rename(columns={'Country or Area': 'Country', 'Region 1': 'Region'})
continents = continents.drop(columns=['No','ISO-alpha3 Code','M49 Code','Region 2'])

In [122]:
# Preview the first five rows of the country-to-region/continent table.
continents.head(n=5)

Unnamed: 0,Country,Region,Continent
0,Afghanistan,Southern Asia,Asia
1,Åland Islands,Northern Europe,Europe
2,Albania,Southern Europe,Europe
3,Algeria,Northern Africa,Africa
4,American Samoa,Polynesia,Oceania


In [108]:
# Countries by infant and under-five mortality rates (World Bank 2019 Estimates)
# Intended to give some indication of healthcare quality (deaths / 1000 live births)

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_infant_and_under-five_mortality_rates'

# Bound the wait and fail loudly on a non-2xx response, so that an error page
# is never handed to the HTML parser as if it were the article.
request = requests.get(url, timeout=30)
request.raise_for_status()
soup = BeautifulSoup(request.text, 'html.parser')

tables = soup.find_all('table', class_ = 'wikitable')
# pandas >= 2.1 deprecates passing literal HTML to read_html, so wrap it in StringIO.
# Index 0 selects the first 'wikitable' found on the page (position-dependent).
mortRate = pd.read_html(StringIO(str(tables)))[0]
mortRate = mortRate.rename(columns={'Country or territory': 'Country',
                                    '2019 mortality rate, under-5 (per 1000 live births)': 'U5 Mortality Rate'})

# Fix country names by removing the last two characters of every value.
# NOTE(review): presumably each scraped name ends in a two-character marker;
# any name without one would be truncated and then fail to match in the merge -- confirm.
mortRate['Country'] = mortRate['Country'].str[:-2]

In [123]:
# Preview the first five rows of the under-five mortality table.
mortRate.head(n=5)

Unnamed: 0,Country,U5 Mortality Rate
0,Afghanistan,60.3
1,Albania,9.7
2,Algeria,23.3
3,Andorra,3.0
4,Angola,74.7


In [118]:
# Countries and Global Peace Index (2019-2021 GPI)
# Lower score means country has higher peacefulness

url = 'https://en.wikipedia.org/wiki/Global_Peace_Index'

# Bound the wait and fail loudly on a non-2xx response, so that an error page
# is never handed to the HTML parser as if it were the article.
request = requests.get(url, timeout=30)
request.raise_for_status()
soup = BeautifulSoup(request.text, 'html.parser')

tables = soup.find_all('table', class_ = 'wikitable')
# pandas >= 2.1 deprecates passing literal HTML to read_html, so wrap it in StringIO.
# Index 2 selects the third 'wikitable' found on the page (position-dependent).
GPI = pd.read_html(StringIO(str(tables)))[2]

# Keep only the country and score columns. The score label must match the scraped
# header exactly, including its '[13]' suffix, or this selection raises a KeyError.
GPI = GPI[['Country', '2019-2021score[13]']]
GPI = GPI.rename(columns={'2019-2021score[13]': 'GPI Score'})

In [124]:
# Preview the first five rows of the Global Peace Index table.
GPI.head(n=5)

Unnamed: 0,Country,GPI Score
0,Iceland,1.072
1,New Zealand,1.221
2,Portugal,1.274
3,Austria,1.291
4,Denmark,1.316


So far we have 2019 data on life expectancies in different countries. We also have several datasets describing these countries: their geographical location by continent and region, their infant and under-five mortality rates as an indicator of general healthcare quality, and their Global Peace Index scores. Now we will join these tables into a single comprehensive dataset, which we will use to create visualizations and to analyze the relationships between these factors and life expectancy in different countries.

In [129]:
# Inner-join the four tables on the shared 'Country' key. A country is kept
# only if its name appears, spelled identically, in every one of the sources.
final = (
    lifeExp
    .merge(continents, how="inner", on="Country")
    .merge(mortRate, how="inner", on="Country")
    .merge(GPI, how="inner", on="Country")
)

In [131]:
# Preview the first five rows of the merged dataset.
final.head(n=5)

Unnamed: 0,Country,All,M,F,Gender life gap,Region,Continent,U5 Mortality Rate,GPI Score
0,Japan,84.6,81.5,87.7,6.2,Eastern Asia,Asia,2.5,1.369
1,Italy,84.0,81.9,86.1,4.2,Southern Europe,Europe,3.1,1.754
2,Switzerland,83.8,81.9,85.6,3.7,Western Europe,Europe,4.0,1.375
3,Singapore,83.6,81.5,85.7,4.2,South-eastern Asia,Asia,2.5,1.347
4,Spain,83.5,80.8,86.2,5.4,Southern Europe,Europe,3.1,1.699


In [133]:
# Write the merged dataset to disk without the DataFrame index column.
final.to_csv(path_or_buf="dataset.csv", index=False)