In [265]:
import csv
import os
from io import StringIO
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from sqlalchemy import create_engine


In [266]:
# Data source #1: the world-happiness CSV stored in the local Resources folder.
world_happiness_file = os.path.join(
    "Resources",
    "WHR20_DataForFigure2.1.csv",
)


In [267]:
# Data source #2: web page holding the population-by-country table.
url1 = "https://www.worldometers.info/world-population/population-by-country/"

# Data source #3: web page holding the life-expectancy-by-country table.
url2 = (
    "https://www.worldometers.info/demographics/life-expectancy/"
    "#countries-ranked-by-life-expectancy"
)

In [268]:
# Helper that scrapes the first HTML table of a page into a DataFrame
def html_df(url, timeout=30):
    """Download a page and return its first HTML table as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The first ``<table>`` found on the page, with the "#" column
        removed when the table has one.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    ValueError
        If the page contains no ``<table>`` element.
    """
    # Without a timeout a stalled server would hang the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail on an HTTP error page instead of trying to parse it as data.
    response.raise_for_status()

    soup = BS(response.text, "html.parser")
    table_tag = soup.find("table")
    if table_tag is None:
        # soup.find returns None when nothing matches; report that clearly
        # rather than handing the string "None" to the HTML parser.
        raise ValueError(f"No <table> element found at {url}")

    # read_html expects a path/URL or file-like object; passing the raw markup
    # string directly is deprecated in recent pandas releases.
    table = pd.read_html(StringIO(str(table_tag)))[0]
    # Tolerate tables that do not carry the "#" column.
    return table.drop(columns=["#"], errors="ignore")

In [269]:
# Load the three raw sources: one local CSV and two scraped web tables.
world_happiness = pd.read_csv(filepath_or_buffer=world_happiness_file)
population = html_df(url=url1)
life_expectancy = html_df(url=url2)

In [270]:
# List the raw column names of the happiness data (shown as the cell output).
world_happiness.columns

Index(['Country name', 'Regional indicator', 'Ladder score',
       'Standard error of ladder score', 'upperwhisker', 'lowerwhisker',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual'],
      dtype='object')

In [271]:
# Give the country and ladder-score columns short snake_case names.
world_happiness = world_happiness.rename(
    {"Country name": "country", "Ladder score": "happiness_score"},
    axis="columns",
)

In [272]:
# Trim the happiness table down to the country and its score.
world_happiness_df = world_happiness[
    ["country", "happiness_score"]
]

In [273]:
# Display the trimmed happiness table (country + happiness_score).
world_happiness_df

Unnamed: 0,country,happiness_score
0,Finland,7.8087
1,Denmark,7.6456
2,Switzerland,7.5599
3,Iceland,7.5045
4,Norway,7.4880
...,...,...
148,Central African Republic,3.4759
149,Rwanda,3.3123
150,Zimbabwe,3.2992
151,South Sudan,2.8166


In [274]:
# Check the column dtypes so they stay consistent across all dataframes.
# NOTE(review): this inspects the full `world_happiness` frame, whereas the
# equivalent checks for the other two sources inspect the trimmed `*_df`
# frames — confirm whether `world_happiness_df.dtypes` was intended.
world_happiness.dtypes

country                                        object
Regional indicator                             object
happiness_score                               float64
Standard error of ladder score                float64
upperwhisker                                  float64
lowerwhisker                                  float64
Logged GDP per capita                         float64
Social support                                float64
Healthy life expectancy                       float64
Freedom to make life choices                  float64
Generosity                                    float64
Perceptions of corruption                     float64
Ladder score in Dystopia                      float64
Explained by: Log GDP per capita              float64
Explained by: Social support                  float64
Explained by: Healthy life expectancy         float64
Explained by: Freedom to make life choices    float64
Explained by: Generosity                      float64
Explained by: Perceptions of

In [275]:
# List the column names of the scraped population table.
population.columns

Index(['Country (or dependency)', 'Population (2020)', 'Yearly Change',
       'Net Change', 'Density (P/Km²)', 'Land Area (Km²)', 'Migrants (net)',
       'Fert. Rate', 'Med. Age', 'Urban Pop %', 'World Share'],
      dtype='object')

In [276]:
# Give the population columns of interest short snake_case names.
population = population.rename(
    {
        "Country (or dependency)": "country",
        "Population (2020)": "population_2020",
        "Migrants (net)": "migrants_net",
        "World Share": "world_share_percent",
    },
    axis="columns",
)

In [277]:
# Trim the population table down to the four renamed columns.
population_df = population[
    [
        "country",
        "population_2020",
        "migrants_net",
        "world_share_percent",
    ]
]

In [278]:
# Display the trimmed population table; world_share_percent is still text such as "18.47 %".
population_df

Unnamed: 0,country,population_2020,migrants_net,world_share_percent
0,China,1439323776,-348399.0,18.47 %
1,India,1380004385,-532687.0,17.70 %
2,United States,331002651,954806.0,4.25 %
3,Indonesia,273523615,-98955.0,3.51 %
4,Pakistan,220892340,-233379.0,2.83 %
...,...,...,...,...
230,Montserrat,4992,,0.00 %
231,Falkland Islands,3480,,0.00 %
232,Niue,1626,,0.00 %
233,Tokelau,1357,,0.00 %


In [293]:
# Check the column dtypes so they stay consistent across all dataframes.
# world_share_percent shows up as object here because it still holds "NN.NN %" text.
population_df.dtypes

country                 object
population_2020          int64
migrants_net           float64
world_share_percent     object
dtype: object

In [298]:
# Make sure every world_share_percent value is a Python string before the
# percent sign is stripped in the next step.
population_df = population_df.assign(
    world_share_percent=population_df["world_share_percent"].astype(str)
)


In [299]:
# Turn "18.47 %" style text into the plain number 18.47.
# rstrip removes only a trailing "%"/space instead of blindly cutting the last
# character, and the astype(str) round-trip lets this cell be re-run safely
# once the column is already float (a missing value ends up as NaN rather
# than raising). assign() builds a new frame instead of writing into a slice.
population_df = population_df.assign(
    world_share_percent=(
        population_df["world_share_percent"]
        .astype(str)
        .str.rstrip("% ")
        .astype(float)
    )
)

In [300]:
# Re-check the dtypes: world_share_percent should now be float64.
population_df.dtypes

country                 object
population_2020          int64
migrants_net           float64
world_share_percent    float64
dtype: object

In [282]:
# Display the cleaned population table with numeric world_share_percent values.
population_df

Unnamed: 0,country,population_2020,migrants_net,world_share_percent
0,China,1439323776,-348399.0,18.47
1,India,1380004385,-532687.0,17.70
2,United States,331002651,954806.0,4.25
3,Indonesia,273523615,-98955.0,3.51
4,Pakistan,220892340,-233379.0,2.83
...,...,...,...,...
230,Montserrat,4992,,0.00
231,Falkland Islands,3480,,0.00
232,Niue,1626,,0.00
233,Tokelau,1357,,0.00


In [283]:
# List the column names of the scraped life-expectancy table.
life_expectancy.columns

Index(['Country', 'Life Expectancy (both sexes)', 'Females Life Expectancy',
       'Males Life Expectancy'],
      dtype='object')

In [284]:
# Give the country and combined life-expectancy columns snake_case names.
life_expectancy = life_expectancy.rename(
    {
        "Country": "country",
        "Life Expectancy (both sexes)": "life_expectancy_both_sexes",
    },
    axis="columns",
)

In [285]:
# Trim the life-expectancy table down to the country and the combined figure.
life_expectancy_df = life_expectancy[
    ["country", "life_expectancy_both_sexes"]
]

In [286]:
# Display the trimmed life-expectancy table (country + life_expectancy_both_sexes).
life_expectancy_df

Unnamed: 0,country,life_expectancy_both_sexes
0,Hong Kong,85.29
1,Japan,85.03
2,Macao,84.68
3,Switzerland,84.25
4,Singapore,84.07
...,...,...
195,Sierra Leone,55.92
196,Nigeria,55.75
197,Lesotho,55.65
198,Chad,55.17


In [287]:
# Check the column dtypes so they stay consistent across all dataframes.
life_expectancy_df.dtypes

country                        object
life_expectancy_both_sexes    float64
dtype: object

In [288]:
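# how="inner" keeps only the countries present in both frames, so the merge adds no NaN rows.
# Note: the draft below still joins on "Country"; the frames above now use the renamed "country" column.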
#pop_life_exp = life_expectancy.merge(population, how="inner", on="Country")

In [289]:
#pop_life_exp

In [290]:
#world_happiness = pop_life_exp.merge(world_happiness_df, how="inner", on="Country")
#world_happiness

In [291]:
#world_happiness_2020 = world_happiness[["Country","Life Expectancy (both sexes)","Population (2020)","Ladder score"]]
#.set_index("...")
#world_happiness_2020

In [18]:
# Build the PostgreSQL connection from environment variables so that no
# password is stored in the notebook. Every setting except the password falls
# back to the value this notebook used before.
db_user = os.environ.get("ETL_DB_USER", "postgres")
# Required: raises KeyError when ETL_DB_PASSWORD is not set.
db_password = os.environ["ETL_DB_PASSWORD"]
db_host = os.environ.get("ETL_DB_HOST", "localhost")
db_port = os.environ.get("ETL_DB_PORT", "5432")
db_name = os.environ.get("ETL_DB_NAME", "etl_project_db")

# quote_plus escapes characters such as "@" or "/" that would otherwise break
# the URL. The resulting string still contains the password, so do not
# display it as a cell output.
db_connection_string = (
    f"{db_user}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(f"postgresql://{db_connection_string}")