### 1. Importing needed libraries

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os

### 2. Requesting the URL and inspecting the page layout

In [2]:
# Download the Wikipedia article and parse it into a BeautifulSoup tree.
url = "https://en.wikipedia.org/wiki/World_Happiness_Report"
# A timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "lxml")


### 3. Selecting the correct table

In [3]:
# Collect every table whose class attribute is "wikitable sortable" and keep
# the first one on the page.
sortable_tables = soup.find_all("table", attrs={"class": "wikitable sortable"})
table = sortable_tables[0]

### 4. Checking on rows, layout, data and selecting needed info

In [4]:
# Turn the HTML table into a list of rows, each row being a list of cell texts.
# Cells are read one <th>/<td> at a time rather than by splitting the row text
# on commas, so a cell that contains a comma or is empty keeps its position.
table_rows = table.find_all("tr")
rows = [
    # Non-breaking spaces become plain spaces before the surrounding
    # whitespace is trimmed.
    [cell.get_text().replace("\xa0", " ").strip() for cell in table_row.find_all(["th", "td"])]
    for table_row in table_rows
]
# Peek at the header row and the first data row.
rows[:2]

[['Overall rank',
  'Country or region',
  'Score',
  'GDP per capita',
  'Social support',
  'Healthy life expectancy',
  'Freedom to make life choices',
  'Generosity',
  'Perceptions of corruption'],
 ['1',
  'Finland',
  '7.632',
  '1.305',
  '1.592',
  '0.874',
  '0.681',
  '0.202',
  '0.393']]

### 5. Creating the dataframe

In [5]:
# The first scraped row holds the column headers; the remaining rows are the records.
colnames, data = rows[0], rows[1:]

happiness_2018_extended = pd.DataFrame(data=data, columns=colnames)
happiness_2018_extended.head(20)


Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357
5,6,Netherlands,7.441,1.361,1.488,0.878,0.638,0.333,0.295
6,7,Canada,7.328,1.33,1.532,0.896,0.653,0.321,0.291
7,8,New Zealand,7.324,1.268,1.601,0.876,0.669,0.365,0.389
8,9,Sweden,7.314,1.355,1.501,0.913,0.659,0.285,0.383
9,10,Australia,7.272,1.34,1.573,0.91,0.647,0.361,0.302


### 6. Basic data cleaning + data selecting

In [6]:
# Move into the project's data folder so the relative paths used below resolve.
# The guard makes the cell safe to re-run: without it, a second execution would
# apply "../Data" relative to the already-changed working directory.
# NOTE(review): assumes the target folder's basename is exactly "Data" - confirm.
if os.path.basename(os.getcwd()) != "Data":
    os.chdir("../Data")
# Display the resulting working directory as the cell output.
os.getcwd()

#####  saving original to csv (to avoid data loss)

In [7]:
# Persist the untouched scrape so later steps can reload it without re-scraping.
# The path is relative to the current working directory. to_csv writes the row
# index by default, which shows up as an unnamed first column when read back.
raw_backup_path = "backup_raw_data/happiness_2018_extended.csv"
happiness_2018_extended.to_csv(raw_backup_path)

##### selecting countries for research

In [8]:
# Countries selected for the analysis:
# Spain, Netherlands, France, Denmark, Sweden, Italy, Poland, Belgium, Greece, Austria

In [9]:
# Countries kept for the analysis; names must match the "Country or region"
# column of the scraped table exactly.
country_list = [
    "Spain",
    "Netherlands",
    "France",
    "Greece",
    "Austria",
    "Denmark",
    "Belgium",
    "Italy",
    "Poland",
    "Sweden",
]

In [11]:
# Reload the raw backup from the same relative path it was written to, then
# keep only the countries selected for the analysis.
# read_csv re-infers column dtypes; presumably this is what turns the scraped
# text values into numbers - verify with .dtypes if it matters.
happiness_2018_all = pd.read_csv("backup_raw_data/happiness_2018_extended.csv")
happiness_2018 = happiness_2018_all[happiness_2018_all["Country or region"].isin(country_list)]

# Guard against silent data loss: a misspelt or renamed country would otherwise
# just vanish from the filtered frame.
missing_countries = sorted(set(country_list) - set(happiness_2018["Country or region"]))
assert not missing_countries, "countries missing from the scraped table: {}".format(missing_countries)

In [12]:
# Shorten the column names and drop the columns that are not analysed.
# The cell overwrites its own input, so it has to tolerate being run twice:
# rename() already ignores missing keys, and errors="ignore" does the same for
# drop(), which would otherwise raise KeyError on the second run.
happiness_2018 = (
    happiness_2018
    .rename(columns={"Country or region": "Country",
                     # only applies if the source table names the column this way
                     "Happiness score": "Score",
                     "Generosity": "Generosity importance",
                     "GDP per capita": "GDP",
                     "Freedom to make life choices": "Freedom"})
    # The analysis is by score, not by rank; "Unnamed: 0" is the index column
    # that came back from the CSV backup.
    .drop(columns=["Overall rank", "Unnamed: 0"], errors="ignore")
)

In [13]:
# Show the cleaned frame (one row per selected country) to check the renamed columns.
happiness_2018

Unnamed: 0,Country,Score,GDP,Social support,Healthy life expectancy,Freedom,Generosity importance,Perceptions of corruption
2,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
5,Netherlands,7.441,1.361,1.488,0.878,0.638,0.333,0.295
8,Sweden,7.314,1.355,1.501,0.913,0.659,0.285,0.383
11,Austria,7.139,1.341,1.504,0.891,0.617,0.242,0.224
15,Belgium,6.927,1.324,1.483,0.894,0.583,0.188,0.24
22,France,6.489,1.293,1.466,0.908,0.52,0.098,0.176
35,Spain,6.31,1.251,1.538,0.965,0.449,0.142,0.074
41,Poland,6.123,1.176,1.448,0.781,0.546,0.108,0.064
46,Italy,6.0,1.264,1.501,0.946,0.281,0.137,0.028
78,Greece,5.358,1.154,1.202,0.879,0.131,0.0,0.044


##### save new to csv to be able to work on it

In [14]:
# Write the cleaned selection to disk for the follow-up analysis.
# The path is relative to the current working directory, and the row index is
# written as well because to_csv includes it by default.
clean_output_path = "../Data/happiness_2018.csv"
happiness_2018.to_csv(clean_output_path)