# Web scraping

In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by the libraries
# used below -- consider a narrower filter.
warnings.filterwarnings('ignore')

# Health Care Ranking for Different European Countries

In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import csv
import re
import urllib.request as urllib2
from datetime import datetime
import os
import sys
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [3]:
url = 'https://en.wikipedia.org/wiki/Healthcare_in_Europe'
# Bound the wait and fail loudly on an HTTP error instead of silently
# parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
HCE = BeautifulSoup(r.text, 'html.parser')
type(HCE)

bs4.BeautifulSoup

In [4]:
# Fetch the page again as raw byte lines; the `with` block closes the HTTP
# response once the lines have been read.
with urllib2.urlopen(url) as webpage:
    htmlpage = webpage.readlines()

# Keep only the lines that contain the text 'table class'.
lst = []
for line in htmlpage:
    # str() of a bytes object yields its repr (b'...'), not decoded text;
    # kept as-is so the collected lines look the same as before.
    line = str(line).rstrip()
    if re.search('table class', line):
        lst.append(line)

In [5]:
# Number of raw HTML lines that matched the 'table class' search.
len(lst)

5

In [6]:
# Show the matching lines; each one is the str() of a bytes line, hence the
# b'...' wrapper around the HTML.
lst

['b\'<table class="wikitable floatright sortable" style="font-size: 90%">\\n\'',
 'b\'<div class="navbox-styles nomobile"><style data-mw-deduplicate="TemplateStyles:r1061467846">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output .navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output .navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output .navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output .navbox-list-with-group{text-align:left;bor

In [7]:
# Pass the attribute filter as a dict (attribute name -> value); the previous
# {'class', '...'} was a set literal, not a mapping.
table = HCE.find('table', {'class': 'wikitable floatright sortable'})

In [8]:
# Check the type of the object returned by HCE.find().
type(table)

bs4.element.Tag

In [9]:
# First matching line: the opening <table ...> tag of the ranking table.
x = lst[0]
# Every double-quoted value on that line; the first one is the class string.
extr = re.findall('"([^"]*)"', x)
# Attribute filter must be a dict (name -> value); the previous
# {'class', extr[0]} was a set literal.
table = HCE.find('table', {'class': extr[0]})

In [10]:
# Check the type of the object returned by the class-based HCE.find() lookup.
type(table)

bs4.element.Tag

In [11]:
# Text of every <th> cell of the table, in document order; used as the
# column labels of the DataFrame.
headers = [th.get_text() for th in table.find_all('th')]

In [12]:
# Display the header texts collected from the table's <th> cells.
headers

['WorldRank\n', 'EURank\n', 'Country\n', 'Life expectancyat birth (years)\n']

In [13]:
# One list of cell texts per <tr>. A row without any <td> cells yields an
# empty list, which shows up as an all-missing row in the DataFrame.
# The former .encode('utf8').decode() round trip returned the same string
# and has been dropped.
rows = [
    [val.text for val in row.find_all('td')]
    for row in table.find_all('tr')
]

In [14]:
# Assemble the scraped rows into a frame labelled with the <th> texts.
df1 = pd.DataFrame(data=rows, columns=headers)

In [15]:
# Preview the first seven rows of the raw (not yet cleaned) ranking table.
df1.head(7)

Unnamed: 0,WorldRank\n,EURank\n,Country\n,Life expectancyat birth (years)\n
0,,,,
1,5.\n,1.\n,Spain\n,83.4\n
2,6.\n,2.\n,Italy\n,83.4\n
3,11.\n,3.\n,Sweden\n,82.7\n
4,12.\n,4.\n,France\n,82.5\n
5,13.\n,5.\n,Malta\n,82.4\n
6,16.\n,6.\n,Ireland\n,82.1\n


# Health Expenditure

In [16]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_total_health_expenditure_per_capita'
# Bound the wait and fail loudly on an HTTP error instead of parsing an
# error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Explicit parser: the result no longer depends on which optional parsers
# are installed.
HCE = BeautifulSoup(r.text, 'html.parser')

# Second fetch as raw byte lines; `with` closes the HTTP response.
with urllib2.urlopen(url) as webpage:
    htmlpage = webpage.readlines()

# Keep the lines containing 'table class' (as str() of the bytes line).
lst = []
for line in htmlpage:
    line = str(line).rstrip()
    if re.search('table class', line):
        lst.append(line)

# The second matching line is used here; it is printed so the chosen
# <table> tag can be checked by eye.
x = lst[1]
print(x)

# First double-quoted value on the line is the table's class string.
extr = re.findall('"([^"]*)"', x)
# Attribute filter as a dict (name -> value), not a set literal.
table = HCE.find('table', {'class': extr[0]})

# Column labels from the <th> cells, with newlines removed.
headers = [header.text.replace("\n", "") for header in table.find_all('th')]
# One list of <td> texts per <tr>; rows without <td> cells become empty lists.
rows = [
    [val.text for val in row.find_all('td')]
    for row in table.find_all('tr')
]
df2 = pd.DataFrame(rows, columns=headers)

b'<table class="wikitable sortable static-row-numbers plainrowheaders srn-white-background" border="1" style="text-align:right;">\n'


In [17]:
# Preview the first rows of the raw (not yet cleaned) expenditure table.
df2.head()

Unnamed: 0,Country,2017,2018,2019
0,,,,
1,Australia *\n,"4,711\n","4,965\n","5,187\n"
2,Austria *\n,"5,360\n","5,538\n","5,851\n"
3,Belgium *\n,"5,014\n","5,103\n","5,428\n"
4,Canada *\n,"5,155\n","5,287\n","5,418\n"


# Additional Preprocessing Steps

In [18]:
def preproc(dat):
    """Clean a scraped table and convert numeric-looking columns to numbers.

    Steps, in order:
      1. drop rows in which every value is missing;
      2. remove newline characters from the column labels and the cell text;
      3. strip a trailing " *" marker from cell text;
      4. remove every comma from cell text (e.g. "4,711" -> "4711");
      5. set any cell containing a stand-alone single ASCII letter to NaN;
      6. remove one leading whitespace character from cell text;
      7. convert each column with ``pd.to_numeric`` when the whole column
         parses, otherwise leave that column unchanged.

    Parameters
    ----------
    dat : pandas.DataFrame
        Raw table whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; ``dat`` itself is left unmodified.
    """
    # dropna() returns a new frame, so everything below works on a copy.
    out = dat.dropna(axis=0, how='all')
    out.columns = out.columns.str.replace("\n", "", regex=False)

    out = out.replace(["\n"], [""], regex=True)
    out = out.replace([r"\s\*$"], [""], regex=True)
    # NOTE: this removes commas everywhere, not only thousands separators.
    out = out.replace([","], [""], regex=True)
    # A non-string replacement value swaps out the whole matching cell.
    out = out.replace(r"\b[a-zA-Z]\b", np.nan, regex=True)
    out = out.replace([r"^\s"], [""], regex=True)

    def _to_numeric_or_keep(col):
        # Explicit equivalent of the deprecated errors='ignore': return the
        # column untouched when it cannot be parsed as numbers.
        try:
            return pd.to_numeric(col)
        except (ValueError, TypeError):
            return col

    return out.apply(_to_numeric_or_keep)

In [19]:
# Clean both scraped tables with the same routine (ranking first, then
# expenditure) and rebind the names to the cleaned results.
df1, df2 = preproc(df1), preproc(df2)

In [20]:
# Total number of missing cells in each cleaned frame (df1, then df2).
for frame in (df1, df2):
    print(frame.isnull().sum().sum())

0
0


In [21]:
# Drop every row that still has at least one missing value. Reassigning
# (instead of inplace=True) avoids mutating the frame shown by earlier cells.
df1 = df1.dropna(axis=0, how='any')

In [22]:
# Column dtypes of the ranking table after cleaning.
df1.dtypes

WorldRank                          float64
EURank                             float64
Country                             object
Life expectancyat birth (years)    float64
dtype: object

In [23]:
# Column dtypes of the expenditure table after cleaning.
df2.dtypes

Country    object
2017        int64
2018        int64
2019        int64
dtype: object

In [24]:
# Final column labels, assigned by position (the list length must equal the
# number of columns in each frame).
# NOTE(review): 'EYRank' looks like a typo for 'EURank' -- left unchanged
# here because later cells may refer to the column by this name.
ranking_columns = ['WorldRank', 'EYRank', 'Country', 'Life expectancy in (years)']
expenditure_columns = ['Country', '2017', '2018', '2019']

df1.columns = ranking_columns
df2.columns = expenditure_columns

# Analyzing Final Tables

In [25]:
# Preview the cleaned and relabelled ranking table.
df1.head()

Unnamed: 0,WorldRank,EYRank,Country,Life expectancy in (years)
1,5.0,1.0,Spain,83.4
2,6.0,2.0,Italy,83.4
3,11.0,3.0,Sweden,82.7
4,12.0,4.0,France,82.5
5,13.0,5.0,Malta,82.4


In [26]:
# Preview the cleaned and relabelled expenditure table.
df2.head()

Unnamed: 0,Country,2017,2018,2019
1,Australia,4711,4965,5187
2,Austria,5360,5538,5851
3,Belgium,5014,5103,5428
4,Canada,5155,5287,5418
5,Chile,2030,2126,2159


# Merging different Data

In [27]:
# (rows, columns) of each table before merging: df1, then df2.
for frame in (df1, df2):
    print(frame.shape)

(9, 4)
(37, 4)


In [28]:
# Left join on 'Country': every ranking row is kept, and countries with no
# match in the expenditure table get NaN in the year columns.
df1.merge(df2, how='left', on='Country').head()

Unnamed: 0,WorldRank,EYRank,Country,Life expectancy in (years),2017,2018,2019
0,5.0,1.0,Spain,83.4,3322.0,3430.0,3616.0
1,6.0,2.0,Italy,83.4,3399.0,3485.0,3649.0
2,11.0,3.0,Sweden,82.7,5318.0,5434.0,5782.0
3,12.0,4.0,France,82.5,5057.0,5154.0,5376.0
4,13.0,5.0,Malta,82.4,,,
