# Data Scraping with BeautifulSoup

This notebook demonstrates how to scrape HTML tables from web pages using [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) and load them into pandas DataFrames.

*References:*

This notebook is inspired by the awesome work done as part of [COVID19 Dashboards](https://covid19dashboards.com/).


In [79]:
import io
import os
from urllib import request

import bs4
import numpy as np
import pandas as pd

# Directory that holds the cached CSV files. When this code runs as a script
# (``__file__`` is defined) the directory sits next to the script; otherwise
# (e.g. inside a notebook kernel) it is resolved relative to the working directory.
if '__file__' in locals():
    data_files = os.path.join(os.path.dirname(__file__), 'data_files')
else:
    data_files = 'data_files'

In [162]:
class Scraper:
    """Base class that scrapes an HTML table from a web page and caches it as CSV.

    Subclasses are expected to define two class attributes -- ``page`` (the URL
    to scrape) and ``csv_path`` (where the cleaned table is cached) -- and to
    override :meth:`parse` so that it writes the cleaned frame to ``csv_path``.
    """

    def scrape(self, page):
        """Download ``page`` and return its first HTML table as a DataFrame.

        Args:
            page: URL of the page to download.

        Returns:
            pandas.DataFrame: the first table ``pd.read_html`` extracts from
            the ``<table>`` elements found on the page.

        Raises:
            ValueError: raised by ``pd.read_html`` when no table can be parsed.
        """
        # A browser-like User-Agent is sent with the request
        # (presumably because the target sites reject the urllib default -- confirm).
        req = request.Request(page, headers={'User-Agent': 'Mozilla/5.0'})
        # Context manager guarantees the HTTP response is closed.
        with request.urlopen(req) as response:
            source = response.read()
        soup = bs4.BeautifulSoup(source, 'lxml')
        tables = soup.find_all('table')
        # read_html expects a file-like object; passing a literal HTML string
        # is deprecated in recent pandas releases.
        return pd.read_html(io.StringIO(str(tables)))[0]

    def parse(self, df):
        """Clean ``df`` and persist it to ``self.csv_path``.

        The base implementation does nothing; subclasses override it.
        """

    def load(self, **kwargs):
        """Return the cached table, scraping and parsing it first when needed.

        Keyword Args:
            refresh (bool): when true, re-scrape even if the CSV cache already
                exists. Defaults to ``False``.

        Returns:
            pandas.DataFrame: the contents of ``self.csv_path``.
        """
        refresh = kwargs.get('refresh', False)
        # Use ``self`` here: there is no ``cls`` in an instance method.
        if refresh or not os.path.exists(self.csv_path):
            df = self.scrape(self.page)
            # Make sure the cache directory exists before parse() writes to it.
            cache_dir = os.path.dirname(self.csv_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            self.parse(df)
        return pd.read_csv(self.csv_path)
    
class WorldPopulation(Scraper):
    """Scraper for the worldometers population-by-country table."""

    # Location of the cached, cleaned table.
    csv_path = os.path.join(data_files, 'world_population.csv')
    # Page the table is scraped from.
    page = 'https://www.worldometers.info/world-population/population-by-country/'

    def parse(self, df):
        """Select and rename the columns of interest, then write the CSV cache.

        Args:
            df: raw table as returned by ``scrape``; it must contain the
                source columns listed in ``rename_map``.

        The ``urban_ratio`` column is converted from a percentage string to a
        fraction (leading digits divided by 100); values without leading
        digits end up as NaN.
        """
        rename_map = {
            'Country (or dependency)': 'country',
            'Population (2020)': 'population',
            'Land Area (Km²)': 'area',
            'Urban Pop %': 'urban_ratio'
        }
        # Index with a real list and take an explicit copy so the column
        # assignment below modifies an independent frame instead of triggering
        # SettingWithCopyWarning.
        df_clean = df.rename(columns=rename_map)[list(rename_map.values())].copy()
        # astype(str) keeps this working even if the column was parsed as numeric.
        leading_digits = df_clean['urban_ratio'].astype(str).str.extract(r'(\d*)')[0]
        df_clean['urban_ratio'] = pd.to_numeric(leading_digits) / 100
        df_clean.to_csv(self.csv_path, index=False)
    
    

In [163]:
# refresh=True forces a new scrape even when a cached CSV already exists.
df = WorldPopulation().load(refresh=True)

In [164]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,country,population,area,urban_ratio
0,China,1439323776,9388211,0.61
1,India,1380004385,2973190,0.35
2,United States,331002651,9147420,0.83
3,Indonesia,273523615,1811570,0.56
4,Pakistan,220892340,770880,0.35


In [165]:
# The fifteen rows with the largest population, in descending order.
df.sort_values(by='population', ascending=False).head(15)

Unnamed: 0,country,population,area,urban_ratio
0,China,1439323776,9388211,0.61
1,India,1380004385,2973190,0.35
2,United States,331002651,9147420,0.83
3,Indonesia,273523615,1811570,0.56
4,Pakistan,220892340,770880,0.35
5,Brazil,212559417,8358140,0.88
6,Nigeria,206139589,910770,0.52
7,Bangladesh,164689383,130170,0.39
8,Russia,145934462,16376870,0.74
9,Mexico,128932753,1943950,0.84


In [180]:
class CoronaVirus(Scraper):
    """Scraper for the worldometers coronavirus table."""

    # Location of the cached, cleaned table.
    csv_path = os.path.join(data_files, 'coronavirus.csv')
    # Page the table is scraped from.
    page = 'https://www.worldometers.info/coronavirus/'

    def atof(self, s):
        """Convert a Series of formatted counts such as ``'+1,234'`` to numbers.

        Args:
            s: pandas Series holding strings with optional ``+`` signs and
                ``,`` separators, or already-numeric values.

        Returns:
            pandas.Series: numeric values; a Series that is already numeric
            is returned unchanged.
        """
        # read_html may already have parsed the column as numbers, in which
        # case there is no ``.str`` accessor to call.
        if pd.api.types.is_numeric_dtype(s):
            return s
        # regex=False: '+' is a regex metacharacter, so match it literally
        # regardless of the pandas version's default.
        stripped = s.str.replace('+', '', regex=False).str.replace(',', '', regex=False)
        return pd.to_numeric(stripped)

    def parse(self, df):
        """Normalise the raw table and write it to the CSV cache.

        Args:
            df: raw table as returned by ``scrape``; it must contain the
                ``NewCases`` and ``NewDeaths`` columns and a country column
                named ``Country,Other`` or ``Country/Other``.

        The ``Total:`` summary row is dropped before saving.
        """
        rename_map = {
            'Country,Other': 'Country/Other'
        }
        df_clean = df.rename(columns=rename_map)
        for column in ('NewCases', 'NewDeaths'):
            df_clean[column] = self.atof(df_clean[column])
        df_clean = df_clean[df_clean['Country/Other'] != 'Total:']
        df_clean.to_csv(self.csv_path, index=False)


In [181]:
# refresh=True forces a new scrape even when a cached CSV already exists.
df = CoronaVirus().load(refresh=True)

In [182]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Country/Other,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop
0,World,1421442,75438.0,81659.0,7005.0,301520.0,1038263,47915.0,182.0,10.5,,
1,USA,393782,26778.0,12692.0,1821.0,21561.0,359529,9169.0,1190.0,38.0,2058737.0,6220.0
2,Spain,140617,3942.0,13912.0,571.0,43208.0,83497,7069.0,3008.0,298.0,355000.0,7593.0
3,Italy,135586,3039.0,17127.0,604.0,24392.0,94067,3792.0,2243.0,283.0,755445.0,12495.0
4,France,109069,11059.0,10328.0,1417.0,19337.0,79404,7131.0,1671.0,158.0,224254.0,3436.0


In [184]:
# The five rows with the highest TotalCases, in descending order.
df.sort_values(by='TotalCases', ascending=False).head(5)

Unnamed: 0,Country/Other,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop
0,World,1421442,75438.0,81659.0,7005.0,301520.0,1038263,47915.0,182.0,10.5,,
1,USA,393782,26778.0,12692.0,1821.0,21561.0,359529,9169.0,1190.0,38.0,2058737.0,6220.0
2,Spain,140617,3942.0,13912.0,571.0,43208.0,83497,7069.0,3008.0,298.0,355000.0,7593.0
3,Italy,135586,3039.0,17127.0,604.0,24392.0,94067,3792.0,2243.0,283.0,755445.0,12495.0
4,France,109069,11059.0,10328.0,1417.0,19337.0,79404,7131.0,1671.0,158.0,224254.0,3436.0


# References

https://www.crummy.com/software/BeautifulSoup/

https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html

https://stackoverflow.com/questions/41495052/scrapy-reactor-not-restartable