## Getting data from web sources: web scraping with BeautifulSoup

In [None]:
## import some stuff

import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from IPython.core.display import clear_output
import requests
import pprint
pp = pprint.PrettyPrinter(indent=2)

In [None]:
url = 'https://en.wikipedia.org/wiki/List_of_Solar_System_objects_by_size'
# requests applies no timeout by default; without one a stalled connection
# would block this cell indefinitely.
r = requests.get(url, timeout=30)

In [None]:
# HTTP status code of the response fetched above (200 means success).
r.status_code

In [None]:
# All response headers sent back by the server.
r.headers

In [None]:
# Look up a single header: the media type the server declared for the body.
r.headers['Content-Type']

In [None]:
# Response body decoded to a string (the page markup that gets parsed next).
r.text

In [None]:
# Build a navigable tree from the downloaded markup using Python's built-in
# HTML parser, then echo the tree as the cell output.
page_html = BeautifulSoup(r.text, features='html.parser')
page_html

In [None]:
# Keep only the <table> elements carrying the 'wikitable' CSS class and show
# how many were found.
tables = page_html.find_all('table', attrs={'class': 'wikitable'})
len(tables)

In [None]:
# Accumulates one DataFrame per scraped table (filled by the loop below).
dfs = []

def parse_header(table_rows):
    """Build the list of column names from a table's first header row.

    The first row in ``table_rows`` that contains a ``<th>`` cell is treated
    as the header. A header cell with ``colspan=N`` yields N names: the
    cell's stripped text, followed by that text suffixed with ``_2`` ...
    ``_N``, so the result has one entry per spanned column.

    Args:
        table_rows: iterable of row elements (e.g. BeautifulSoup ``<tr>``
            tags) exposing ``find`` and ``find_all``.

    Returns:
        A list of column-name strings; an empty list when no row contains a
        ``<th>`` cell (including when ``table_rows`` is empty).
    """
    # Stop at the first header row instead of indexing a possibly empty list.
    header_row = next((row for row in table_rows if row.find('th')), None)
    if header_row is None:
        return []

    headers = []
    for h_el in header_row.find_all('th'):
        label = h_el.text.strip()
        # Repeat the label once per spanned column, numbering the repeats.
        for i in range(int(h_el.get('colspan', 1))):
            headers.append(label if i == 0 else '{}_{}'.format(label, i + 1))
    return headers

def parse_body(table_rows, n_columns=None):
    """Extract the data rows of a table as a list of equal-width lists.

    A row counts as a data row when it contains at least one ``<td>`` cell;
    rows without any ``<td>`` (such as header rows) are skipped. Every
    ``<th>``/``<td>`` cell of a data row contributes its stripped text,
    repeated ``colspan`` times so values stay aligned with the expanded
    column names. Each row is then padded with ``None`` or truncated to
    ``n_columns`` so the result can be handed straight to
    ``pd.DataFrame(data=..., columns=...)``.

    ``rowspan`` is not expanded: a cell spanning several rows only appears in
    the first of them, and the following rows are padded on the right.

    Args:
        table_rows: iterable of row elements (e.g. BeautifulSoup ``<tr>``
            tags) exposing ``find`` and ``find_all``.
        n_columns: width every returned row is normalised to. When omitted
            it defaults to ``len(parse_header(table_rows))``.

    Returns:
        A list with one list per data row, each of length ``n_columns`` and
        holding ``str`` values or ``None`` padding.
    """
    # Materialise once: the rows may be scanned twice (header, then body).
    table_rows = list(table_rows)
    if n_columns is None:
        n_columns = len(parse_header(table_rows))

    body = []
    for row in table_rows:
        if row.find('td') is None:
            continue  # header-only (or empty) row
        values = []
        for cell in row.find_all(['th', 'td']):
            values.extend([cell.text.strip()] * int(cell.get('colspan', 1)))
        # Force a uniform width; DataFrame rejects rows that disagree with
        # the number of column names.
        values = values[:n_columns] + [None] * (n_columns - len(values))
        body.append(values)
    return body
        
# Turn every scraped table into a DataFrame: column names come from the header
# row, cell data from parse_body.
for table in tables:
    rows = table.find_all('tr')
    # Parse the header once and reuse the result for the DataFrame.
    columns = parse_header(rows)
    dfs.append(pd.DataFrame(data=parse_body(rows), columns=columns))

# Preview the first rows of each resulting DataFrame.
for df in dfs:
    print(df.head())
    
    

## Getting data from web sources: Web API

In [None]:
BASE_URL = 'https://api.le-systeme-solaire.net/rest'
url = BASE_URL + '/bodies'
# NOTE(review): 'page' presumably means "page number, page size" - confirm
# the exact syntax against the API documentation.
# requests applies no timeout by default, so bound the wait explicitly.
res = requests.get(url, params={'page': '1, 100'}, timeout=30)
# Surface HTTP errors here instead of failing later on a missing 'bodies' key.
res.raise_for_status()
res_payload = res.json()

In [None]:
# Number of records under the 'bodies' key of the decoded response.
len(res_payload['bodies'])

In [None]:
# English name of every record (None where the key is absent).
[entry.get('englishName') for entry in res_payload['bodies']]

In [None]:
import json
from io import StringIO

# Serialise the records back to JSON so read_json can apply its type
# inference. The text is wrapped in StringIO because passing a literal JSON
# string to read_json is deprecated in recent pandas releases.
sol_syst_bodies = pd.read_json(
    StringIO(json.dumps(res_payload['bodies'])), orient='records'
)
sol_syst_bodies.head()

In [None]:
# Inspect the 'mass' column as loaded, before it is converted below.
sol_syst_bodies['mass'] 

In [None]:
# Collapse each {'massValue', 'massExponent'} dict into the single number
# massValue * 10**massExponent. Anything that is not a dict is left as it is,
# so running this cell a second time changes nothing.
sol_syst_bodies['mass'] = sol_syst_bodies['mass'].apply(
    lambda mass: (
        mass
        if not isinstance(mass, dict)
        else mass['massValue'] * 10 ** mass['massExponent']
    )
)

In [None]:
# Summary statistics for the DataFrame's columns.
sol_syst_bodies.describe()