### This notebook demonstrates web scraping in Python.

In [None]:
# Load the necessary libraries. BeautifulSoup is a common library used for web scraping.

In [10]:
import pandas as pd
import numpy as np
import requests
import json
import matplotlib.pyplot as plt
import html5lib
from bs4 import BeautifulSoup

#### The first section demonstrates a sample of a website's HTML content, its structure, and how Beautiful Soup can be used to read the HTML.

In [20]:
# Miniature HTML page used throughout this section: three <h3> player names
# (the first wrapped in <b id='boldest'>), each followed by a <p> salary line.
# Adjacent string literals are joined by Python; the trailing space on each
# piece is the single space that separates the fragments in the document.
html = (
    "<!DOCTYPE html><html><head><title>Page Title</title></head><body><h3> "
    "<b id='boldest'>Lebron James</b></h3><p> Salary: $ 92,000,000 </p> "
    "<h3>Stephen Curry</h3><p> Salary: $85,000,000</p> "
    "<h3>Kevin Durant</h3><p> Salary: $73,200,000</p></body></html>"
)

In [21]:
# Instantiate the html document as soup
soup = BeautifulSoup(html, 'html5lib')

In [22]:
# Using prettify to view the html structure
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000,000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200,000
  </p>
 </body>
</html>



In [23]:
# Assign tag object to the html title and determine type
tag_object = soup.title
print("tag object:", tag_object)

tag object: <title>Page Title</title>


In [24]:
print("tag object type:", type(tag_object))

tag object type: <class 'bs4.element.Tag'>


In [None]:
# Assign another tag object, this time to the first <h3> element

In [25]:
tag_object = soup.h3
tag_object

<h3> <b id="boldest">Lebron James</b></h3>

In [None]:
# get child of the tag object

In [26]:
tag_child = tag_object.b
tag_child

<b id="boldest">Lebron James</b>

In [None]:
# Get the parent of the child object - it is the same <h3> element as tag_object

In [27]:
parent_tag = tag_child.parent
parent_tag

<h3> <b id="boldest">Lebron James</b></h3>

In [28]:
tag_object.parent

<body><h3> <b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p> <h3>Stephen Curry</h3><p> Salary: $85,000,000</p> <h3>Kevin Durant</h3><p> Salary: $73,200,000</p></body>

In [29]:
sibling_1 = tag_object.next_sibling
sibling_1

<p> Salary: $ 92,000,000 </p>

In [30]:
sibling_2 = sibling_1.next_sibling
sibling_2

' '

In [31]:
sibling_2.next_sibling.next_sibling # salary of steph curry 

<p> Salary: $85,000,000</p>

In [32]:
# id is an attribute of tag b 

In [33]:
tag_child.attrs

{'id': 'boldest'}

In [34]:
tag_child['id']

'boldest'

In [35]:
tag_child.get('id') # We can also obtain the content of the attribute of the tag using the Python get() method.

'boldest'

In [36]:
tag_string = tag_child.string
tag_string

'Lebron James'

In [37]:
unicode_string = str(tag_string) ## convert beautifulsoup string to Python string
unicode_string

'Lebron James'

#### The second section demonstrates a sample HTML table structure and how Beautiful Soup can be used to navigate it.

In [43]:
# Instantiate the html table

In [44]:
# Sample launch table: one header row followed by three data rows.
# NOTE: both Florida links end with "<a>" instead of "</a>". The literal is
# kept exactly as written; once parsed with html5lib the stray tag shows up as
# an extra, href-less <a> element in each of those cells.
# Adjacent string literals are joined by Python; the trailing space on each
# piece is the single space that separates the fragments in the markup.
table = (
    "<table><tr><td id='flight'>Flight No</td><td>Launch site</td> "
    "<td>Payload mass</td></tr><tr> <td>1</td> "
    "<td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td> "
    "<td>300 kg</td></tr><tr><td>2</td> "
    "<td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td> "
    "<td>94 kg</td></tr><tr><td>3</td> "
    "<td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td> "
    "<td>80 kg</td></tr></table>"
)

In [45]:
table_bs = BeautifulSoup(table, 'html5lib')

In [46]:
table_bs

<html><head></head><body><table><tbody><tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr><tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr><tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr></tbody></table></body></html>

In [47]:
# Find all <tr> tags (each <tr> is one row of an HTML table)

In [48]:
table_rows = table_bs.find_all('tr')
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>,
 <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>,
 <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>]

In [49]:
first_row = table_rows[0]
first_row

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>

In [50]:
# First cell of the row that follows the header row.
# find_next_sibling('tr') skips any text nodes and returns the next <tr>
# element. A bare .next_sibling returns whatever node comes next, which is a
# whitespace string whenever the rows are separated by spaces or newlines.
first_row.find_next_sibling('tr').td

<td>1</td>

In [53]:
# print all rows to view the table structure in row form
for i, row in enumerate(table_rows):
    print("row", i, "is", row)

row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
row 1 is <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>
row 2 is <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>
row 3 is <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>


In [54]:
# For every table row, list each data cell (<td>) with its column position.
for i, row in enumerate(table_rows):
    print("row", i)
    for j, cell in enumerate(row.find_all('td')):
        print('column', j, "cell", cell)

row 0
column 0 cell <td id="flight">Flight No</td>
column 1 cell <td>Launch site</td>
column 2 cell <td>Payload mass</td>
row 1
column 0 cell <td>1</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>
column 2 cell <td>300 kg</td>
row 2
column 0 cell <td>2</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
column 2 cell <td>94 kg</td>
row 3
column 0 cell <td>3</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>
column 2 cell <td>80 kg</td>


In [55]:
list_input = table_bs.find_all(["tr", "td"])
list_input

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>,
 <td>300 kg</td>,
 <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>,
 <td>80 kg</td>]

In [57]:
# find all references to Florida wiki

In [58]:
list_input = table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [59]:
table_bs.find_all('a', href=True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [60]:
table_bs.find_all('a', href=False)

[<a></a>, <a> </a>]

In [61]:
soup.find_all(id="boldest")

[<b id="boldest">Lebron James</b>]

In [62]:
table_bs.find_all(string="Florida")

['Florida', 'Florida']

#### This section applies the method above to scrape data from the web. 

In [78]:
# define url and get request

In [77]:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [79]:
# Download the page. The timeout stops the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception instead of letting an error page be parsed as
# if it were the expected document.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text  # decoded body of the response as a string

In [80]:
soup = BeautifulSoup(data, "html5lib") # instantiate the soup object

In [81]:
# find a html table in the web page
table = soup.find('table')  # in html table is represented by the tag <table>

In [82]:
# Walk every row (<tr>) of the table and print "<color name>---><hex code>".
for row in table.find_all('tr'):  # in html a table row is represented by the tag <tr>
    # Collect the data cells (<td>) of this row.
    cols = row.find_all('td')
    # Rows with fewer than four <td> cells (for example rows made only of <th>
    # cells) have no name/code columns; indexing them would raise IndexError.
    if len(cols) < 4:
        continue
    # get_text() joins all text inside the cell, whereas .string is None as
    # soon as a cell holds more than one child node. strip=True removes
    # surrounding whitespace from each text fragment.
    color_name = cols[2].get_text(strip=True)  # third column: color name
    color_code = cols[3].get_text(strip=True)  # fourth column: hex code
    print("{}--->{}".format(color_name, color_code))

Color Name--->Hex Code#RRGGBB
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF


In [83]:
# The data scraped above can now be loaded into a pandas DataFrame and processed as needed.

In [85]:
# The approach below uses pandas.read_html instead, which parses the page's tables with the lxml library

In [86]:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [87]:
import lxml ## used for reading url in Pandas

In [88]:
tables = pd.read_html(url)

In [89]:
print(tables)

[         0      1               2                 3                     4
0   Number  Color      Color Name  Hex Code #RRGGBB  Decimal Code (R,G,B)
1        1    NaN     lightsalmon           #FFA07A      rgb(255,160,122)
2        2    NaN          salmon           #FA8072      rgb(250,128,114)
3        3    NaN      darksalmon           #E9967A      rgb(233,150,122)
4        4    NaN      lightcoral           #F08080      rgb(240,128,128)
5        5    NaN           coral           #FF7F50       rgb(255,127,80)
6        6    NaN          tomato           #FF6347        rgb(255,99,71)
7        7    NaN       orangered           #FF4500         rgb(255,69,0)
8        8    NaN            gold           #FFD700        rgb(255,215,0)
9        9    NaN          orange           #FFA500        rgb(255,165,0)
10      10    NaN      darkorange           #FF8C00        rgb(255,140,0)
11      11    NaN     lightyellow           #FFFFE0      rgb(255,255,224)
12      12    NaN    lemonchiffon    

In [90]:
# get the first element of the list only
tables[0]

Unnamed: 0,0,1,2,3,4
0,Number,Color,Color Name,Hex Code #RRGGBB,"Decimal Code (R,G,B)"
1,1,,lightsalmon,#FFA07A,"rgb(255,160,122)"
2,2,,salmon,#FA8072,"rgb(250,128,114)"
3,3,,darksalmon,#E9967A,"rgb(233,150,122)"
4,4,,lightcoral,#F08080,"rgb(240,128,128)"
5,5,,coral,#FF7F50,"rgb(255,127,80)"
6,6,,tomato,#FF6347,"rgb(255,99,71)"
7,7,,orangered,#FF4500,"rgb(255,69,0)"
8,8,,gold,#FFD700,"rgb(255,215,0)"
9,9,,orange,#FFA500,"rgb(255,165,0)"


### Author: Paul John Julongbayan