This Jupyter notebook focuses on Web Scarpping  


In [110]:
import requests # helps to download a web page
from bs4 import BeautifulSoup # helps in web scrapping

BeautifulSoup is a Python library for pulling data out of HTML and XML files. This is accomplished by representing the HTML as a set of objects with methods used to parse the HTML. We can navigate the HTML as a Tree and filter out what we are looking for.

In [111]:
# storing it as a string 
html = " <!DOCTYPE html><html><head><title>Page Title</title></head><body><h3><b id='boldest'>Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>"

In [112]:
# To parse the documnet pass it through BeautifulSoup constructor. The BeautifulSoup object represents the document as a nested data structure
soup = BeautifulSoup(html, 'html5lib')
soup

<!DOCTYPE html>
<html><head><title>Page Title</title></head><body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>

In [113]:
# to display html as a nested structure
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000, 000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200, 000
  </p>
 </body>
</html>



In [114]:
# Lets say we want to get the title of page and the name of the top paid player
tag_object = (soup.title)
print("tag object type:",type(tag_object))

tag object type: <class 'bs4.element.Tag'>


In [115]:
tag_object = soup.h3
tag_object

<h3><b id="boldest">Lebron James</b></h3>

In [116]:
# Since tag object is a tree of objects. We can navigate down the branch as follows
tag_child = tag_object.b
tag_child

<b id="boldest">Lebron James</b>

In [117]:
# to access the parent
parent_tag = tag_child.parent
parent_tag

<h3><b id="boldest">Lebron James</b></h3>

In [118]:
tag_object.parent

<body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body>

In [119]:
sibling_tag = tag_object.next_sibling
sibling_tag

<p> Salary: $ 92,000,000 </p>

In [120]:
sibling_2=sibling_tag.next_sibling
sibling_2

<h3> Stephen Curry</h3>

In [121]:
# salary of Steph curry using next sibling
salary_st = sibling_2.next_sibling
salary_st

<p> Salary: $85,000, 000 </p>

In [122]:
# if a tag has attributes, we can access the attributes by treating the tag like dictionary
tag_child['id']

'boldest'

In [123]:
tag_child.attrs

{'id': 'boldest'}

In [124]:
tag_child.get('id')

'boldest'

In [125]:
# NAVIGABLE STRING
# A string corresponds to a bit of text or context within a tag. 
# BeautifulSoup uses NavigableString classto contain this text
tag_string = tag_child.string
tag_string

'Lebron James'

In [126]:
type(tag_string)

bs4.element.NavigableString

In [127]:
# A navigable string is similar to Python String or unicode string. The only difference is that it supports some of the features of BS library.
unicode_string = str(tag_string)
type(unicode_string)

str

In [128]:
# We can store it in variable table
table = "%%html<table><tr><td id='flight' >Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td><td>300 kg</td></tr><tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td><td>80 kg</td></tr></table>"

In [129]:
table_bs = BeautifulSoup(table, 'html5lib')
table_bs

<html><head></head><body>%%html<table><tbody><tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td><td>300 kg</td></tr><tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr></tbody></table></body></html>

In [130]:
# find_all()
# This looks through all tags descendants and retrieves all that matches the filter

In [131]:
table_rows=table_bs.find_all('tr')
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td><td>300 kg</td></tr>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>]

In [132]:
first_row =table_rows[0]
first_row

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>

In [133]:
print(type(first_row))

<class 'bs4.element.Tag'>


In [134]:
first_row.td

<td id="flight">Flight No</td>

In [135]:
# If we iterate through the list, each element corresponds to row in the table
for i, row in enumerate(table_rows):
    print("row",i,"is",row)

row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
row 1 is <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td><td>300 kg</td></tr>
row 2 is <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>
row 3 is <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>


In [136]:
# Now as row is a cell object, we can simply apply method find_all to it and extract table cells in object cells using tag td.
for i, row in enumerate(table_rows):
    print('row', i)
    cells = row.find_all('td')
    for j,cell in enumerate(cells):
        print('colunm',j,"cell",cell)

row 0
colunm 0 cell <td id="flight">Flight No</td>
colunm 1 cell <td>Launch site</td>
colunm 2 cell <td>Payload mass</td>
row 1
colunm 0 cell <td>1</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td>
colunm 2 cell <td>300 kg</td>
row 2
colunm 0 cell <td>2</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
colunm 2 cell <td>94 kg</td>
row 3
colunm 0 cell <td>3</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>
colunm 2 cell <td>80 kg</td>


In [137]:
# if we use a list we can match against any item in that list
list_input=table_bs .find_all(name=["tr", "td"])
list_input

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td><td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td>,
 <td>300 kg</td>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>,
 <td>80 kg</td>]

In [138]:
# ATTRIBUTES
# If the argument is not recoganized it will be turned into a filter o tag's attribute.

In [139]:
table_bs.find_all(id="flight")

[<td id="flight">Flight No</td>]

In [140]:
list_input=table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [141]:
table_bs.find_all(href=True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [142]:
# Now find all elements with href value
table_bs.find_all(href = False)

[<html><head></head><body>%%html<table><tbody><tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td><td>300 kg</td></tr><tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr></tbody></table></body></html>,
 <head></head>,
 <body>%%html<table><tbody><tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td><td>300 kg</td></tr><tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr></tbody></table></body>,
 <table><tbody><tr><td id="flight">Flight No</td><td>Launch site</td> <td>

In [143]:
# Using soup object find the element with id attribute boldest
soup.find_all(id ='boldest')

[<b id="boldest">Lebron James</b>]

In [144]:
# With string you can search for strings instead of tags
table_bs.find_all(string="Florida")

['Florida', 'Florida']

### FIND METHOD
### Previously we noted that find_all() method scans the entire document looking for results
### It is usefull to use find() methos if looking for one element

### consider the following two tables

%%html
<h3>Rocket Launch </h3>

<p>
<table class='rocket'>
  <tr>
    <td>Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
  </tr>
  <tr>
    <td>1</td>
    <td>Florida</td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td>Texas</td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td>Florida </td>
    <td>80 kg</td>
  </tr>
</table>
</p>
<p>

<h3>Pizza Party  </h3>
  
    
<table class='pizza'>
  <tr>
    <td>Pizza Place</td>
    <td>Orders</td> 
    <td>Slices </td>
   </tr>
  <tr>
    <td>Domino's Pizza</td>
    <td>10</td>
    <td>100</td>
  </tr>
  <tr>
    <td>Little Caesars</td>
    <td>12</td>
    <td >144 </td>
  </tr>
  <tr>
    <td>Papa John's </td>
    <td>15 </td>
    <td>165</td>
  </tr>

In [145]:
two_tables="<h3>Rocket Launch </h3><p><table class='rocket'><tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr><td>1</td><td>Florida</td><td>300 kg</td></tr><tr><td>2</td><td>Texas</td><td>94 kg</td></tr><tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table></p><p><h3>Pizza Party  </h3><table class='pizza'><tr><td>Pizza Place</td><td>Orders</td> <td>Slices </td></tr><tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr><tr><td>Little Caesars</td><td>12</td><td >144 </td></tr><tr><td>Papa John's </td><td>15 </td><td>165</td></tr>"

In [146]:
two_tables_bs= BeautifulSoup(two_tables, 'html.parser')

In [147]:
two_tables_bs.find("table")

<table class="rocket"><tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr><td>1</td><td>Florida</td><td>300 kg</td></tr><tr><td>2</td><td>Texas</td><td>94 kg</td></tr><tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table>

In [148]:
two_tables_bs.find("table",class_='pizza')

<table class="pizza"><tr><td>Pizza Place</td><td>Orders</td> <td>Slices </td></tr><tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr><tr><td>Little Caesars</td><td>12</td><td>144 </td></tr><tr><td>Papa John's </td><td>15 </td><td>165</td></tr></table>

### EXERCISE  
Downloading and Scrapping the contents of a webpage

In [149]:
data = requests.get("http://www.ibm.com").text
data

'\n<!DOCTYPE HTML>\n<html lang="en-ca">\n<head>\n    \n    \n    \n    \n    <meta charset="UTF-8"/>\n    <meta name="languageCode" content="en"/>\n    <meta name="countryCode" content="ca"/>\n    <meta name="searchTitle" content="IBM - Canada"/>\n    <meta name="focusArea" content="No Contact Module"/>\n    <title>IBM - Canada</title>\n    <link rel="icon" href="/content/dam/adobe-cms/default-images/favicon.svg"/>\n    \n    <meta name="description" content="For more than a century, IBM has been a global technology innovator, leading advances in AI, automation and hybrid cloud solutions that help businesses grow."/>\n    <meta name="template" content="full-width-layout"/>\n    <meta name="viewport" content="width=device-width, initial-scale=1"/>\n    <meta name="robots" content="index, follow"/>\n    \n    <link rel="canonical" href="https://www.ibm.com/ca-en"/>\n    <style id="anti-flicker-style">\n        :not(:defined) {\n          visibility: hidden;\n        }\n    </style>\n    

In [150]:
soup = BeautifulSoup(data,"html5lib")
soup

<!DOCTYPE html>
<html lang="en-ca"><head>
    
    
    
    
    <meta charset="utf-8"/>
    <meta content="en" name="languageCode"/>
    <meta content="ca" name="countryCode"/>
    <meta content="IBM - Canada" name="searchTitle"/>
    <meta content="No Contact Module" name="focusArea"/>
    <title>IBM - Canada</title>
    <link href="/content/dam/adobe-cms/default-images/favicon.svg" rel="icon"/>
    
    <meta content="For more than a century, IBM has been a global technology innovator, leading advances in AI, automation and hybrid cloud solutions that help businesses grow." name="description"/>
    <meta content="full-width-layout" name="template"/>
    <meta content="width=device-width, initial-scale=1" name="viewport"/>
    <meta content="index, follow" name="robots"/>
    
    <link href="https://www.ibm.com/ca-en" rel="canonical"/>
    <style id="anti-flicker-style">
        :not(:defined) {
          visibility: hidden;
        }
    </style>
    <script id="anti-flicker-scrip

In [151]:
# SCRAP ALL LINKS
for link in soup.find_all('a', href = True):
    print(link.get('href'))

https://www.ibm.com/ca-en/cloud?lnk=hpUSbt1


In [160]:
# Scrap all image tags
for link in soup.find_all('img'):# in html image is represented by the tag <img>
    print(link)
    print(link.get('src'))



### SCARP DATA FROM HTML TABLES

In [161]:
#Given a URL having tables with colors and its data in it 333 ROws and 5 columns
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [163]:
data = requests.get(url).text
data

'<html>\n   <body>\n      <h1>Partital List  of HTML5 Supported Colors</h1>\n<table border ="1" class="main-table">\n   <tr>\n      <td>Number </td>\n      <td>Color</td>\n      <td>Color Name</td>\n      <td>Hex Code<br>#RRGGBB</td>\n      <td>Decimal Code<br>(R,G,B)</td>\n   </tr>\n   <tr>\n      <td>1</td>\n      <td style="background:lightsalmon;">&nbsp;</td>\n      <td>lightsalmon</td>\n      <td>#FFA07A</td>\n      <td>rgb(255,160,122)</td>\n   </tr>\n   <tr>\n      <td>2</td>\n      <td style="background:salmon;">&nbsp;</td>\n      <td>salmon</td>\n      <td>#FA8072</td>\n      <td>rgb(250,128,114)</td>\n   </tr>\n   <tr>\n      <td>3</td>\n      <td style="background:darksalmon;">&nbsp;</td>\n      <td>darksalmon</td>\n      <td>#E9967A</td>\n      <td>rgb(233,150,122)</td>\n   </tr>\n   <tr>\n      <td>4</td>\n      <td style="background:lightcoral;">&nbsp;</td>\n      <td>lightcoral</td>\n      <td>#F08080</td>\n      <td>rgb(240,128,128)</td>\n   </tr>\n   <tr>\n      <td>5<

In [164]:
soup = BeautifulSoup(data, 'html5lib')
soup

<html><head></head><body>
      <h1>Partital List  of HTML5 Supported Colors</h1>
<table border="1" class="main-table">
   <tbody><tr>
      <td>Number </td>
      <td>Color</td>
      <td>Color Name</td>
      <td>Hex Code<br/>#RRGGBB</td>
      <td>Decimal Code<br/>(R,G,B)</td>
   </tr>
   <tr>
      <td>1</td>
      <td style="background:lightsalmon;"> </td>
      <td>lightsalmon</td>
      <td>#FFA07A</td>
      <td>rgb(255,160,122)</td>
   </tr>
   <tr>
      <td>2</td>
      <td style="background:salmon;"> </td>
      <td>salmon</td>
      <td>#FA8072</td>
      <td>rgb(250,128,114)</td>
   </tr>
   <tr>
      <td>3</td>
      <td style="background:darksalmon;"> </td>
      <td>darksalmon</td>
      <td>#E9967A</td>
      <td>rgb(233,150,122)</td>
   </tr>
   <tr>
      <td>4</td>
      <td style="background:lightcoral;"> </td>
      <td>lightcoral</td>
      <td>#F08080</td>
      <td>rgb(240,128,128)</td>
   </tr>
   <tr>
      <td>5</td>
      <td style="background:coral;"> </

In [170]:
# FInd a html table in webpage
table = soup.find('table')

In [169]:
# get all rows from the table


[<td>Number </td>,
 <td>Color</td>,
 <td>Color Name</td>,
 <td>Hex Code<br/>#RRGGBB</td>,
 <td>Decimal Code<br/>(R,G,B)</td>,
 <td>1</td>,
 <td style="background:lightsalmon;"> </td>,
 <td>lightsalmon</td>,
 <td>#FFA07A</td>,
 <td>rgb(255,160,122)</td>,
 <td>2</td>,
 <td style="background:salmon;"> </td>,
 <td>salmon</td>,
 <td>#FA8072</td>,
 <td>rgb(250,128,114)</td>,
 <td>3</td>,
 <td style="background:darksalmon;"> </td>,
 <td>darksalmon</td>,
 <td>#E9967A</td>,
 <td>rgb(233,150,122)</td>,
 <td>4</td>,
 <td style="background:lightcoral;"> </td>,
 <td>lightcoral</td>,
 <td>#F08080</td>,
 <td>rgb(240,128,128)</td>,
 <td>5</td>,
 <td style="background:coral;"> </td>,
 <td>coral</td>,
 <td>#FF7F50</td>,
 <td>rgb(255,127,80)</td>,
 <td>6</td>,
 <td style="background:tomato;"> </td>,
 <td>tomato</td>,
 <td>#FF6347</td>,
 <td>rgb(255,99,71)</td>,
 <td>7</td>,
 <td style="background:orangered;"> </td>,
 <td>orangered</td>,
 <td>#FF4500</td>,
 <td>rgb(255,69,0)</td>,
 <td>8</td>,
 <td style=

In [171]:
#Get all rows from the table
for row in table.find_all('tr'): # in html table row is represented by the tag <tr>
    # Get all columns in each row.
    cols = row.find_all('td') # in html a column is represented by the tag <td>
    color_name = cols[2].string # store the value in column 3 as color_name
    color_code = cols[3].text # store the value in column 4 as color_code
    print("{}--->{}".format(color_name,color_code))

Color Name--->Hex Code#RRGGBB
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF


### Scarping tables from a WEB PAGE USING PANDAS

In [176]:
# Particularly for extratcing tabular data from webpage we can use read_html() of pandas
import pandas as pd
tables = pd.read_html(url)


In [175]:
tables[0]

Unnamed: 0,0,1,2,3,4
0,Number,Color,Color Name,Hex Code #RRGGBB,"Decimal Code (R,G,B)"
1,1,,lightsalmon,#FFA07A,"rgb(255,160,122)"
2,2,,salmon,#FA8072,"rgb(250,128,114)"
3,3,,darksalmon,#E9967A,"rgb(233,150,122)"
4,4,,lightcoral,#F08080,"rgb(240,128,128)"
5,5,,coral,#FF7F50,"rgb(255,127,80)"
6,6,,tomato,#FF6347,"rgb(255,99,71)"
7,7,,orangered,#FF4500,"rgb(255,69,0)"
8,8,,gold,#FFD700,"rgb(255,215,0)"
9,9,,orange,#FFA500,"rgb(255,165,0)"
