# Web Scraping with Python


In [1]:
import pandas as pd
URL = 'https://en.wikipedia.org/wiki/List_of_largest_banks'
tables = pd.read_html(URL)
df = tables[0]
print(df)

    Rank                                Bank name  \
0      1  Industrial and Commercial Bank of China   
1      2               Agricultural Bank of China   
2      3                  China Construction Bank   
3      4                            Bank of China   
4      5                           JPMorgan Chase   
..   ...                                      ...   
95    96                            Handelsbanken   
96    97                 Industrial Bank of Korea   
97    98                                      DNB   
98    99                      Qatar National Bank   
99   100                  National Bank of Canada   

    Total assets (2024) (US$ billion)  
0                             6303.44  
1                             5623.12  
2                             5400.28  
3                             4578.28  
4                             4002.81  
..                                ...  
95                             351.79  
96                             345.81  
97 

In [3]:
!pip install bs4
!pip install requests pandas

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [4]:
from bs4 import BeautifulSoup
import requests

In [5]:
%%html
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<h3><b id='boldest'>Lebron James</b></h3>
<p> Salary: $ 92,000,000 </p>
<h3> Stephen Curry</h3>
<p> Salary: $85,000, 000 </p>
<h3> Kevin Durant </h3>
<p> Salary: $73,200, 000</p>
</body>
</html>

In [6]:
html="<!DOCTYPE html><html><head><title>Page Title</title></head><body><h3><b id='boldest'>Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>"

In [8]:
soup = BeautifulSoup(html, "html.parser")

In [9]:
# prettify() to display the HTML in the nested structure:
print(soup)

<bound method Tag.prettify of <!DOCTYPE html>
<html><head><title>Page Title</title></head><body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>>


## Tag

In [None]:
tag_object = soup.title
print("tag: ", tag_object)
print("type: ", type(tag_object))

In [19]:
tag_object = soup.h3
print("Head", tag_object)

Head <h3><b id="boldest">Lebron James</b></h3>


## Children, Parents and  Siblings

In [20]:
tag_child = tag_object.b
tag_child

<b id="boldest">Lebron James</b>

In [21]:
parent_tag = tag_child.parent
parent_tag

<h3><b id="boldest">Lebron James</b></h3>

In [23]:
sibling_1 = tag_object.next_sibling
sibling_1

<p> Salary: $ 92,000,000 </p>

In [26]:
sibling2 = sibling_1.next_sibling
sibling2

<h3> Stephen Curry</h3>

## HTML Attributes

In [27]:
tag_child["id"]

'boldest'

In [29]:
#access that dictionary directly as attrs:
tag_child.attrs

{'id': 'boldest'}

In [30]:
tag_child.get("id")

'boldest'

## Navigable String

In [31]:
tag_str = tag_child.string
tag_str

'Lebron James'

In [32]:
type(tag_str)

bs4.element.NavigableString

In [33]:
unicodeStr = str(tag_str)
unicodeStr

'Lebron James'

## Filter

In [34]:
%%html
<table>
  <tr>
    <td id='flight' >Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
   </tr>
  <tr> 
    <td>1</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td>
    <td>80 kg</td>
  </tr>
</table>

0,1,2
Flight No,Launch site,Payload mass
1,Florida,300 kg
2,Texas,94 kg
3,Florida,80 kg


In [35]:
table="<table><tr><td id='flight'>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td><td>300 kg</td></tr><tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td><td>80 kg</td></tr></table>"

In [37]:
tableBS = BeautifulSoup(table, "html.parser") # "html.parser" tells beautifulsoup how to read and understand HTML code we gave it

## Find All

In [39]:
table_rows = tableBS.find_all("tr")
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td><td>80 kg</td></tr>]

In [41]:
frist_row = table_rows[0]
print(frist_row)

print(type(frist_row))

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
<class 'bs4.element.Tag'>


In [43]:
# combind children
frist_row.td

<td id="flight">Flight No</td>

In [44]:
for i, row in enumerate(table_rows):
    print("row", i, "is", row)

row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
row 1 is <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr>
row 2 is <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>
row 3 is <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td><td>80 kg</td></tr>


In [45]:
for i,row in enumerate(table_rows):
    print("row",i)
    cells=row.find_all('td')
    for j,cell in enumerate(cells):
        print('colunm',j,"cell",cell)

row 0
colunm 0 cell <td id="flight">Flight No</td>
colunm 1 cell <td>Launch site</td>
colunm 2 cell <td>Payload mass</td>
row 1
colunm 0 cell <td>1</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td>
colunm 2 cell <td>300 kg</td>
row 2
colunm 0 cell <td>2</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
colunm 2 cell <td>94 kg</td>
row 3
colunm 0 cell <td>3</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td>
colunm 2 cell <td>80 kg</td>


In [47]:
list_input=tableBS .find_all(name=["tr", "td"])
list_input

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td>,
 <td>300 kg</td>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td><td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td>,
 <td>80 kg</td>]

## Attributes

In [49]:
tableBS.find_all(id="flight")

[<td id="flight">Flight No</td>]

In [50]:
list_input=tableBS.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a>]

In [51]:
tableBS.find_all(href=True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a>]

In [52]:
#Using the soup object soup, find the element with the id attribute content set to "boldest".
soup.find_all(id = "boldest")

[<b id="boldest">Lebron James</b>]

## string

In [53]:
tableBS.find_all(string="Florida")

['Florida', 'Florida']

## Find

- The find_all() method scans the entire document looking for results.
- find() method to find the first element in the document.

In [54]:
%%html
<h3>Rocket Launch </h3>

<p>
<table class='rocket'>
  <tr>
    <td>Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
  </tr>
  <tr>
    <td>1</td>
    <td>Florida</td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td>Texas</td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td>Florida </td>
    <td>80 kg</td>
  </tr>
</table>
</p>
<p>

<h3>Pizza Party  </h3>
  
    
<table class='pizza'>
  <tr>
    <td>Pizza Place</td>
    <td>Orders</td> 
    <td>Slices </td>
   </tr>
  <tr>
    <td>Domino's Pizza</td>
    <td>10</td>
    <td>100</td>
  </tr>
  <tr>
    <td>Little Caesars</td>
    <td>12</td>
    <td >144 </td>
  </tr>
  <tr>
    <td>Papa John's </td>
    <td>15 </td>
    <td>165</td>
  </tr>

0,1,2
Flight No,Launch site,Payload mass
1,Florida,300 kg
2,Texas,94 kg
3,Florida,80 kg

0,1,2
Pizza Place,Orders,Slices
Domino's Pizza,10,100
Little Caesars,12,144
Papa John's,15,165


In [55]:
two_tables="<h3>Rocket Launch </h3><p><table class='rocket'><tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr><td>1</td><td>Florida</td><td>300 kg</td></tr><tr><td>2</td><td>Texas</td><td>94 kg</td></tr><tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table></p><p><h3>Pizza Party  </h3><table class='pizza'><tr><td>Pizza Place</td><td>Orders</td> <td>Slices </td></tr><tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr><tr><td>Little Caesars</td><td>12</td><td >144 </td></tr><tr><td>Papa John's </td><td>15 </td><td>165</td></tr>"

In [58]:
two_tables_BS = BeautifulSoup(two_tables,"html.parser")

In [59]:
# find 1st table
two_tables_BS.find("table")

<table class="rocket"><tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr><td>1</td><td>Florida</td><td>300 kg</td></tr><tr><td>2</td><td>Texas</td><td>94 kg</td></tr><tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table>

In [61]:
# fillter on class attributes to find 2nd table
two_tables_BS.find("table",class_="pizza")

<table class="pizza"><tr><td>Pizza Place</td><td>Orders</td> <td>Slices </td></tr><tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr><tr><td>Little Caesars</td><td>12</td><td>144 </td></tr><tr><td>Papa John's </td><td>15 </td><td>165</td></tr></table>

## Downloading And Scraping The Contents Of A Web Page

In [64]:
url = "http://www.ibm.com"

# use get to download content
data = requests.get(url).text

#crate BeatifulSoup object
soup = BeautifulSoup(data, "html.parser")

# Screpe all links
for link in soup.find_all("a", href=True):
    print(link.get("href"))

https://www.ibm.com/sports/wimbledon?lnk=hpls1us
https://www.ibm.com/think/news/engaging-wimbledon-fans-ibm-watsonx?lnk=hpls2us
https://community.ibm.com/community/user/blogs/brandon-pederson1/2025/06/26/meet-the-ibm-power11-family?lnk=hprc4us
https://www.ibm.com/case-studies/all-england-lawn-tennis-club-ibm-ix?lnk=hprc1us
https://www.ibm.com/products/watsonx-data?lnk=hprc2us
https://www.ibm.com/campaign/2024/ai-skills-tennis?lnk=hprc3us
https://www.ibm.com/granite?lnk=hpdev1us
https://developer.ibm.com/technologies/artificial-intelligence/?lnk=hpdev2us
https://skillsbuild.org/?lnk=hpdev3us
https://www.ibm.com/new/announcements/agentic-ai-governance-evaluation-and-lifecycle?lnk=hpdev4us
https://www.ibm.com/new/announcements/ibm-named-a-leader-in-the-2025-gartner-magic-quadrant-for-data-science-and-machine-learning-platforms?lnk=hpdev5us
https://www.ibm.com/new/announcements/ibm-leader-2025-omdia-universe-on-no-low-pro-ide-assistants-report?lnk=hpdev6us
https://www.ibm.com/products/wats

## Scrape all images Tags

In [65]:
for link in soup.find_all("img"): # This finds the <img> tag
    print(link)
    print(link.get("src")) ## This prints: image.png

<img alt="Ruby Zgabay  with Power11 S1122 server " class="cmp-image__image" height="6294" itemprop="contentUrl" loading="lazy" src="https://assets.ibm.com/is/image/ibm/202505_power11_mahaney_atx_s1122-ls2_1122?ts=1751986655725&amp;dpr=off" srcset="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" title="Technician working on server hardware" width="9441"/>
https://assets.ibm.com/is/image/ibm/202505_power11_mahaney_atx_s1122-ls2_1122?ts=1751986655725&dpr=off
<img alt="Illustration of a multiple tennis balls going over a net" class="cmp-image__image" height="1920" itemprop="contentUrl" loading="lazy" src="https://assets.ibm.com/is/image/ibm/ibm_wimbledon_2d_additional_p_1x1?ts=1751986659305&amp;dpr=off" srcset="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" width="1920"/>
https://assets.ibm.com/is/image/ibm/ibm_wimbledon_2d_additional_p_1x1?ts=1751986659305&dpr=off
<img alt="Illutration of many tennis balls organizing into a single c

## Scrape data from HTML tables

In [66]:
#The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [76]:
# get data content
data = requests.get(url).text

In [77]:
# creat soup object
soup = BeautifulSoup(data, "html.parser") #converts raw HTML data into a format Python can understand

In [78]:
# find table
table = soup.find("table")

In [80]:
# get all row from table
for row in table.find_all("tr"):

    # get column in each row
    cols = row.find_all("td")
    color_name = cols[2].string
    color_code = cols[3].string
    print("{}--->{}".format(color_name, color_code))

Color Name--->None
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF
