# Web Scraping and Data Extraction Using Python (Requests & BeautifulSoup)

In [80]:
# Step 1: Import the required modules and functions
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a webpage

In [81]:
# Step 2: Define the webpage URL to download
url = "http://www.ibm.com"

In [82]:
# Step 3: Get the contents of the webpage in text format and store in a variable called data
data  = requests.get(url).text 

In [83]:
# Step 4: Create a soup object using the class BeautifulSoup
soup=BeautifulSoup(data, "html.parser")  # create a soup object using the variable 'data'

# Scrape all links

In [84]:
# Step 5: Scrape all links
for link in soup.find_all('a'):  # in html anchor/link is represented by the tag <a>
    print(link.get('href'))

https://www.ibm.com/thought-leadership/institute-business-value/en-us/report/business-trends-2026?lnk=hpls1us
https://www.ibm.com/think/insights/lets-create-smarter-business?lnk=hpls2us
https://skillsbuild.org/adult-learners/explore-learning/artificial-intelligence?lnk=hprc1us
https://www.ibm.com/quantum/blog/quantum-advantage-era?lnk=hprc2us
https://www.ibm.com/case-studies?lnk=hprc3us
https://www.ibm.com/think/insights/scale-ai-agents-business?lnk=hprc4us
https://www.ibm.com/case-studies/scuderia-ferrari?lnk=hpcs1us
https://www.ibm.com/case-studies/sixt?lnk=hprc2us
https://www.ibm.com/case-studies/us-open?lnk=hpcs3us
https://www.ibm.com/case-studies/ufc?lnk=hpcs4us
https://www.ibm.com/case-studies/avid-solutions-international?lnk=hpcs1us
https://www.ibm.com/case-studies/avid-solutions-international?lnk=hpcs5us
https://www.ibm.com/products/offers-and-discounts?lnk=hpdo1us
https://www.ibm.com/software?lnk=hpfp1us
https://www.ibm.com/solutions/ai-agents?lnk=hpfp2us
https://www.ibm.com/s

In [85]:
# Step 6: Extract all links, remove duplicates and unwanted links, and print the cleaned list
# 6.1: Collect all links into a list
links = []

for tag in soup.find_all("a"):   # <a> = hyperlink tag
    href = tag.get("href")       # get the URL value
    if href:                     # ignore empty values
        links.append(href)

print("Total links found (raw):", len(links))

# Step 6.2: Remove duplicate links using a set
unique_links = list(set(links))

print("Unique links after cleaning:", len(unique_links))

# Step 6.3: Remove junk links (#, javascript, mailto)----- Basic cleaning -----
clean_links = []

for link in unique_links:
    if link.startswith("#"):
        continue
    if link.startswith("javascript"):
        continue
    if link.startswith("mailto:"):
        continue
    clean_links.append(link)

print("Final cleaned links:", len(clean_links))
print("\n--- CLEANED LINKS ---")

for link in clean_links:
    print(link)
                     


Total links found (raw): 32
Unique links after cleaning: 32
Final cleaned links: 32

--- CLEANED LINKS ---
https://www.ibm.com/case-studies/avid-solutions-international?lnk=hpcs5us
https://www.ibm.com/careers?lnk=hpii1us
https://www.ibm.com/products/offers-and-discounts?lnk=hpdo1us
https://www.ibm.com/about?lnk=hpii1us
https://www.ibm.com/thought-leadership/institute-business-value/en-us/report/business-trends-2026?lnk=hpls1us
https://www.ibm.com/case-studies/ufc?lnk=hpcs4us
https://www.ibm.com/solutions/data-and-ai?lnk=hpfp3us
https://newsroom.ibm.com/2025-12-08-ibm-to-acquire-confluent-to-create-smart-data-platform-for-enterprise-generative-ai
https://www.ibm.com/solutions/hybrid-cloud?lnk=hpfp5us
https://skillsbuild.org?lnk=hpii1us
https://newsroom.ibm.com/2025-12-08-riyadh-air-and-ibm-partner-to-launch-worlds-first-ai-native-airline
https://skillsbuild.org/adult-learners/explore-learning/artificial-intelligence?lnk=hprc1us
https://newsroom.ibm.com/2025-12-05-ibm-designated-as-a-cri

# Key Insights:

“The raw URL list contained duplicates, tracking-parameter variations, and invalid values such as None.

# Key Insights:

cleaned list is:

✔️ Deduplicated

✔️ Free of invalid values (like None)

✔️ More meaningful for analysis

✔️ Closer to a canonical link list

# Categorizing Website Links by Business Themes (AI, Cloud, Careers)”

In [86]:
# Step 7: Filtered Link Groups by Topic: AI, Cloud, and Careers
# ----- TOPIC FILTERS -----
ai_links = []
cloud_links = []
careers_links = []

for link in clean_links:
    lower = link.lower()

    if "ai" in lower or "artificial-intelligence" in lower:
        ai_links.append(link)

    if "cloud" in lower or "hybrid-cloud" in lower:
        cloud_links.append(link)

    if "career" in lower or "jobs" in lower or "careers" in lower:
        careers_links.append(link)

# ----- PRINT RESULTS -----
print("\nAI Links:", len(ai_links))
for l in ai_links:
    print(l)

print("\nCloud Links:", len(cloud_links))
for l in cloud_links:
    print(l)

print("\nCareers Links:", len(careers_links))
for l in careers_links:
    print(l)


AI Links: 8
https://www.ibm.com/solutions/data-and-ai?lnk=hpfp3us
https://newsroom.ibm.com/2025-12-08-ibm-to-acquire-confluent-to-create-smart-data-platform-for-enterprise-generative-ai
https://newsroom.ibm.com/2025-12-08-riyadh-air-and-ibm-partner-to-launch-worlds-first-ai-native-airline
https://skillsbuild.org/adult-learners/explore-learning/artificial-intelligence?lnk=hprc1us
https://newsroom.ibm.com/2025-12-11-ibm-and-pearson-collaborate-to-build-new-ai-powered-learning-tools-for-organizations-and-individuals-worldwide
https://www.ibm.com/think/insights/scale-ai-agents-business?lnk=hprc4us
https://www.ibm.com/solutions/ai-agents?lnk=hpfp2us
https://www.ibm.com/solutions/ai-models?lnk=hpfp6us

Cloud Links: 1
https://www.ibm.com/solutions/hybrid-cloud?lnk=hpfp5us

Careers Links: 1
https://www.ibm.com/careers?lnk=hpii1us


# Scrape all images

In [87]:
# Step 8: Scrape all image URLs from the webpage
for link in soup.find_all('img'):# in html image is represented by the tag <img>
    print(link.get('src'))

https://assets.ibm.com/is/image/ibm/finance-operations?ts=1766211021706&dpr=off
https://assets.ibm.com/is/image/ibm/adobestock_582381075?ts=1766204720037&dpr=off
https://assets.ibm.com/is/image/ibm/ai4b206_newsletter_informationai_optimize?ts=1766064697449&dpr=off
https://assets.ibm.com/is/image/ibm/scaling-lines-green?ts=1766064701042&dpr=off
https://assets.ibm.com/is/image/ibm/ibm_ferrari_video_thumb_01?ts=1766064701879&dpr=off
https://assets.ibm.com/is/image/ibm/sixt-smarter-business-case-study-regular-medium-ls?ts=1766064702380&dpr=off
https://assets.ibm.com/is/image/ibm/us-open-crowd-court?ts=1766064702950&dpr=off
https://assets.ibm.com/is/image/ibm/1731592760476?ts=1766064703461&dpr=off
https://assets.ibm.com/is/image/ibm/avid-solutions-leadspace?ts=1766064704010&dpr=off
https://assets.ibm.com/is/image/ibm/big-blue-30-shopping-promotion-banner?ts=1766064704629&dpr=off
https://assets.ibm.com/is/image/ibm/watson-discovery_eye-connection_1x1_padding?ts=1766064710354&dpr=off


# Scrape data from html tables

In [88]:
# Step 9: Define the URL of the webpage that contains an HTML table

#The below URL contains a html table with data about colors and color codes.
URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"


In [89]:
# Step 10: Get the contents of the webpage in text format and store in a variable called data
data1 = requests.get(URL).text

In [90]:
# Step 11: Create a soup object using the class BeautifulSoup
soup = BeautifulSoup(data1,"html.parser")

In [91]:
# Step 12: Find a html table in the web page
table = soup.find('table') # in html table is represented by the tag <table>

# Get all rows from the table

In [92]:
# Step 13: Extract color names and color codes from the HTML table
for row in table.find_all('tr'): # in html table row is represented by the tag <tr>
    # Get all columns in each row.
    cols = row.find_all('td') # in html a column is represented by the tag <td>
    color_name = cols[2].getText() # store the value in column 3 as color_name
    color_code = cols[3].getText() # store the value in column 4 as color_code
    print("{}--->{}".format(color_name,color_code))

Color Name--->Hex Code#RRGGBB
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF
