In [25]:
!pip install requests beautifulsoup4 pandas



🌟 Exercise 1 : Parsing HTML with BeautifulSoup
Instructions
Objective: Use urlopen() to fetch the HTML content of a webpage and then parse it using BeautifulSoup.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Sample "Sports World" HTML page, kept inline as a string literal.
# It is the document handed to BeautifulSoup in the parsing cell that follows,
# so the Exercise 1 steps need no network request.

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
    <style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>

    <header>
        <h1>Welcome to Sports World</h1>
        <p>Your one-stop destination for the latest sports news and videos.</p>
    </header>

    <nav>
        <a href="#football">Football</a>
        <a href="#basketball">Basketball</a>
        <a href="#tennis">Tennis</a>
    </nav>

    <section id="football">
        <h2>Football</h2>
        <article>
            <h3>Latest Football News</h3>
            <p>Read about the latest football matches and player news.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="basketball">
        <h2>Basketball</h2>
        <article>
            <h3>NBA Highlights</h3>
            <p>Watch highlights from the latest NBA games.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="tennis">
        <h2>Tennis</h2>
        <article>
            <h3>Grand Slam Updates</h3>
            <p>Get the latest updates from the world of Grand Slam tennis.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
            </div>
        </article>
    </section>

    <footer>
        <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
            <label for="name">Name:</label><br>
            <input type="text" id="name" name="name"><br>
            <label for="email">Email:</label><br>
            <input type="email" id="email" name="email"><br>
            <label for="message">Message:</label><br>
            <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Send">
        </form>
    </footer>

</body>
</html>
"""

In [4]:
# Create a BeautifulSoup object to parse the inline HTML string.
soup = BeautifulSoup(html, 'html.parser')

# A bare expression is displayed only when it is the last line of a cell,
# so `soup` is the single trailing expression: it renders the parsed document.
soup


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Sports World</title>
<style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>
<header>
<h1>Welcome to Sports World</h1>
<p>Your one-stop destination for the latest sports news and videos.</p>
</header>
<nav>
<a href="#football">Football</a>
<a href="#basketball">Basketball</a>
<a href="#tennis">Tennis</a>
</nav>
<section id="football">
<h2>Football</h2>
<article>
<h3>Latest Football News</h3>
<p>Read about the latest football matches and player news.</p>
<div class="video">
<iframe allowf

In [5]:
# Locate the <title> element and read the text it contains.
page_title = soup.find('title').string
print("Title of the page:", page_title)

Title of the page: Sports World


In [8]:
# Extract all paragraphs (<p> tags) from the page and print them, numbered from 1.
paragraphs = soup.find_all('p')
for number, p_tag in enumerate(paragraphs, start=1):
    print(f"Paragraph {number}: {p_tag.get_text()}")


Paragraph 1: Your one-stop destination for the latest sports news and videos.
Paragraph 2: Read about the latest football matches and player news.
Paragraph 3: Watch highlights from the latest NBA games.
Paragraph 4: Get the latest updates from the world of Grand Slam tennis.


In [9]:
# Collect every anchor (<a>) tag and print the value of its href attribute.
links = soup.find_all('a')
for position, anchor in enumerate(links, start=1):
    print(f"Link {position}: {anchor.get('href')}")

Link 1: #football
Link 2: #basketball
Link 3: #tennis


🌟 Exercise 2 : Scraping robots.txt from Wikipedia
Instructions
Write a Python program to download and display the content of robots.txt for Wikipedia.

In [11]:
# Download Wikipedia's robots.txt and display its content.
robots_url = "https://en.wikipedia.org/robots.txt"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(robots_url, timeout=10)

if response.status_code == 200:
    robots_content = response.text
    print("Content of robots.txt:")
    print(robots_content)
else:
    # Report the failure instead of silently printing nothing.
    print(f"Failed to fetch robots.txt (HTTP status {response.status_code}).")

Content of robots.txt:
﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Di

🌟 Exercise 3 : Extracting Headers from Wikipedia’s Main Page
Instructions
Write a Python program to extract and display all the header tags from Wikipedia's main page.

In [12]:
# Fetch Wikipedia's main page and parse it.
wiki = "https://en.wikipedia.org/wiki/Main_Page"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(wiki, timeout=10)
# Fail loudly on an HTTP error status instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [14]:
# Gather every heading element, <h1> through <h6>, and print its text.
heading_tags = [f"h{level}" for level in range(1, 7)]
headers_wiki = soup.find_all(heading_tags)
for number, heading in enumerate(headers_wiki, start=1):
    print(f"Header {number}: {heading.text}")

Header 1: Main Page
Header 2: Welcome to Wikipedia
Header 3: From today's featured article
Header 4: Did you know ...
Header 5: In the news
Header 6: On this day
Header 7: From today's featured list
Header 8: Today's featured picture
Header 9: Other areas of Wikipedia
Header 10: Wikipedia's sister projects
Header 11: Wikipedia languages


🌟 Exercise 4 : Checking for Page Title
Instructions
Write a Python program to check whether a page contains a title or not.

In [15]:
# Check whether the page has a non-empty <title>.
url = "https://en.wikipedia.org/wiki/Main_Page"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=10)
# Fail loudly on an HTTP error status instead of inspecting an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

title = soup.title
# `soup.title` is None when the tag is missing, and `.string` can be None
# (e.g. for an empty <title></title>), so check both before reporting a title.
title_text = title.string if title is not None else None
if title_text and title_text.strip():
    print("Title of the page:", title_text)
else:
    print("No title found on the page.")

Title of the page: Wikipedia, the free encyclopedia


🌟 Exercise 5 : Analyzing US-CERT Security Alerts
Instructions
Write a Python program to get the number of security alerts issued by US-CERT in the current year.
Source



In [22]:
# Fetch the CISA cybersecurity advisories listing (filtered by advisory type).
url = "https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93"
# A timeout prevents the cell from blocking forever on an unresponsive server.
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

if response.status_code == 200:
    print("Successfully fetched the webpage.")
else:
    print("Failed to fetch the webpage.")


Successfully fetched the webpage.


In [23]:

def parsing(response, year="2024"):
    """Count the alert date entries on a listing page that mention ``year``.

    Args:
        response: HTTP response object exposing ``status_code`` and ``text``
            (for example the value returned by ``requests.get``).
        year: Year to look for in each date entry's text. Defaults to
            ``"2024"`` to match the original behaviour; pass e.g.
            ``datetime.date.today().year`` to count the current year.

    Returns:
        int: Number of ``<div class="c-teaser__date">`` elements whose text
        contains ``year``. Returns 0 (after printing a message) when the
        response status code is not 200.
    """
    if response.status_code != 200:
        print("No results found")
        return 0

    soup = BeautifulSoup(response.text, "html.parser")
    alerts = soup.find_all("div", class_="c-teaser__date")
    wanted = str(year)  # accept ints such as 2024 as well as strings
    return sum(1 for alert in alerts if wanted in alert.text)


# Count the matching alerts in the `response` currently held in the notebook.
parsing(response)

10

In [24]:
# Parse every results page and add up the alerts counted on each one.
MAX_PAGES = 50  # upper bound on the number of listing pages requested
sum_alerts = 0

# Reuse one HTTP session so the page requests share a connection pool, and
# set a timeout so a single stalled request cannot hang the whole loop.
with requests.Session() as session:
    for i in range(MAX_PAGES):
        url = f"https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&page={i}"
        response = session.get(url, timeout=10)

        alerts_page = parsing(response)
        sum_alerts += alerts_page
        print(f"Alerts on page {i}: {alerts_page}")

print(f"Total number of alerts in 2024: {sum_alerts}")


Alerts on page 0: 10
Alerts on page 1: 10
Alerts on page 2: 10
Alerts on page 3: 10
Alerts on page 4: 10
Alerts on page 5: 10
Alerts on page 6: 10
Alerts on page 7: 10
Alerts on page 8: 10
Alerts on page 9: 10
Alerts on page 10: 10
Alerts on page 11: 10
Alerts on page 12: 10
Alerts on page 13: 10
Alerts on page 14: 10
Alerts on page 15: 10
Alerts on page 16: 10
Alerts on page 17: 10
Alerts on page 18: 10
Alerts on page 19: 10
Alerts on page 20: 10
Alerts on page 21: 10
Alerts on page 22: 10
Alerts on page 23: 10
Alerts on page 24: 10
Alerts on page 25: 10
Alerts on page 26: 10
Alerts on page 27: 10
Alerts on page 28: 10
Alerts on page 29: 10
Alerts on page 30: 10
Alerts on page 31: 10
Alerts on page 32: 10
Alerts on page 33: 10
Alerts on page 34: 10
Alerts on page 35: 10
Alerts on page 36: 6
Alerts on page 37: 0
Alerts on page 38: 0
Alerts on page 39: 0
Alerts on page 40: 0
Alerts on page 41: 0
Alerts on page 42: 0
Alerts on page 43: 0
Alerts on page 44: 0
Alerts on page 45: 0
Alerts o