In [378]:
import pprint
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Exercise 1: Parsing HTML with BeautifulSoup

In [379]:
# Load URL using urlopen
url = 'https://octopus.developers.institute/courses/collection/125/course/650/section/1764/chapter/3432'
# The context manager closes the HTTP connection as soon as the body has been
# read, and the timeout keeps the cell from hanging on an unresponsive server.
with urlopen(url, timeout=30) as http_response:
    response = http_response.read()  # raw page bytes
soup = BeautifulSoup(response, 'html.parser')

# Check
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <!-- Basic Page Needs
  <title>
   Developers Institute
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="Developers Institute - Global Software Development Bootcamp" name="description"/>
  <!-- Favicon -->
  <link href="https://di1earning022025prod.s3.amazonaws.com/static/dynamic_campus_app/TBL_Dynamic_Content_Manager/logo_icon_file/logo_di.png" rel="icon" type="image/png"/>
  <!-- CSS 
  <link href="https://di1earning022025prod.s3.amazonaws.com/theme/css/framework.css" rel="stylesheet"/>
  <link href="https://di1earning022025prod.s3.amazonaws.com/theme/css/style.css" rel="stylesheet"/>
  <link href="https://di1earning022025prod.s3.amazonaws.com/theme/css/night-mode.css" rel="stylesheet"/>
  <link href="https://di1earning022025prod.s3.amazonaws.com/theme/css/bootstrap.css" rel="stylesheet"/>
  <link href="//maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.m

In [380]:
# Static sample page ("Sports World") kept inline so the parsing steps below
# run against known markup: one <title>, four <p> elements and three <a> links.
html = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
    <style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>

    <header>
        <h1>Welcome to Sports World</h1>
        <p>Your one-stop destination for the latest sports news and videos.</p>
    </header>

    <nav>
        <a href="#football">Football</a>
        <a href="#basketball">Basketball</a>
        <a href="#tennis">Tennis</a>
    </nav>

    <section id="football">
        <h2>Football</h2>
        <article>
            <h3>Latest Football News</h3>
            <p>Read about the latest football matches and player news.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="basketball">
        <h2>Basketball</h2>
        <article>
            <h3>NBA Highlights</h3>
            <p>Watch highlights from the latest NBA games.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="tennis">
        <h2>Tennis</h2>
        <article>
            <h3>Grand Slam Updates</h3>
            <p>Get the latest updates from the world of Grand Slam tennis.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
            </div>
        </article>
    </section>

    <footer>
        <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
            <label for="name">Name:</label><br>
            <input type="text" id="name" name="name"><br>
            <label for="email">Email:</label><br>
            <input type="email" id="email" name="email"><br>
            <label for="message">Message:</label><br>
            <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Send">
        </form>
    </footer>

</body>
</html>"""

# Parse the inline sample page into a navigable tree
soup = BeautifulSoup(html, 'html.parser')

# Page title
print("Title:", soup.title.get_text())

# Every paragraph, labelled P1, P2, ... in document order
paragraphs = soup.find_all("p")
for number, paragraph in enumerate(paragraphs, start=1):
    print(f"P{number}: {paragraph.get_text()}")

# Target of every anchor tag (<a href="">)
links = soup.find_all("a")
for anchor in links:
    print(anchor.get('href'))

Title: Sports World
P1: Your one-stop destination for the latest sports news and videos.
P2: Read about the latest football matches and player news.
P3: Watch highlights from the latest NBA games.
P4: Get the latest updates from the world of Grand Slam tennis.
#football
#basketball
#tennis


# Exercise 2: Scraping robots.txt from Wikipedia

In [381]:
url = 'https://en.wikipedia.org/robots.txt'
# Request headers sent with the download; the User-Agent carries a
# bot-identification suffix.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36 "
        "(compatible; MyWikipediaScraper/1.0; +https://example.com/bot-info)"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
}

# Using Response
# requests never times out on its own, so bound the wait explicitly.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of printing an error page as if it were robots.txt.
response.raise_for_status()
print(response.text)

# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Z

# Exercise 3: Extracting Headers from Wikipedia’s Main Page

In [382]:
url = 'https://en.wikipedia.org/'
# Request headers sent with the download; the User-Agent carries a
# bot-identification suffix.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36 "
        "(compatible; MyWikipediaScraper/1.0; +https://example.com/bot-info)"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
}

# Load Soup Object
# requests never times out on its own, so bound the wait explicitly.
response = requests.get(url, headers=headers, timeout=30).text
soup = BeautifulSoup(response, 'html.parser')

# Extract and Display All Header Tags
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Per-level view: one line per heading level that actually occurs on the page.
for t in tags:
    found = soup.find_all(t)  # search once and reuse the result
    if found:
        print(t, [h.get_text() for h in found])

# OR
# Document-order view: a single search for all heading levels at once.
heads = soup.find_all(tags)
heads


h1 ['Main Page', 'Welcome to Wikipedia']
h2 ["From today's featured article", 'Did you know\xa0...', 'In the news', 'On this day', "Today's featured picture", 'Other areas of Wikipedia', "Wikipedia's sister projects", 'Wikipedia languages']


[<h1 class="firstHeading mw-first-heading" id="firstHeading" style="display: none"><span class="mw-page-title-main">Main Page</span></h1>,
 <h1 id="Welcome_to_Wikipedia">Welcome to <a href="/wiki/Wikipedia" title="Wikipedia">Wikipedia</a></h1>,
 <h2 class="mp-h2 mw-html-heading" id="mp-tfa-h2">From today's featured article</h2>,
 <h2 class="mp-h2 mw-html-heading" id="mp-dyk-h2">Did you know ...</h2>,
 <h2 class="mp-h2 mw-html-heading" id="mp-itn-h2">In the news</h2>,
 <h2 class="mp-h2 mw-html-heading" id="mp-otd-h2">On this day</h2>,
 <h2 class="mp-h2 mw-html-heading" id="mp-tfp-h2">Today's featured picture</h2>,
 <h2 class="mp-h2 mw-html-heading" id="mp-other">Other areas of Wikipedia</h2>,
 <h2 class="mp-h2 mw-html-heading" id="mp-sister">Wikipedia's sister projects</h2>,
 <h2 class="mp-h2 mw-html-heading" id="mp-lang">Wikipedia languages</h2>]

# Exercise 4: Checking for Page Title

In [383]:
# Write a Python Program to check whether a page contains a title or not
def has_title(url, timeout=10):
    """Report whether the page at ``url`` contains a ``<title>`` element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a host that
            never answers.

    Returns:
        True if the parsed document has a ``<title>`` tag, otherwise False.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page_source = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(page_source, 'html.parser')
    # soup.title is None when the document has no <title> element.
    return soup.title is not None

# Example run: performs a real HTTP request; the result is shown as the cell output.
has_title('https://chatgpt.com/')

False

# Exercise 5: Analyzing US-CERT Security Alerts

In [384]:
# Load Selenium Driver Object
# (webdriver, By, WebDriverWait, EC and TimeoutException come from the
# notebook's import cell.)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

url = 'https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93'

alert_list = []        # date text of every 2025 alert seen so far
stop_scraping = False  # flag to stop all loops

# try/finally guarantees the headless browser is shut down even when a wait
# times out or an element lookup fails part-way through the crawl.
try:
    # Load the webpage
    driver.get(url)
    wait = WebDriverWait(driver, 20)

    while True:
        # Wait for and retrieve all alert elements on the current page
        alert_elements = wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "c-teaser__row"))
        )

        # Loop through alerts on the current page
        for a in alert_elements:
            date_text = a.find_element(By.CLASS_NAME, 'c-teaser__date').text
            year = date_text[-4:]  # last four characters of the date text
            if year == '2025':
                alert_list.append(date_text)
            elif year == '2024':
                stop_scraping = True
                break  # stop inner loop

        if stop_scraping:
            print("Reached 2024 alerts — stopping scrape.")
            break  # stop outer loop too

        # Try to find the Next button. Only a wait timeout means "no more
        # pages"; any other error is a real failure and must propagate.
        try:
            next_button = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.c-pager__item--next a'))
            )
        except TimeoutException:
            print("No next page button found — stopping.")
            break

        # Click Next via JavaScript after scrolling the link into view
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
        driver.execute_script("arguments[0].click();", next_button)

        # Wait until the old alerts are stale (page changed)
        wait.until(EC.staleness_of(alert_elements[0]))
finally:
    driver.quit()

# Show final count
print(f"\nNumber of 2025 alerts found: {len(alert_list)}")


Reached 2024 alerts — stopping scrape.

Number of 2025 alerts found: 237


# Exercise 6: Scraping Movie Details
Write a Python program to get the movie name, year, and a brief summary of the top 10 random movies.

In [385]:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver

# Headless Chrome with container-friendly flags and a desktop browser user agent
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)

driver = webdriver.Chrome(options=options)

# Open the IMDb list page
url = 'https://www.imdb.com/list/ls091294718/'
driver.get(url)

# Explicit-wait helper bound to this driver (10 second ceiling)
wait = WebDriverWait(driver, 10)

In [387]:
# Load Top 10 Movies from Webpage
# Block until the list items are present in the DOM: a bare find_elements call
# returns an empty list if it runs before the page has rendered, which would
# make the loop below print nothing without any error.
movies = wait.until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "dli-parent"))
)[:10]

# Extract title, year and plot summary for each list item
for m in movies:
    title = m.find_element(By.CLASS_NAME, "ipc-title__text--reduced").text.strip()
    # First four characters of the first metadata element's text (the year in the recorded output)
    year = m.find_elements(By.CLASS_NAME, "dli-title-metadata")[0].text[:4]
    summary = m.find_element(By.CLASS_NAME, "title-description-plot-container").text
    print(f"{title} - {year} - {summary}")


1. The Thing - 1982 - A research team in Antarctica is hunted by a shape-shifting alien that assumes the appearance of its victims.
2. American Psycho - 2000 - A wealthy New York City investment banking executive, Patrick Bateman, hides his alternate psychopathic ego from his co-workers and friends as he delves deeper into his violent, hedonistic fantasies.
3. Jaws - 1975 - When a massive killer shark unleashes chaos on a beach community off Long Island, it's up to the local police chief, a marine biologist, and an old seafarer to hunt the beast down.
4. The Evil Dead - 1981 - Five friends travel to a cabin in the woods, where they unknowingly release flesh-possessing demons.
5. Top Gun - 1986 - The Top Gun Naval Fighter Weapons School is where the best of the best train to refine their elite flying skills. When hotshot fighter pilot Maverick is sent to the school, his reckless attitude and cocky demeanor put him at odds with the other pilots, especially the cool and collected Iceman.
