In [139]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

with open('htmpl.html', 'r') as file:
    # Read the content of the file
    html_content = file.read()

    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')


Exercise 1 : Parsing HTML With BeautifulSoup
--

In [140]:
soup.text

'\n\n\n\n\nSports World\n\n\n\n\nWelcome to Sports World\nYour one-stop destination for the latest sports news and videos.\n\n\nFootball\nBasketball\nTennis\n\n\nFootball\n\nLatest Football News\nRead about the latest football matches and player news.\n\n\n\n\n\n\n\nBasketball\n\nNBA Highlights\nWatch highlights from the latest NBA games.\n\n\n\n\n\n\n\nTennis\n\nGrand Slam Updates\nGet the latest updates from the world of Grand Slam tennis.\n\n\n\n\n\n\n\nName:\n\nEmail:\n\nMessage:\n\n\n\n\n\n'

In [141]:
title = soup.title
print("Title:", title.text)

Title: Sports World


In [142]:
# Extract all paragraphs from the page
paragraphs = soup.find_all('p')
for p in paragraphs:
    print("Paragraph:", p.text)

Paragraph: Your one-stop destination for the latest sports news and videos.
Paragraph: Read about the latest football matches and player news.
Paragraph: Watch highlights from the latest NBA games.
Paragraph: Get the latest updates from the world of Grand Slam tennis.


In [143]:
# Retrieve all links on the page
links = soup.find_all('a')
for link in links:
    href = link.get('href')
    if href:
        print("Link:", href)

Link: #football
Link: #basketball
Link: #tennis


Exercise 2 : Scraping Robots.Txt From Wikipedia
--

In [144]:
import requests

url_wiki_robot = 'https://en.wikipedia.org/robots.txt'

text_robot = requests.get(url_wiki_robot)

print(text_robot)



<Response [200]>


In [145]:
print(text_robot.text)

# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Z

Exercise 3 : Extracting Headers From Wikipedia’s Main Page
--

Write a Python program to extract and display all the header tags from en.wikipedia.org/wiki/Main_Page.

In [146]:
wiki_url = 'https://en.wikipedia.org/wiki/Main_Page'

response = requests.get(wiki_url)

soup_wiki_main = BeautifulSoup(response.text,'html.parser')

In [147]:
print(soup_wiki_main.find_all('head'))

[<head>
<meta charset="utf-8"/>
<title>Wikipedia, the free encyclopedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-not-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=classNa

In [148]:
header_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

for tag in header_tags:
    print(tag.text.strip())

Welcome to Sports World
Football
Latest Football News
Basketball
NBA Highlights
Tennis
Grand Slam Updates


Exercise 4 : Checking For Page Title
---

Write a Python program to check whether a page contains a title or not.

In [149]:
def has_title(url):
    try:
        response = requests.get(url)

        soup = BeautifulSoup(response.content, 'html.parser')
        
        title_tag = soup.find('title')

        if title_tag:
            return True
        else:
            return False
    except Exception as e:
        print("An error occurred:", e)
        return False

has_title('https://en.wikipedia.org/wiki/Main_Page')

True

Exercise 5 : Analyzing US-CERT Security Alerts
Instructions
---

Write a Python program get the number of security alerts issued by US-CERT in the current year.
Source: https://www.us-cert.gov/ncas/alerts

Exercise 6 : Scraping Movie Details
Instructions
---

Write a Python program to get movie name, year and a brief summary of the top 10 random movies.



In [150]:
import pandas as pd
import random as rd

In [168]:
url_film_rating = 'https://www.imdb.com/list/ls534888914/?ref_=login'

resp = requests.get(url_film_rating)

rating_soup = BeautifulSoup(resp.content,'html.parser')


In [169]:
films =  rating_soup.find_all('div',{'class':'lister-item-content'})

In [170]:
movie_data = {
    'movie_name':[],
    'film_year':[],
    'summury':[],
    'director':[],
    "stars":[]
}

for film in rd.choices(films,k=10):
    film_name = film.find('a').text
    movie_data['movie_name'].append(film_name)
    film_year = film.find('span',{'class':"lister-item-year text-muted unbold"}).text[1:-1]
    movie_data['film_year'].append(film_year) 
    film_summury =  film.find('p',{'class':''}).text
    movie_data['summury'].append(film_summury[1:])
    paragraphs = film.find_all('p', {'class': 'text-muted text-small'})
    for paragraph in paragraphs:
        if 'Director:' in paragraph.text:
            director_link = paragraph.find('a')
            if director_link:
                director = director_link.text
            star_links = paragraph.find_all('a')[1:]  # All 'a' tags after the first, which is the director
            stars = [star.text for star in star_links]
            break  # Exit after finding the director and stars
    movie_data['director'].append(director)
    movie_data['stars'].append(stars)

In [171]:
df_movies = pd.DataFrame(movie_data)

In [172]:
df_movies

Unnamed: 0,movie_name,film_year,summury,director,stars
0,Sen to Chihiro no kamikakushi,2001,"During her family's move to the suburbs, a sul...",Hayao Miyazaki,"[Daveigh Chase, Suzanne Pleshette, Miyu Irino,..."
1,Witness for the Prosecution,1957,A veteran British barrister must defend his cl...,Billy Wilder,"[Tyrone Power, Marlene Dietrich, Charles Laugh..."
2,Witness for the Prosecution,1957,A veteran British barrister must defend his cl...,Billy Wilder,"[Tyrone Power, Marlene Dietrich, Charles Laugh..."
3,Aliens,1986,"Decades after surviving the Nostromo incident,...",James Cameron,"[Sigourney Weaver, Michael Biehn, Carrie Henn,..."
4,The Pianist,2002,"During WWII, acclaimed Polish musician Wladysl...",Roman Polanski,"[Adrien Brody, Thomas Kretschmann, Frank Finla..."
5,The Lord of the Rings: The Return of the King,2003,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,"[Elijah Wood, Viggo Mortensen, Ian McKellen, O..."
6,Inglourious Basterds,2009,"In Nazi-occupied France during World War II, a...",Quentin Tarantino,"[Brad Pitt, Diane Kruger, Eli Roth, Mélanie La..."
7,Raiders of the Lost Ark,1981,"In 1936, archaeologist and adventurer Indiana ...",Steven Spielberg,"[Harrison Ford, Karen Allen, Paul Freeman, Joh..."
8,Once Upon a Time in the West,1968,A mysterious stranger with a harmonica joins f...,Sergio Leone,"[Henry Fonda, Charles Bronson, Claudia Cardina..."
9,Aliens,1986,"Decades after surviving the Nostromo incident,...",James Cameron,"[Sigourney Weaver, Michael Biehn, Carrie Henn,..."
