### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [2]:
# Defining the url of the site
base_site = "https://en.wikipedia.org/wiki/Music"

# Making a GET request.
# An explicit timeout (in seconds) is passed because requests applies none by default,
# so an unresponsive server would otherwise leave this cell hanging indefinitely.
response = requests.get(base_site, timeout=10)

# Stop here with an HTTPError on a 4xx/5xx reply, so the cells below never parse an error page
response.raise_for_status()
response

<Response [200]>

In [3]:
# Extracting the HTML: keep the body of the response in its own variable.
# It is the input handed to BeautifulSoup in the next step.
html = response.content

### Making the soup

In [4]:
# Parse the downloaded HTML into a BeautifulSoup object, which lets us search the page by tag and attribute.
# "html.parser" is the parser that ships with Python, so nothing extra has to be installed.
soup = BeautifulSoup(html, features="html.parser")

### 1. Extract all existing titles of links

In [5]:
# Collect every anchor ('a') tag found in the parsed page
links = soup.find_all(name='a')
links

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-hei

In [6]:
# Notice that some links have no URL: .get('href') returns None for an 'a' tag
# without an href attribute (e.g. the first tag shown above, <a id="top"></a>).

# Dropping the links without href attribute.
# 'is not None' is the idiomatic identity check for None (PEP 8).
clean_links = [link for link in links if link.get('href') is not None]
clean_links

[<a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a class="image" href="/wiki/File:Music_lesson_Staatliche_Antikensammlungen_2421.jpg"><img alt="Music lesson Staatliche Antikensammlungen 2421.jpg" data-file-height="1849" data-fil

In [7]:
# Read the 'title' attribute of each remaining link with a list comprehension;
# a link that has no title contributes None to the list
titles = [anchor.get('title') for anchor in clean_links]
titles

['This article is semi-protected.',
 None,
 None,
 'Music (disambiguation)',
 None,
 'Paleolithic',
 'Performing arts',
 'Acrobatics',
 'Ballet',
 'List of circus skills',
 'Clown',
 'Dance',
 'Gymnastics',
 'Magic (illusion)',
 'Mime artist',
 'Opera',
 'Professional wrestling',
 'Puppetry',
 'Public speaking',
 'Theatre',
 'Ventriloquism',
 'Template:Performing arts',
 'Template talk:Performing arts',
 None,
 'Culture',
 'Definitions of music',
 'Pitch (music)',
 'Melody',
 'Harmony',
 'Rhythm',
 'Tempo',
 'Meter (music)',
 'Articulation (music)',
 'Dynamics (music)',
 'Timbre',
 'Texture (music)',
 'Music genre',
 'Musical instrument',
 'Rapping',
 'Instrumental music',
 'A capella',
 'Accompaniment',
 'Greek language',
 'wikt:μουσική',
 'Muse',
 None,
 'Glossary of musical terminology',
 'Culture',
 'Song',
 'Symphony',
 'Music criticism',
 'Musicology',
 'Aesthetics of music',
 'Greek philosophy',
 'Ancient philosophy',
 'Harmony of the spheres',
 'John Cage',
 'Noise',
 None,
 'L

In [8]:
# Removing the 'None' titles, i.e. the entries that came from links without a title attribute.
# 'is not None' is the idiomatic identity check for None (PEP 8).
clean_titles = [title for title in titles if title is not None]
clean_titles

['This article is semi-protected.',
 'Music (disambiguation)',
 'Paleolithic',
 'Performing arts',
 'Acrobatics',
 'Ballet',
 'List of circus skills',
 'Clown',
 'Dance',
 'Gymnastics',
 'Magic (illusion)',
 'Mime artist',
 'Opera',
 'Professional wrestling',
 'Puppetry',
 'Public speaking',
 'Theatre',
 'Ventriloquism',
 'Template:Performing arts',
 'Template talk:Performing arts',
 'Culture',
 'Definitions of music',
 'Pitch (music)',
 'Melody',
 'Harmony',
 'Rhythm',
 'Tempo',
 'Meter (music)',
 'Articulation (music)',
 'Dynamics (music)',
 'Timbre',
 'Texture (music)',
 'Music genre',
 'Musical instrument',
 'Rapping',
 'Instrumental music',
 'A capella',
 'Accompaniment',
 'Greek language',
 'wikt:μουσική',
 'Muse',
 'Glossary of musical terminology',
 'Culture',
 'Song',
 'Symphony',
 'Music criticism',
 'Musicology',
 'Aesthetics of music',
 'Greek philosophy',
 'Ancient philosophy',
 'Harmony of the spheres',
 'John Cage',
 'Noise',
 'Ludwig van Beethoven',
 'Grosse Fuge',
 'St

### 2. Extract all heading 2 strings.

In [9]:
# Look at the raw h2 tags first, to see how the heading text is nested inside them
soup.find_all(name='h2')

[<h2>Contents</h2>,
 <h2><span class="mw-headline" id="Etymology">Etymology</span></h2>,
 <h2><span class="mw-headline" id="As_a_form_of_art_or_entertainment">As a form of art or entertainment</span></h2>,
 <h2><span class="mw-headline" id="Elements">Elements</span></h2>,
 <h2><span class="mw-headline" id="History">History</span></h2>,
 <h2><span class="mw-headline" id="Performance">Performance</span></h2>,
 <h2><span class="mw-headline" id="Philosophy_and_aesthetics">Philosophy and aesthetics</span></h2>,
 <h2><span class="mw-headline" id="Psychology">Psychology</span></h2>,
 <h2><span class="mw-headline" id="Sociological_aspects">Sociological aspects</span></h2>,
 <h2><span class="mw-headline" id="Media_and_technology">Media and technology</span></h2>,
 <h2><span class="mw-headline" id="Business">Business</span></h2>,
 <h2><span class="mw-headline" id="Education">Education</span></h2>,
 <h2><span class="mw-headline" id="Music_therapy">Music therapy</span></h2>,
 <h2><span class="mw-h

In [10]:
# Get the text of every h2 heading.
# get_text() is used instead of .string: .string evaluates to None as soon as a tag
# holds more than one child node, while get_text() returns the combined text of
# everything inside the tag. For the single-child headings inspected above the
# result is the same.
h2_strings = [heading.get_text() for heading in soup.find_all('h2')]
h2_strings

['Contents',
 'Etymology',
 'As a form of art or entertainment',
 'Elements',
 'History',
 'Performance',
 'Philosophy and aesthetics',
 'Psychology',
 'Sociological aspects',
 'Media and technology',
 'Business',
 'Education',
 'Music therapy',
 'See also',
 'References',
 'Further reading',
 'External links',
 'Navigation menu']

### 3. Print the whole footer text.

In [11]:
# By inspection: we see that the footer is contained inside a 'div' tag with id set to 'footer'
footer = soup.find('div', id='footer')

# find() returns None when nothing matches. Check for that explicitly so a page whose
# layout differs gives a clear message instead of an AttributeError on None.
if footer is None:
    raise ValueError("No <div id='footer'> element was found on the page")

print(footer.text)



 This page was last edited on 23 December 2019, at 04:25 (UTC).
Text is available under the Creative Commons Attribution-ShareAlike License;
additional terms may apply.  By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.


Privacy policy
About Wikipedia
Disclaimers
Contact Wikipedia
Developers
Statistics
Cookie statement
Mobile view



 

 



