### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [2]:
# Page to scrape: the English Wikipedia article on space medicine
base_site = "https://en.wikipedia.org/wiki/Space_medicine"

# Fetch the page. requests applies no timeout by default, so pass one to keep
# the cell from hanging on a stalled connection. raise_for_status() stops the
# notebook here on a 4xx/5xx reply instead of silently parsing an error page.
response = requests.get(base_site, timeout=10)
response.raise_for_status()
response

<Response [200]>

In [3]:
# Keep the body of the response for parsing. `.content` is the payload as
# returned by the server, so the parser is left to handle the decoding.
html = response.content

### Making the soup

In [4]:
# Parse the downloaded markup into a BeautifulSoup tree so that elements can be
# looked up by tag name, attribute or class.
# "html.parser" is the parser that ships with Python, so nothing extra needs installing.
soup = BeautifulSoup(markup=html, features="html.parser")

### 1. Extract the titles of all links

In [5]:
# Collect every anchor (<a>) element in the document
links = soup.find_all(name='a')
links

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a href="/wiki/Wikipedia:Identifying_reliable_sources_(medicine)" title="Wikipedia:Identifying reliable sources (medicine)">medical references</a>,
 <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Primary_sources" title="Wikipedia:Primary sources">primary sources</a>,
 <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Space_medicine&amp;action=edit">add the appropriate references</a>,
 <a href="/wiki/Wikipedia:Verifiability#Burden_of_evidence" title="Wikipedia:Verifiability">removed</a>,
 <a class="external text" href="//www.google.com/search?as_eq=wikipedia&amp;q=%22Space+medicine%22" rel="nofollow">"Space medicine"</a>,
 <a class="external text" href="//www.google.com/search?tbm=nws&amp;q=%22Space+medicine%22+-wikipedia" rel="nof

In [7]:
# Keep only the anchors that carry an 'href' attribute. Tag.get() returns None
# for a missing attribute, so anchors such as <a id="top"></a> are dropped.
# `is not None` is the identity test PEP 8 recommends for comparisons with None.
clean_links = [link for link in links if link.get('href') is not None]
clean_links

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a href="/wiki/Wikipedia:Identifying_reliable_sources_(medicine)" title="Wikipedia:Identifying reliable sources (medicine)">medical references</a>,
 <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Primary_sources" title="Wikipedia:Primary sources">primary sources</a>,
 <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Space_medicine&amp;action=edit">add the appropriate references</a>,
 <a href="/wiki/Wikipedia:Verifiability#Burden_of_evidence" title="Wikipedia:Verifiability">removed</a>,
 <a class="external text" href="//www.google.com/search?as_eq=wikipedia&amp;q=%22Space+medicine%22" rel="nofollow">"Space medicine"</a>,
 <a class="external text" href="//www.google.com/search?tbm=nws&amp;q=%22Space+medicine%22+-wikipedia" rel="nofollow">news</a>,
 <

In [8]:
# Read the 'title' attribute of every link that has a URL.
# Links without a title contribute None to the list.
titles = [anchor.get('title') for anchor in clean_links]
titles

[None,
 None,
 'Wikipedia:Identifying reliable sources (medicine)',
 'Wikipedia:Verifiability',
 'Wikipedia:Primary sources',
 None,
 'Wikipedia:Verifiability',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'Enlarge',
 'Dan Burbank',
 'Anton Shkaplerov',
 'Destiny laboratory',
 'International Space Station',
 None,
 'Medicine',
 'Astronauts',
 'Outer space',
 'Astronautical hygiene',
 'Blindness',
 'Osteoporosis',
 'Human spaceflight',
 None,
 None,
 'NASA Office of Inspector General',
 'Effect of spaceflight on the human body',
 'Space exploration',
 'Human mission to Mars',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'Edit section: History',
 'Hubertus Strughold',
 'Nazi',
 'United States',
 'World War II',
 'Operation Paperclip',
 None,
 'School of Aviation Medicine (page d

In [10]:
# Drop the None placeholders left by links that have no 'title' attribute.
# `is not None` is the identity test PEP 8 recommends for comparisons with None.
clean_titles = [title for title in titles if title is not None]
clean_titles

['Wikipedia:Identifying reliable sources (medicine)',
 'Wikipedia:Verifiability',
 'Wikipedia:Primary sources',
 'Wikipedia:Verifiability',
 'Enlarge',
 'Dan Burbank',
 'Anton Shkaplerov',
 'Destiny laboratory',
 'International Space Station',
 'Medicine',
 'Astronauts',
 'Outer space',
 'Astronautical hygiene',
 'Blindness',
 'Osteoporosis',
 'Human spaceflight',
 'NASA Office of Inspector General',
 'Effect of spaceflight on the human body',
 'Space exploration',
 'Human mission to Mars',
 'Edit section: History',
 'Hubertus Strughold',
 'Nazi',
 'United States',
 'World War II',
 'Operation Paperclip',
 'School of Aviation Medicine (page does not exist)',
 'Randolph Air Force Base',
 'Texas',
 'Department of Space Medicine (page does not exist)',
 'United States Air Force School of Aerospace Medicine',
 'Pressure suit',
 'Aerospace Medical Association',
 'Nuremberg Trials',
 'Dachau concentration camp',
 'Edit section: Project Mercury',
 'Project Mercury',
 'Project Mercury',
 'Edit

### 2. Extract all heading 2 strings.

In [41]:
# Look at the raw <h2> elements to see how the headings are structured
soup.find_all(name='h2')

[<h2 id="mw-toc-heading">Contents</h2>,
 <h2><span class="mw-headline" id="History">History</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Space_medicine&amp;action=edit&amp;section=1" title="Edit section: History">edit</a><span class="mw-editsection-bracket">]</span></span></h2>,
 <h2><span class="mw-headline" id="Effects_of_space-travel">Effects of space-travel</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Space_medicine&amp;action=edit&amp;section=3" title="Edit section: Effects of space-travel">edit</a><span class="mw-editsection-bracket">]</span></span></h2>,
 <h2><span class="mw-headline" id="Spaceflight_analogues">Spaceflight analogues</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Space_medicine&amp;action=edit&amp;section=19" title="Edit section: Spaceflight analogues">edit</a><span class="mw-edi

In [42]:
# Read .string from every <h2>. BeautifulSoup defines .string as None when a tag
# has more than one child, which is the case for headings that wrap several
# <span> elements.
h2_strings = [heading.string for heading in soup.find_all('h2')]
h2_strings

['Contents',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'Navigation menu']

In [26]:
# Most of the <h2> strings came back as None because those tags wrap several
# child elements. The heading text itself sits in a <span class="mw-headline">,
# so look those spans up by class instead.
soup.find_all(attrs={'class': 'mw-headline'})

[<span class="mw-headline" id="History">History</span>,
 <span class="mw-headline" id="Project_Mercury">Project Mercury</span>,
 <span class="mw-headline" id="Effects_of_space-travel">Effects of space-travel</span>,
 <span class="mw-headline" id="Cardiac_rhythms">Cardiac rhythms</span>,
 <span class="mw-headline" id="Decompression_illness_in_spaceflight">Decompression illness in spaceflight</span>,
 <span class="mw-headline" id="Decompression_sickness">Decompression sickness</span>,
 <span class="mw-headline" id="Barotrauma">Barotrauma</span>,
 <span class="mw-headline" id="Decreased_immune_system_functioning">Decreased immune system functioning</span>,
 <span class="mw-headline" id="Increased_infection_risk">Increased infection risk</span>,
 <span class="mw-headline" id="Effects_of_fatigue">Effects of fatigue</span>,
 <span class="mw-headline" id="Loss_of_balance">Loss of balance</span>,
 <span class="mw-headline" id="Loss_of_bone_density">Loss of bone density</span>,
 <span class="mw

In [43]:
# Extract the text of each headline span
cl_strings = [span.string for span in soup.find_all(class_='mw-headline')]
cl_strings

['History',
 'Project Mercury',
 'Effects of space-travel',
 'Cardiac rhythms',
 'Decompression illness in spaceflight',
 'Decompression sickness',
 'Barotrauma',
 'Decreased immune system functioning',
 'Increased infection risk',
 'Effects of fatigue',
 'Loss of balance',
 'Loss of bone density',
 'Loss of muscle mass',
 'Loss of eyesight',
 "Loss of mental abilities and risk of Alzheimer's disease",
 'Orthostatic intolerance',
 'Radiation effects',
 'Sleep disorders',
 'Spaceflight analogues',
 'Space medicine careers',
 'Related degrees, areas of specialization, and certifications',
 'Space nursing',
 'Medicine in flight',
 'Ultrasound and space',
 'Space Shuttle era',
 'Future investigations',
 'Feasibility of Long Duration Space Flights',
 'Impact on science and medicine',
 'Pre-Mercury through Apollo',
 'Ultrasound microgravity',
 'See also',
 'References',
 'External links']

### 3. Print the whole footer text.

In [46]:
# By inspection, the footer is contained inside a <div> element with id="footer".
footer = soup.find('div', id='footer')

# find() returns None when nothing matches. Fail with a clear message instead of
# an AttributeError on `.text` if the page no longer has this element.
if footer is None:
    raise ValueError("No <div id='footer'> element found on the page")

print(footer.text)



 This page was last edited on 5 April 2020, at 15:56 (UTC).
Text is available under the Creative Commons Attribution-ShareAlike License;
additional terms may apply.  By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.


Privacy policy
About Wikipedia
Disclaimers
Contact Wikipedia
Developers
Statistics
Cookie statement
Mobile view







