### Importing the packages

In [None]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a get request

In [None]:
# Defining the url of the site (the Wikipedia article that will be scraped)
base_site = "https://en.wikipedia.org/wiki/Space_medicine"

# Making a get request to download the page
response = requests.get(base_site)
# Showing the response object as the cell output to inspect the outcome of the request
response

In [None]:
# Extracting the HTML: response.content is the raw body of the response
html = response.content

### Making the soup

In [None]:
# Convert HTML to a BeautifulSoup object. This will allow us to parse out content from the HTML more easily.
# Using "html.parser", the parser that ships with Python, so no extra package has to be installed
soup = BeautifulSoup(html, "html.parser")

### 1. Extract all existing titles of links

In [None]:
# Find all links (<a> tags) on the page
links = soup.find_all('a')
# Show them as the cell output
links

In [None]:
# Dropping the links without an 'href' attribute, i.e. links without a URL.
# Tag.get returns None when the attribute is missing; None is a singleton,
# so it is compared by identity (is not None) rather than with !=.

clean_links = [link for link in links if link.get('href') is not None]
clean_links

In [None]:
# Collect the 'title' attribute of every link that has a URL.
# A link without a title contributes None to the list.

titles = [link.get('title') for link in clean_links]
titles

In [None]:
# Removing the missing titles (None entries).
# None is a singleton, so it is compared by identity (is not None) rather than with !=.

clean_titles = [title for title in titles if title is not None]
clean_titles

### 2. Extract all heading 2 strings.

In [None]:
# Inspect all h2 tags (the level-2 headings of the page)
soup.find_all('h2')

In [None]:
# Try to read the text of each h2 tag through its .string attribute
# (.string is None whenever a tag has more than one child)
h2_strings = [heading.string for heading in soup.find_all('h2')]
h2_strings

In [None]:
# Oh no! Why are most of the values listed as None? :(
# OK, let's check the 'mw-headline' span class instead
soup.find_all(class_ = 'mw-headline')

In [73]:
# Read the heading text from every element that carries the 'mw-headline' class
cl_strings = [headline.string for headline in soup.find_all(class_='mw-headline')]
cl_strings

['History',
 'Project Mercury',
 'Effects of space-travel',
 'Cardiac rhythms',
 'Decompression illness in spaceflight',
 'Decompression sickness',
 'Barotrauma',
 'Decreased immune system functioning',
 'Increased infection risk',
 'Effects of fatigue',
 'Loss of balance',
 'Loss of bone density',
 'Loss of muscle mass',
 'Loss of eyesight',
 "Loss of mental abilities and risk of Alzheimer's disease",
 'Orthostatic intolerance',
 'Radiation effects',
 'Sleep disorders',
 'Spaceflight analogues',
 'Space medicine careers',
 'Related degrees, areas of specialization, and certifications',
 'Space nursing',
 'Medicine in flight',
 'Ultrasound and space',
 'Space Shuttle era',
 'Future investigations',
 'Feasibility of Long Duration Space Flights',
 'Impact on science and medicine',
 'Pre-Mercury through Apollo',
 'Ultrasound microgravity',
 'See also',
 'References',
 'External links']

# Extracting the link data from the main page

In [None]:
# Next: extract all links marked as 'Main article:' or 'See also:'.
# (Inspect the page in the browser first to see which tag contains these links.)
# Here we select the <div> elements whose role attribute is "note".

div_notes = soup.find_all("div", attrs={"role": "note"})
div_notes

In [None]:
# Inspect the first note
div_notes[0]

In [None]:
# Locate the link inside the tag (find returns only the first matching <a>)

div_notes[0].find('a')

In [None]:
# Naive approach: take only the first link of every note

div_links = [note.find('a') for note in div_notes]
div_links

In [None]:
# Check how many links the naive approach returned (one per note)

len(div_links)

In [None]:
# Check whether a div can contain more than one link (this is often the case)

div_notes[6]

In [None]:
# List every link inside the div inspected above to see how many it has

div_notes[6].find_all('a')

In [None]:
# find('a') only returns the first link of a note, so find_all is needed
# to make sure every link is captured.
# The nested comprehension walks through every note and then through every
# anchor inside it, producing one flat list that holds all the links.

div_links = [anchor for div in div_notes for anchor in div.find_all('a')]

In [None]:
# Inspect the complete list of links
div_links

In [None]:
# Recheck how many links we get now that every link of every note is included

len(div_links)

In [None]:
# Let's get the complete URLs.
# urljoin resolves each href against base_site, so a relative address
# is turned into an absolute URL.

from urllib.parse import urljoin

# Anchors with a missing or empty href are skipped: for those, urljoin would
# silently return base_site itself and the main page would end up in the list.
note_urls = [
    urljoin(base_site, link.get('href'))
    for link in div_links
    if link.get('href')
]
note_urls

In [None]:
# Check how many URLs we get
len(note_urls)

# Extracting all text from the note URLs

In [None]:
# Now that I have all the note URLs, the next step is to look at the text inside those pages

note_urls

In [None]:
# To do that, extract all text contained in a paragraph element,
# for all paragraphs on a page,
# for all pages in note_urls.
#
# par_text ends up with exactly one entry per URL in note_urls, in the same
# order: the list of paragraph strings for a page that was fetched
# successfully, or an empty list for a page that had to be skipped. Keeping
# both lists the same length means they can safely be paired by position later.

# Prepare the list to store paragraph text for each webpage
par_text = []

# enumerate(..., start=1) supplies the 1-based position used in the progress
# messages, so no hand-maintained loop counter is needed
for i, url in enumerate(note_urls, start=1):

    # connect to the webpage
    note_resp = requests.get(url)

    # if something is wrong, report it and store an empty placeholder so that
    # par_text stays aligned with note_urls
    if note_resp.status_code != 200:
        print('Status code {0}: Skipping URL #{1}: {2}'.format(note_resp.status_code, i, url))
        par_text.append([])
        continue

    # if everything is OK, print out the number of the iteration and the URL
    # to keep track of the place in the loop
    print('URL #{0}: {1}'.format(i, url))

    # get HTML from webpage
    note_html = note_resp.content

    # convert HTML to BeautifulSoup object, using the same built-in parser as
    # for the main page so that no extra package (lxml) has to be installed
    note_soup = BeautifulSoup(note_html, 'html.parser')

    # find all "p" tags on the webpage
    note_pars = note_soup.find_all("p")

    # get the text from each "p" tag
    text = [p.text for p in note_pars]

    # store the paragraphs of this page as one entry of par_text
    par_text.append(text)

In [None]:
# Inspect the result for the first page (a list of paragraph strings)
par_text[0]

In [None]:
# par_text holds, for every page, a list of paragraph strings.
# Let's have the text of each page as one string instead of a list of strings.

# Merge the paragraphs of every page into one long string per page
page_text = ["".join(paragraphs) for paragraphs in par_text]

# Inspect the result for the first webpage
page_text[0]

In [None]:
# Create a dictionary with the (key, value) pairs being (url, text).
# zip pairs items purely by position and silently stops at the shorter input,
# so first make sure there is exactly one text per URL; otherwise texts could
# end up attached to the wrong URLs without any warning.
if len(note_urls) != len(page_text):
    raise ValueError(
        'note_urls has {0} entries but page_text has {1}; '
        'they cannot be paired by position'.format(len(note_urls), len(page_text))
    )

url_to_text = dict(zip(note_urls, page_text))

In [72]:
# Print the text collected for one of the pages, looked up by its URL
print(url_to_text['https://en.wikipedia.org/wiki/Project_Mercury'])


Project Mercury was the first human spaceflight program of the United States, running from 1958 through 1963.  An early highlight of the Space Race, its goal was to put a man into Earth orbit and return him safely, ideally before the Soviet Union.  Taken over from the US Air Force by the newly created civilian space agency NASA, it conducted twenty uncrewed developmental flights (some using animals), and six successful flights by astronauts. The program, which took its name from Roman mythology, cost $2.25 billion adjusted for inflation.[1][n 2] The astronauts were collectively known as the "Mercury Seven", and each spacecraft was given a name ending with a "7" by its pilot.
The Space Race began with the 1957 launch of the Soviet satellite Sputnik 1.  This came as a shock to the American public, and led to the creation of NASA to expedite existing US space exploration efforts, and place most of them under civilian control. After the successful launch of the Explorer 1 satellite in 195