# Objective
Scraping text from a Wikipedia page using Beautiful Soup

**Instructions**
After watching the video below, you will be able to:

[Scraping Wikipedia with Python and Beautiful Soup](https://www.youtube.com/watch?v=YY5skv756pc)

1. **Write a function to get and parse HTML content from a Wikipedia page.**

2. **Write a function to extract the article title.**

3. **Write a function to extract the text of each paragraph together with its heading, and map each heading to its paragraphs in a dictionary.**

4. **Write a function to collect every link that redirects to another Wikipedia page.**

5. **Wrap all the previous functions into a single function that takes a Wikipedia link as a parameter.**

6. **Test the final function on a Wikipedia page of your choice.**

In [None]:
import requests
from bs4 import BeautifulSoup

### 1. Write a function to Get and parse HTML content from a Wikipedia page

In [None]:
def get_html_content(url, timeout=10):
    """Download a page and return its HTML as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string when the server answers with status
        200; ``None`` for any other status or when the request itself fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts and malformed URLs are reported the
        # same way as a non-200 status so callers have one failure path.
        return None

    if response.status_code == 200:
        return response.text
    return None

### 2. Write a function to Extract article title

In [None]:
def extract_title(html_content):
    """Return the article title found in the page's ``<h1 id="firstHeading">``.

    Args:
        html_content: HTML source of the page as a string.

    Returns:
        The text of the heading element, or ``None`` when the page has no
        ``<h1>`` with id ``firstHeading``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    heading = soup.find('h1', id='firstHeading')
    if heading is None:
        # find() yields None on a miss; reading .text from it would raise.
        return None
    return heading.text

### 3. Write a function to Extract article text for each paragraph with their respective headings

In [None]:
def extract_paragraphs_with_headings(html_content):
    """Group paragraph text under the ``<h2>`` heading that precedes it.

    Only elements inside the first ``<div class="mw-parser-output">`` are
    considered. Paragraphs that appear before the first ``<h2>`` are not
    included, and paragraphs whose stripped text is empty are skipped.

    Args:
        html_content: HTML source of the page as a string.

    Returns:
        A dict mapping each heading's stripped text to the list of stripped
        paragraph texts that follow it, in document order. The dict is empty
        when the content div is missing.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    content = soup.find('div', class_='mw-parser-output')
    sections = {}
    if content is None:
        # No content container on this page: nothing to extract.
        return sections

    current_heading = None
    for element in content.find_all(['h2', 'p']):
        text = element.text.strip()
        if element.name == 'h2':
            current_heading = text
        elif current_heading and text:
            # Blank placeholder paragraphs would only add "" entries.
            sections.setdefault(current_heading, []).append(text)
    return sections

### 4. Write a function to collect every link that redirects to another Wikipedia page

In [None]:
def extract_wikipedia_links(html_content, base_url='https://en.wikipedia.org'):
    """Collect absolute URLs for every internal ``/wiki/`` link on the page.

    Args:
        html_content: HTML source of the page as a string.
        base_url: Scheme and host prepended to each relative ``/wiki/...``
            href. Defaults to English Wikipedia; pass another edition's host
            (e.g. ``'https://fr.wikipedia.org'``) when scraping that edition.

    Returns:
        A list of absolute URLs in document order. Links that occur several
        times on the page appear several times in the list.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    return [
        base_url + anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('/wiki/')  # site-relative article paths only
    ]

### 5. Wrap all the previous functions into a single function that takes a Wikipedia link as a parameter

In [None]:
def process_wikipedia_page(url):
    """Fetch a Wikipedia page and run every extractor on it.

    Args:
        url: Address of the Wikipedia page to process.

    Returns:
        A dict with the keys ``'title'``, ``'sections'`` and ``'links'``
        holding the results of the three extract functions, or ``None`` when
        ``get_html_content`` returns nothing for the URL.
    """
    page_html = get_html_content(url)
    if not page_html:
        # Nothing was downloaded, so there is nothing to parse.
        return None

    extracted = {}
    extracted['title'] = extract_title(page_html)
    extracted['sections'] = extract_paragraphs_with_headings(page_html)
    extracted['links'] = extract_wikipedia_links(page_html)
    return extracted

### 6. Test the last function on a Wikipedia page of your choice

In [None]:
# Page used to try out the full pipeline.
url = 'https://en.wikipedia.org/wiki/Web_scraping'
result = process_wikipedia_page(url)

# Report the failure first, otherwise dump everything that was extracted.
if not result:
    print("Failed to retrieve content from the Wikipedia page.")
else:
    print("Title:", result['title'])

    # Each heading is followed by its paragraphs and a blank separator line.
    print("\nSections:")
    for section_name, section_texts in result['sections'].items():
        print(section_name)
        for text in section_texts:
            print(text)
        print()

    # One absolute URL per line.
    print("\nLinks:")
    for page_link in result['links']:
        print(page_link)
