In [2]:
# Importing relevant libraries
import requests as req
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [3]:
# Link to the Wikipedia page with the data required
link = "https://en.wikipedia.org/wiki/Mountain_pigeon"

# Request settings.
# - A descriptive User-Agent identifies this script to the server instead of
#   the generic library default (replace the text with your own project name
#   and contact details before running this at any scale).
# - The timeout makes a stalled connection fail instead of blocking the kernel
#   forever; `requests` applies no timeout unless one is passed explicitly.
REQUEST_HEADERS = {
    "User-Agent": "mountain-pigeon-notebook/1.0 (educational scraping example)"
}
REQUEST_TIMEOUT_SECONDS = 10

# Connect to the HTML page via the link and get content from the HTML page
response = req.get(link, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT_SECONDS)
if response.status_code != 200:
    raise Exception(f"Failed to load page: {response.status_code}")

# Parse the downloaded markup with the standard-library HTML parser backend
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
def get_article_titles(soup):
    """Collect and print the text of every ``<h2>`` element in the page.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        list[str]: Whitespace-stripped ``<h2>`` texts, in the order
        ``find_all`` returned them.
    """
    titles = []
    for h2_tag in soup.find_all('h2'):
        titles.append(h2_tag.text.strip())

    # Emit the header and one bullet per title as a single write
    report_lines = ["Article Titles:"]
    report_lines.extend(f"- {entry}" for entry in titles)
    print("\n".join(report_lines))
    return titles

In [5]:
def _css_classes(tag):
    """Return the tag's ``class`` attribute as a list (empty when absent)."""
    classes = tag.get('class') or []
    if isinstance(classes, str):
        classes = classes.split()
    return classes


def _heading_anchor(heading_tag):
    """Return the element whose following siblings hold the section body.

    When the heading sits inside a ``<div class="mw-heading ...">`` wrapper,
    the section's paragraphs follow the wrapper rather than the heading tag,
    so the wrapper is returned. Otherwise the heading tag itself is returned.
    """
    wrapper = heading_tag.parent
    if (
        wrapper is not None
        and wrapper.name == 'div'
        and 'mw-heading' in _css_classes(wrapper)
    ):
        return wrapper
    return heading_tag


def _is_section_boundary(tag):
    """Tell whether ``tag`` starts a new level-2 or level-3 section."""
    if tag.name in ['h2', 'h3']:
        return True
    if tag.name == 'div':
        classes = _css_classes(tag)
        return 'mw-heading2' in classes or 'mw-heading3' in classes
    return False


def get_article_text(soup):
    """Extract the paragraph text that follows each ``<h2>``/``<h3>`` heading.

    Only headings inside ``<div id="bodyContent">`` are considered. For each
    heading, the text of the following sibling ``<p>`` elements is collected
    until the next level-2/3 heading. Both bare headings and headings wrapped
    in ``<div class="mw-heading mw-headingN">`` are handled. A preview of each
    section (first 100 characters) is printed.

    Args:
        soup: Parsed document exposing ``find``.

    Returns:
        dict[str, str]: Heading text mapped to its paragraphs joined with a
        single space. Empty when the ``bodyContent`` div is not found. When
        two headings share the same text, the later one overwrites the
        earlier entry.
    """
    content = {}
    main_content = soup.find('div', {'id': 'bodyContent'})

    if main_content:
        for section in main_content.find_all(['h2', 'h3']):
            heading = section.text.strip()
            paragraphs = []
            # Walk siblings of the wrapper div when there is one; the heading
            # tag alone has no paragraph siblings in that markup.
            for sibling in _heading_anchor(section).find_next_siblings():
                if _is_section_boundary(sibling):
                    break
                if sibling.name == 'p':
                    paragraphs.append(sibling.text.strip())
            content[heading] = ' '.join(paragraphs)
            print(f"\nHeading: {heading}")
            print(f"Content: {content[heading][:100]}...")  # Display first 100 characters
    return content

In [6]:
def extract_wikipedia_links(soup):
    """Collect and print absolute URLs of internal ``/wiki/`` article links.

    Every ``<a>`` element with an ``href`` is inspected. An href is kept when
    it starts with ``/wiki/`` and contains no ``:`` (which filters out
    namespaced targets such as files). Kept hrefs are resolved against
    ``https://en.wikipedia.org``.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        set[str]: The unique absolute URLs found.
    """
    links = set()
    base_url = "https://en.wikipedia.org"
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('/wiki/') and ':' not in href:  # Exclude special links like files
            links.add(urljoin(base_url, href))

    print("\nWikipedia Links:")
    # Set iteration order for strings varies between interpreter runs, so sort
    # before printing to keep the notebook output reproducible.
    for link in sorted(links):
        print(f"- {link}")
    return links

In [7]:
def scrape_wikipedia_page(soup):
    """Run every extractor on ``soup`` and bundle the results in one dict.

    The extractors run in a fixed order (titles, text by heading, links), and
    each one prints its own report as it runs.

    Args:
        soup: Parsed document passed unchanged to each extractor.

    Returns:
        dict: Keys ``"title"``, ``"text_by_heading"`` and
        ``"wikipedia_links"`` holding the respective extractor's return value.
    """
    print("\nScraping Wikipedia Page...")

    scraped = {}
    scraped["title"] = get_article_titles(soup)
    scraped["text_by_heading"] = get_article_text(soup)
    scraped["wikipedia_links"] = extract_wikipedia_links(soup)
    return scraped

In [None]:
# Example usage: run all extractors on the parsed page, then dump the result
scraped_data = scrape_wikipedia_page(soup)
print("\nScraped Data:", scraped_data, sep="\n")