<a href="https://colab.research.google.com/github/Vhyvean/alx-pre_course/blob/master/Web_Scraping_Wikipedia_Checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests # Import the requests library for making HTTP requests
from bs4 import BeautifulSoup # Import BeautifulSoup from the bs4 library for parsing HTML

# 1. Function to get and parse HTML content from a wikipedia page
def get_html_content(url, timeout=10):
    """
    Fetches and parses the HTML content of a given Wikipedia page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would block indefinitely on a stalled connection.

    Returns:
        A BeautifulSoup object representing the parsed HTML.

    Raises:
        Exception: If the server answers with any status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Send a browser-like User-Agent along with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:  # 200 means the request was successful
        raise Exception(f"Failed to fetch the page. Status code: {response.status_code}")
    # Parse with the parser that ships with Python, so no extra dependency is needed.
    return BeautifulSoup(response.text, 'html.parser')

# 2. Function to extract the article title
def extract_article_title(soup):
    """
    Returns the article title found in the parsed page.

    The title is looked up as the <h1> element with id="firstHeading".
    When no such element exists, the string "No title found" is returned.
    """
    heading_tag = soup.find('h1', id="firstHeading")
    if not heading_tag:
        return "No title found"
    # Strip surrounding whitespace so callers get the bare title text.
    return heading_tag.text.strip()

# 3. Function to extract article texts with their headings
def extract_texts_with_headings(soup):
    """
    Extracts all paragraphs and maps them to their nearest preceding heading.

    Paragraphs that appear before the first heading are stored under
    "Introduction". If the same heading text occurs more than once, the
    paragraphs of every occurrence are collected in one list instead of the
    later occurrence wiping out the earlier one.

    Returns a dictionary: {heading: [list of paragraphs]}. An empty dictionary
    is returned when the div with id="mw-content-text" is not present.
    """
    content = soup.find('div', {"id": "mw-content-text"})
    if not content:
        return {}

    current_heading = "Introduction"
    data = {current_heading: []}

    # find_all yields the tags in document order, so every paragraph is
    # attributed to the most recent heading seen before it.
    for tag in content.find_all(['h2', 'h3', 'p'], recursive=True):
        if tag.name in ('h2', 'h3'):
            current_heading = tag.text.replace("[edit]", "").strip()
            # setdefault keeps paragraphs already stored under a repeated heading.
            data.setdefault(current_heading, [])
        elif tag.name == 'p':
            paragraph_text = tag.text.strip()
            if paragraph_text:  # skip paragraphs that contain only whitespace
                data[current_heading].append(paragraph_text)

    return data

# 4. Function to collect internal Wikipedia links
def extract_internal_links(soup):
    """
    Extracts all internal Wikipedia article links.

    A link qualifies when its href starts with '/wiki/' and contains no ':'
    (hrefs such as '/wiki/File:...' or '/wiki/Help:...' are skipped).

    Returns a list of unique full URLs in the order they first appear in the
    document, so the result is identical from run to run.
    """
    base_url = "https://en.wikipedia.org"

    links = [
        base_url + a_tag['href']
        for a_tag in soup.find_all('a', href=True)
        if a_tag['href'].startswith('/wiki/') and ':' not in a_tag['href']
    ]

    # dict.fromkeys removes duplicates while keeping first-seen order;
    # a plain set would return the links in an arbitrary order.
    return list(dict.fromkeys(links))

# 5. Function to consolidate all the above functions
def extract_wikipedia_data(url):
    """
    Runs the full extraction pipeline for one Wikipedia page.

    The page is fetched and parsed once, and the parsed document is then
    handed to each extractor in turn.

    Returns a dictionary with three keys:
    - 'Title': the article title
    - 'Content': the paragraphs grouped by heading
    - 'Internal Links': the internal Wikipedia links
    """
    parsed_page = get_html_content(url)
    return {
        'Title': extract_article_title(parsed_page),
        'Content': extract_texts_with_headings(parsed_page),
        'Internal Links': extract_internal_links(parsed_page),
    }

# 6. Test the final function on a Wikipedia page
if __name__ == "__main__":
    # Demo run: scrape one example article and print a short summary of it.
    test_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    data = extract_wikipedia_data(test_url)

    print("Article Title:")
    print(data["Title"])

    print("\nHeadings and Paragraphs:")
    for section_name, section_paragraphs in data["Content"].items():
        print(f"\n{section_name}:")
        # Show at most two paragraphs per heading, each cut to 150 characters.
        for paragraph in section_paragraphs[:2]:
            preview = paragraph[:150]
            print(f"  - {preview}...")

    print("\nNumber of Internal Links Found:", len(data["Internal Links"]))

Article Title:
Python (programming language)

Headings and Paragraphs:

Introduction:
  - Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentatio...
  - Python is dynamically type-checked and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), ...

History:
  - Python was conceived in the late 1980s[41] by Guido van Rossum at Centrum Wiskunde & Informatica (CWI) in the Netherlands.[42] It was designed as a su...
  - The name Python derives from the British comedy series Monty Python's Flying Circus.[47] (See § Naming.)...

Design philosophy and features:
  - Python is a multi-paradigm programming language. Object-oriented programming and structured programming are fully supported, and many of their feature...
  - Python uses dynamic typing and a combination of reference counting and a cycle-detecting garbage collector for memory manag