# WEBSCRAPING CHECKPOINT

**Scraping text from Wikipedia  website using Beautiful Soup**


**Instructions**

1. Write a function to Get and parse html content from a Wikipedia page

2. Write a function to Extract article title

3. Write a function to Extract article text for each paragraph with their respective headings. Map those headings to their respective paragraphs in the dictionary.

4. Write a function to collect every link that redirects to another Wikipedia page

5. Wrap all the previous functions into a single function that takes as parameters a Wikipedia link

6. Test the last function on a Wikipedia page of your choice

In [25]:
import requests 
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup 

In [9]:
#Write a function to Get and parse html content from a Wikipedia page

def Content_parser(url):
    page = requests.get(url)
    if page.status_code == 200:
        soup = BeautifulSoup(page.text, 'html.parser')
        print("Successfully fetched and parsed the webpage!")
    else:
        print("Failed to fetch the webpage.")
    

In [11]:
#Write a function to Extract article title

def Content_extractor(url):
    titles = soup.find_all('title')
    for title in titles:
        print(title.get_text())
    

In [19]:
#Write a function to Extract article text for each paragraph with their respective headings.
#Map those headings to their respective paragraphs in the dictionary.

def scrape_headings_and_paragraphs(url):
    mapped_content = {}
    current_heading = None
    
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    for element in soup.find_all(heading_tags + ['p']):
        if element.name in heading_tags:
            current_heading = element.get_text(strip=True)
            mapped_content[current_heading] = []
        elif element.name == 'p' and current_heading:
            mapped_content[current_heading].append(element.get_text(strip=True))
    
    for heading in mapped_content:
        mapped_content[heading] = ' '.join(mapped_content[heading])
        
    for heading, paragraph in mapped_content.items():
        print(f"{heading}\n{paragraph}\n")
    
    return mapped_content

In [26]:
#Write a function to collect every link that redirects to another Wikipedia page

def collect_wikipedia_links(url):
    
    # Base URL for Wikipedia
    base_url = "https://en.wikipedia.org"
    
    # Dictionary to store links and their full URLs
    links = {}
    
    # Find all anchor tags
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Check if the link is an internal Wikipedia link
        if href.startswith("/wiki/") and not ':' in href:
            full_url = urljoin(base_url, href)
            links[link.get_text(strip=True)] = full_url
    
    # Print the collected links
    for text, link in links.items():
        print(f"{text}: {link}")
    
    return links

In [29]:
#Wrap all the previous functions into a single function that takes as parameters a Wikipedia link
def web_scrapper(url): 
    def Content_parser(url):
        page = requests.get(url)
        if page.status_code == 200:
            return BeautifulSoup(page.text, 'html.parser')
        else:
            print("Failed to fetch the webpage.")
            return None
        
    def Content_extractor(url):
        titles = soup.find_all('title')
        for title in titles:
            print(title.get_text())
        
        
    def scrape_headings_and_paragraphs(url):
        mapped_content = {}
        current_heading = None
    
        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

        for element in soup.find_all(heading_tags + ['p']):
            if element.name in heading_tags:
                current_heading = element.get_text(strip=True)
                mapped_content[current_heading] = []
            elif element.name == 'p' and current_heading:
                mapped_content[current_heading].append(element.get_text(strip=True))
    
        for heading in mapped_content:
            mapped_content[heading] = ' '.join(mapped_content[heading])
        
        for heading, paragraph in mapped_content.items():
            print(f"{heading}\n{paragraph}\n")
    
        return mapped_content


    def collect_wikipedia_links(url):
        base_url = "https://en.wikipedia.org"
        
        links = {}
        for link in soup.find_all('a', href=True):
            href = link['href']
       
            if href.startswith("/wiki/") and not ':' in href:
                full_url = urljoin(base_url, href)
                links[link.get_text(strip=True)] = full_url
                
        for text, link in links.items():
            print(f"{text}: {link}")
    
        return links
    
    # Main logic
    soup = Content_parser(url)
    if soup:
        Content_extractor(soup)
        scrape_headings_and_paragraphs(soup)
        collect_wikipedia_links(soup)

In [30]:
#Test the last function on a Wikipedia page of your choice
web_scrapper("https://en.wikipedia.org/wiki/Cancer")

Cancer - Wikipedia
Contents


Cancer
 Canceris a group of diseases involvingabnormal cell growthwith the potential toinvadeorspreadto other parts of the body.[2][7]These contrast withbenign tumors, which do not spread.[7]Possiblesigns and symptomsinclude a lump, abnormal bleeding, prolonged cough, unexplained weight loss, and a change inbowel movements.[1]While these symptoms may indicate cancer, they can also have other causes.[1]Over100 typesof cancers affect humans.[7]

Causes
The majority of cancers, some 90–95% of cases, are due to genetic mutations from environmental and lifestyle factors.[3]The remaining 5–10% are due toinherited genetics.[3]Environmentalrefers to any cause that is notinherited, such as lifestyle, economic, and behavioral factors and not merely pollution.[43]Common environmental factors that contribute to cancer death include tobacco use (25–30%), diet andobesity(30–35%), infections (15–20%),radiation(both ionizing and non-ionizing, up to 10%), lack ofphysical a