In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:

# Write a function to Get and parse html content from a Wikipedia page
def get_html_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        return None
#  Write a function to Extract article title
def extract_title(soup):
    title = soup.find('h1', {'id': 'firstHeading'}).text
    return title

# Write a function to Extract article text for each paragraph with their respective
def extract_paragraphs_with_headings(soup):
    content = soup.find('div', {'class': 'mw-parser-output'})
    result = {}
    current_heading = None

    for element in content.children:
        if element.name and element.name.startswith('h') and element.name != 'h1':
            current_heading = element.get_text(strip=True)
            result[current_heading] = []
        elif element.name == 'p':
            if current_heading:
                result[current_heading].append(element.get_text(strip=True))

    return result

# Write a function to Extract article text for each paragraph with their respective headings. Map those headings to their respective paragraphs in the dictionary.
def collect_internal_links(soup):
    internal_links = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/wiki/') and ':' not in href:
            internal_links.add('https://en.wikipedia.org' + href)
    return list(internal_links)

# Write a function to collect every link that redirects to another Wikipedia pag
def scrape_wikipedia_page(url):
    soup = get_html_content(url)
    if soup is None:
        return None

    title = extract_title(soup)
    paragraphs_with_headings = extract_paragraphs_with_headings(soup)
    internal_links = collect_internal_links(soup)

    return {
        'title': title,
        'paragraphs_with_headings': paragraphs_with_headings,
        'internal_links': internal_links
    }

# Wrap all the previous functions into a single function that takes as parameters a Wikipedia link

# Test the last function on a Wikipedia page of your choice

url = 'https://en.wikipedia.org/wiki/European_Cup_and_UEFA_Champions_League_records_and_statistics'
data = scrape_wikipedia_page(url)

if data:
    print("Title:", data['title'])
    print("\nParagraphs with Headings:")
    for heading, paragraphs in data['paragraphs_with_headings'].items():
        print(f"\n{heading}")
        for paragraph in paragraphs:
            print(paragraph)
    print("\nInternal Links:")
    for link in data['internal_links']:
        print(link)
else:
    print("Failed to retrieve the content")

Title: European Cup and UEFA Champions League records and statistics

Paragraphs with Headings:

Internal Links:
https://en.wikipedia.org/wiki/1956_European_Cup_Final
https://en.wikipedia.org/wiki/Shutout#Association_football
https://en.wikipedia.org/wiki/Hertha_BSC
https://en.wikipedia.org/wiki/1994_UEFA_Champions_League_Final
https://en.wikipedia.org/wiki/FC_Twente
https://en.wikipedia.org/wiki/Ukrainian_Premier_League
https://en.wikipedia.org/wiki/2006%E2%80%9307_UEFA_Champions_League_group_stage#Group_A
https://en.wikipedia.org/wiki/King_Baudouin_Stadium
https://en.wikipedia.org/wiki/2023%E2%80%9324_UEFA_Champions_League
https://en.wikipedia.org/wiki/2005%E2%80%9306_Serie_A
https://en.wikipedia.org/wiki/FC_Sachsen_Leipzig
https://en.wikipedia.org/wiki/2015_UEFA_Champions_League_Final
https://en.wikipedia.org/wiki/Wim_Kieft
https://en.wikipedia.org/wiki/West_Germany
https://en.wikipedia.org/wiki/2000_DFB-Ligapokal
https://en.wikipedia.org/wiki/S.S._Lazio
https://en.wikipedia.org/wik