Parse the data into a nested JSON structure in which each key is a subject and each value is the content scraped for that subject.

In [None]:
import requests
from bs4 import BeautifulSoup, Tag
from urllib.parse import urlparse


def fetch_content(url, timeout=30):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host, which would stall the whole scrape.

    Returns:
        The decoded response body (``response.text``). The HTTP status code
        is not checked, so error pages are returned like any other page.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

def parse_content(html):
    """Build and return a BeautifulSoup tree for *html* using 'html.parser'."""
    return BeautifulSoup(html, features='html.parser')

def is_valid_url(url):
    """Return True when *url* is an absolute http(s) URL that names a host.

    Only ``http`` and ``https`` are accepted because the result gates a
    ``requests.get`` call, which cannot fetch other schemes.

    Args:
        url: Candidate link. Anything that is not a ``str`` is rejected.

    Returns:
        ``True`` if the URL has an http/https scheme and a non-empty network
        location, ``False`` otherwise (including malformed input).
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed input, e.g. an unclosed IPv6 bracket.
        return False
    # urlparse lower-cases the scheme, so "HTTP://..." is accepted as well.
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)


def extract_summary(html):
    """Condense an HTML page into a single space-separated summary string.

    The summary consists of, in this order: the <title> text (if present),
    the text of every <h1>, and then, for every <h1> that has one, the text
    of the first <p> sibling that follows it.
    """
    document = BeautifulSoup(html, 'html.parser')
    headings = document.find_all('h1')

    # Page title first (left unstripped).
    title_tag = document.find('title')
    parts = [title_tag.get_text()] if title_tag else []

    # Then every top-level heading.
    parts.extend(heading.get_text(strip=True) for heading in headings)

    # Finally the lead paragraph of each heading, where one exists.
    lead_paragraphs = (heading.find_next_sibling('p') for heading in headings)
    parts.extend(
        paragraph.get_text(strip=True)
        for paragraph in lead_paragraphs
        if paragraph
    )

    return ' '.join(parts)

def _collect_section_text(node, parts):
    """Append the text found under *node* to *parts*, rendering links as ``text [href]``."""
    for child in node.children:
        if isinstance(child, Tag):
            href = child.get('href') if child.name == 'a' else None
            if href and not href.endswith('.png'):
                # Emit the link once as a unit and do not descend into it;
                # otherwise its anchor text would be appended a second time.
                parts.append(f"{child.get_text(strip=True)} [{href}]")
            else:
                # Ordinary tags, and .png links whose URL is dropped, only
                # contribute their inner text.
                _collect_section_text(child, parts)
        elif isinstance(child, str):
            text = child.strip()
            if text:
                # Skip whitespace-only nodes so the joined content does not
                # contain runs of spaces.
                parts.append(text)


def extract_content(soup):
    """Split a parsed wiki page into its headline-delimited sections.

    Every ``<span class="mw-headline">`` inside ``div#bodyContent`` starts a
    section. The section runs over the following siblings of the headline's
    parent until a sibling containing another headline is reached.

    Args:
        soup: Parsed page exposing ``h1#firstHeading`` and ``div#bodyContent``.

    Returns:
        A ``(main_heading, sections)`` tuple. ``sections`` maps each section
        title to ``{'content': str, 'code': list[str]}``: ``content`` is the
        section's text with links written as ``text [href]`` (links ending in
        ``.png`` keep only their text), and ``code`` holds the text of each
        ``<pre>`` sibling. Sections sharing a title overwrite one another.

    Raises:
        ValueError: If the page lacks ``h1#firstHeading`` or
            ``div#bodyContent``, so there is nothing to extract.
    """
    heading_tag = soup.find('h1', id='firstHeading')
    content_div = soup.find('div', id='bodyContent')
    if heading_tag is None or content_div is None:
        raise ValueError(
            "page does not contain the expected 'firstHeading' and "
            "'bodyContent' elements"
        )

    main_heading = heading_tag.get_text().strip()
    sections = {}

    for headline in content_div.find_all('span', class_='mw-headline'):
        section_name = headline.get_text().strip()
        text_parts = []
        codes = []

        # Walk the siblings of the heading element that wraps this headline.
        for sibling in headline.parent.find_next_siblings():
            if sibling.find('span', class_='mw-headline') is not None:
                break  # the next section starts here

            if sibling.name == 'pre':
                # Pre-formatted blocks are kept apart from the running text.
                codes.append(sibling.get_text().strip())
            elif sibling.get_text():
                _collect_section_text(sibling, text_parts)

        sections[section_name] = {
            'content': ' '.join(text_parts),
            'code': codes,
        }

    return main_heading, sections

def extract_external_content(url, timeout=30):
    """Summarise the external pages recorded for the wiki page at *url*.

    The external links are looked up in the module-level ``externalLinkDict``
    under the page's path, i.e. *url* with the wiki base URL removed.

    Args:
        url: Full URL of the wiki page.
        timeout: Seconds to wait for each external site before skipping it.

    Returns:
        A dict mapping each reachable, valid external link to the summary
        produced by ``extract_summary``. Links that fail to download are
        reported and skipped, so one dead link does not lose the whole page.
    """
    base_url = "https://wiki.umiacs.umd.edu/"
    cleaned_url = url.replace(base_url, "")

    # NOTE(review): the key format of externalLinkDict is not visible here.
    # Depending on whether the caller's URL has one or two slashes after the
    # host, the stripped path may or may not start with "/", so the exact
    # form is tried first and then both slash variants. Confirm which form
    # the dictionary actually uses.
    stripped = cleaned_url.lstrip('/')
    candidates = (cleaned_url, '/' + stripped, stripped)
    external_links = next(
        (externalLinkDict[key] for key in candidates if key in externalLinkDict),
        (),
    )

    summary = {}
    for link in external_links:
        if not is_valid_url(link):
            continue
        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException as exc:
            # Best effort: external sites are outside our control.
            print(f"Skipping external link {link}: {exc}")
            continue
        summary[link] = extract_summary(response.text)
    return summary

def scrape_wiki(url):
    """Scrape one wiki page and return its sections plus external summaries.

    The returned dict has two entries: the page's main heading mapped to its
    sections, and 'external_info' mapped to the summaries of the external
    pages associated with *url*.
    """
    page = parse_content(fetch_content(url))
    main_heading, sections = extract_content(page)

    result = {main_heading: sections}
    result['external_info'] = extract_external_content(url)
    return result

# Example URL
# NOTE(review): the double slash after the host looks deliberate:
# extract_external_content strips "https://wiki.umiacs.umd.edu/" from the URL,
# which leaves a lookup key starting with "/". Confirm against the keys of
# externalLinkDict before "fixing" it.
url = 'https://wiki.umiacs.umd.edu//umiacs/index.php/Nexus/MC2'
# Downloads and parses the page; this performs network requests.
wiki_content = scrape_wiki(url)
print(wiki_content)



In [None]:
# Create a master dictionary mapping each wiki page href to its scraped content
linkPage = {}
baseURL = 'https://wiki.umiacs.umd.edu'
master_dictionary = {}
for i, link in enumerate(links, start=1):
    print("Parsing Link " + str(i))
    href = link.get('href')

    # Only links that lead to a page on the wiki can be parsed
    if not (href and href.startswith('/umiacs/index.php')):
        continue

    pageUrl = baseURL + href
    try:
        wiki_content = scrape_wiki(pageUrl)
    except Exception as exc:
        # Skip this page. Falling through would reuse the previous
        # iteration's wiki_content and file it under the wrong href.
        print(f"Failed to scrape {pageUrl}: {exc}")
        continue

    wiki_content['url'] = href  # can index the pages based on their base url
    master_dictionary[href] = wiki_content

In [None]:
import json

# Define the file path where you want to save the JSON
file_path = 'wiki_content.json'

# Save the dictionary as indented JSON. The encoding is pinned so the file
# does not depend on the platform's default text encoding.
with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(master_dictionary, json_file, indent=4)