In [9]:
import requests
from bs4 import BeautifulSoup, Comment
import os

In [10]:
# First, let's grab the links to all the pages on the wiki from the
# Special:AllPages listing.
# NOTE(review): only this single listing page is fetched; if the listing is
# paginated, the follow-up pages are not crawled -- confirm that is intended.
allPages = 'https://wiki.umiacs.umd.edu/umiacs/index.php/Special:AllPages?from=&to=&namespace=0&hideredirects=1'
# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get(allPages, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

pageset = set()

# Create a set of unique pages on the wiki
for link in soup.find_all('a'):
    href = link.get('href')

    # Keep only site-relative links under /umiacs/index.php (anchors without an
    # href and links pointing elsewhere are skipped) and drop the login links.
    if href and href.startswith('/umiacs/index.php') and "Special:UserLogin" not in href:
        pageset.add('https://wiki.umiacs.umd.edu' + href)


In [11]:
# Sanity check: how many unique page URLs ended up in the set
len(pageset)

266

In [12]:
def getHTML(url, timeout=30):
    """
    Download one wiki page and return it as comment-free, prettified HTML.

    input:
      url: page URL string, normally of the form ".../index.php/<page title>"
      timeout: seconds to wait for the server before giving up

    output: (filename, html)
      filename: name derived from the URL with "/" and ":" replaced
      html: the page markup with HTML comments removed, re-indented by
        BeautifulSoup's prettify()

    raises: requests.RequestException on a timeout, a connection failure or
      an HTTP error status
    """
    # name is the part after "index.php/". URLs that do not have that form
    # (e.g. "index.php?title=...") would raise IndexError with a plain
    # split(...)[1]; for those, fall back to the whole URL minus its scheme so
    # the name stays unique per URL.
    _, marker, name = url.partition(".php/")
    if not marker:
        name = url.split("://", 1)[-1]
    # "/" is not allowed inside a file name; the colon is replaced as well
    name = name.replace("/", "_").replace(":", "-")

    # Fetch the webpage; raise on an HTTP error so an error page is never
    # returned as if it were real content.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove every HTML comment node from the tree
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    return name, soup.prettify()

In [13]:
# Smoke-test getHTML on one page; a set is unordered, so which page is arbitrary
sample_url = next(iter(pageset))
getHTML(sample_url)

('Branding',
 '<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   Branding - UMIACS\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"ZxFWw-yrVQrmCZr16NTk8AAAAAo","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Branding","wgTitle":"Branding","wgCurRevisionId":8455,"wgRevisionId":8455,"wgArticleId":2802,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Branding","wgRelevantArticleId":2802,"wgIsProbablyEditable":false,"wgR

In [14]:
# Create the output directory (and any missing parents). exist_ok=True makes
# the cell safe to re-run when the directory is already there, without a
# separate existence check.
directory = "dataset/raw_html"
os.makedirs(directory, exist_ok=True)

In [15]:
# write each link to new file

for i, link in enumerate(pageset):
    print(f"link #{i}: {link}")
    name, html = getHTML(link)
    # The pages declare charset utf-8; pin the encoding so the write does not
    # depend on the platform's default locale encoding.
    with open(f'{directory}/{name}.html', 'w', encoding='utf-8') as file:
        file.write(html)
        print(f"wrote to {file.name}")

link #0: https://wiki.umiacs.umd.edu/umiacs/index.php/Branding
wrote to dataset/raw_html/Branding.html
link #1: https://wiki.umiacs.umd.edu/umiacs/index.php/EnvironmentalVariables
wrote to dataset/raw_html/EnvironmentalVariables.html
link #2: https://wiki.umiacs.umd.edu/umiacs/index.php/Iribe/ConferenceRooms/AutoAccept
wrote to dataset/raw_html/Iribe_ConferenceRooms_AutoAccept.html
link #3: https://wiki.umiacs.umd.edu/umiacs/index.php/JavaDisableBrowser
wrote to dataset/raw_html/JavaDisableBrowser.html
link #4: https://wiki.umiacs.umd.edu/umiacs/index.php/Network/VPN/Mobile
wrote to dataset/raw_html/Network_VPN_Mobile.html
link #5: https://wiki.umiacs.umd.edu/umiacs/index.php/Rclone
wrote to dataset/raw_html/Rclone.html
link #6: https://wiki.umiacs.umd.edu/umiacs/index.php/BitLocker/PersonalUse
wrote to dataset/raw_html/BitLocker_PersonalUse.html
link #7: https://wiki.umiacs.umd.edu/umiacs/index.php/S3Clients
wrote to dataset/raw_html/S3Clients.html
link #8: https://wiki.umiacs.umd.edu