In [43]:
import requests
from bs4 import BeautifulSoup, Comment
import os

In [44]:
# First, let's grab the links to all the pages on the wiki
allPages = 'https://wiki.umiacs.umd.edu/umiacs/index.php/Special:AllPages'
response = requests.get(allPages)
soup = BeautifulSoup(response.content, 'html.parser')

pageset = set()

# Create a set of unique pages on the wiki
for link in soup.find_all('a'):
    href = link.get('href')

    # Remove all links which redirect to links that we've already gone to
    if href and href.startswith('/umiacs/index.php') and "Special:UserLogin" not in href:
        pageset.add('https://wiki.umiacs.umd.edu' + href)


In [45]:
len(pageset)

334

In [50]:
def getHTML(url):
  """
  input: url string
  
  output: (filename, html)
  """
  # name is the part after index.php. we replace / because it's not allowed
  name = url.split(".php/")[1].replace("/", "_")
  # replace colon
  name = name.replace(":", "-")
  # Fetch the webpage
  response = requests.get(url)
  html = response.text

  soup = BeautifulSoup(html, "html.parser")

  comments = soup.find_all(string=lambda text: isinstance(text, Comment))
  for comment in comments:
    comment.extract()

  html = str(soup.prettify())

  # print(response.text)
  return name, html

In [51]:
getHTML(next(iter(pageset)))

('Network_VPN_Windows',
 '<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   Network/VPN/Windows - UMIACS\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"Zv3GduEoEezzhHwqBzxnJAAAAAA","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Network/VPN/Windows","wgTitle":"Network/VPN/Windows","wgCurRevisionId":12016,"wgRevisionId":12016,"wgArticleId":2082,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Network/VPN/Windows","wg

In [52]:
#create directory if doesn't exist
directory = "dataset/raw_html"
if not os.path.exists(directory):
    os.makedirs(directory)

In [53]:
# write each link to new file

for i, link in enumerate(pageset):
    print(f"link #{i}: {link}")
    name, html = getHTML(link)
    with open(f'{directory}/{name}.html', 'w') as file:
        file.write(html)
        print(f"wrote to {file.name}")

link #0: https://wiki.umiacs.umd.edu/umiacs/index.php/Network/VPN/Windows
wrote to dataset/raw_html/Network_VPN_Windows.html
link #1: https://wiki.umiacs.umd.edu/umiacs/index.php/UMIACS_Public_Printers
wrote to dataset/raw_html/UMIACS_Public_Printers.html
link #2: https://wiki.umiacs.umd.edu/umiacs/index.php/Lamsub.sh.e127
wrote to dataset/raw_html/Lamsub.sh.e127.html
link #3: https://wiki.umiacs.umd.edu/umiacs/index.php/TSM
wrote to dataset/raw_html/TSM.html
link #4: https://wiki.umiacs.umd.edu/umiacs/index.php/BarracudaSpamFirewall/Scoring
wrote to dataset/raw_html/BarracudaSpamFirewall_Scoring.html
link #5: https://wiki.umiacs.umd.edu/umiacs/index.php/ConferenceRooms/ATL3100C
wrote to dataset/raw_html/ConferenceRooms_ATL3100C.html
link #6: https://wiki.umiacs.umd.edu/umiacs/index.php/Branding
wrote to dataset/raw_html/Branding.html
link #7: https://wiki.umiacs.umd.edu/umiacs/index.php/EnvironmentalVariables
wrote to dataset/raw_html/EnvironmentalVariables.html
link #8: https://wiki.