In [1]:
import requests
from bs4 import BeautifulSoup, Comment
import os

In [2]:
# Collect the URL of every article listed on the wiki's Special:AllPages index.
allPages = 'https://wiki.umiacs.umd.edu/umiacs/index.php/Special:AllPages?from=&to=&namespace=0&hideredirects=1'
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(allPages, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Set of unique absolute page URLs (anchors repeated on the index collapse to one entry).
pageset = set()

for link in soup.find_all('a'):
    href = link.get('href')

    # Keep only site-relative ".../index.php/<Page>" links. The trailing slash
    # matters: getHTML derives the file name from the text after ".php/", and
    # "index.php?title=..." style links have no such page path.
    # Login links and links back to the AllPages index itself are skipped.
    if href and href.startswith('/umiacs/index.php/') and "Special:UserLogin" not in href and "AllPages" not in href:
        pageset.add('https://wiki.umiacs.umd.edu' + href)


In [3]:
# Display the collected URLs (a set, so the listing order is arbitrary)
pageset

{'https://wiki.umiacs.umd.edu/umiacs/index.php/ATL/ConferenceRooms',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/Accounts',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/Accounts/Collaborator',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/ActiveDirectory',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/AddingUMIACSCertificateAuthority',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/Adobe',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/AlternativePickup',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/Apptainer',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/Archives',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/Automounter',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/Backups',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/BarracudaSpamFirewall',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/BarracudaSpamFirewall/QuarantinePassthrough',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/BarracudaSpamFirewall/Scoring',
 'https://wiki.umiacs.umd.edu/umiacs/index.php/Barracud

In [4]:
# Number of unique page URLs collected
len(pageset)

264

In [5]:
def getHTML(url, timeout=30):
  """
  Download one wiki page and return its markup with HTML comments removed.

  input: url string of the form ".../index.php/<Page/Name>[?query]"
         timeout: seconds to wait for the server before giving up

  output: (filename, html)
    filename: the page path after the first ".php/", without the query
              string, with "/" replaced by "_" and ":" replaced by "-"
    html: the page re-serialized by BeautifulSoup after stripping comments

  raises: ValueError if url contains no ".php/" page path;
          requests.RequestException on connection failure, timeout,
          or an HTTP error status
  """
  # Split on the FIRST ".php/" only, so a page whose own name contains
  # ".php/" (e.g. "Run.php/Usage") keeps its full name instead of being cut.
  _, marker, page_path = url.partition(".php/")
  if not marker:
    raise ValueError(f"cannot derive a page name from {url!r}: no '.php/' segment")

  # "/" and ":" are replaced so the name can be used as a file name.
  name = page_path.split("?")[0].replace("/", "_").replace(":", "-")

  # Fetch the webpage; fail loudly rather than returning an error page as content.
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()

  soup = BeautifulSoup(response.text, "html.parser")

  # Detach every comment node from the tree before serializing it again.
  for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    comment.extract()

  return name, str(soup)

In [6]:
# Smoke-test getHTML on one arbitrary page (set iteration order is not fixed).
# Only the first 500 characters of the markup are shown so the cell output stays readable.
sample_name, sample_html = getHTML(next(iter(pageset)))
sample_name, sample_html[:500]

('WebCrawling',
 '<!DOCTYPE html>\n\n<html class="client-nojs" dir="ltr" lang="en">\n<head>\n<meta charset="utf-8"/>\n<title>WebCrawling - UMIACS</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"ZxtFnA-Yy@@IhYSSXt939AAAAAc","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"WebCrawling","wgTitle":"WebCrawling","wgCurRevisionId":11648,"wgRevisionId":11648,"wgArticleId":3254,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"WebCrawling","wgRelevantArticleId":3254,"wgIsProbablyEditable":false,"wgRel

In [7]:
# Output folder for the downloaded pages. exist_ok=True makes this cell safe to
# re-run and avoids the check-then-create race of testing os.path.exists first.
directory = "dataset/raw_html"
os.makedirs(directory, exist_ok=True)

In [8]:
# Download every collected page and save it as <directory>/<name>.html.
# NOTE: getHTML maps "/" to "_" in names, so two pages such as "A/B" and "A_B"
# would share a file name and the later one would overwrite the earlier.

for i, link in enumerate(pageset):
    print(f"link #{i}: {link}")
    name, html = getHTML(link)
    # Write as UTF-8 explicitly: the platform default encoding may be unable to
    # encode some characters, and the saved pages declare charset utf-8.
    with open(f'{directory}/{name}.html', 'w', encoding='utf-8') as file:
        file.write(html)
        print(f"wrote to {file.name}")

link #0: https://wiki.umiacs.umd.edu/umiacs/index.php/WebCrawling
wrote to dataset/raw_html/WebCrawling.html
link #1: https://wiki.umiacs.umd.edu/umiacs/index.php/Lamsub2.sh
wrote to dataset/raw_html/Lamsub2.sh.html
link #2: https://wiki.umiacs.umd.edu/umiacs/index.php/Network/VPN/Ivanti
wrote to dataset/raw_html/Network_VPN_Ivanti.html
link #3: https://wiki.umiacs.umd.edu/umiacs/index.php/EuclidCluster
wrote to dataset/raw_html/EuclidCluster.html
link #4: https://wiki.umiacs.umd.edu/umiacs/index.php/Google_Drive/Drive_for_Desktop
wrote to dataset/raw_html/Google_Drive_Drive_for_Desktop.html
link #5: https://wiki.umiacs.umd.edu/umiacs/index.php/Iribe/ConferenceRooms/View
wrote to dataset/raw_html/Iribe_ConferenceRooms_View.html
link #6: https://wiki.umiacs.umd.edu/umiacs/index.php/Setting_File_Permissions_in_Windows
wrote to dataset/raw_html/Setting_File_Permissions_in_Windows.html
link #7: https://wiki.umiacs.umd.edu/umiacs/index.php/Nexus/Accounts
wrote to dataset/raw_html/Nexus_Acco