In [25]:
!pip install openai
import base64
import mimetypes

import openai
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from google.colab import userdata



In [68]:
# First, let's grab the links to all the pages on the wiki
allPages = 'https://wiki.umiacs.umd.edu/umiacs/index.php/Special:AllPages'
response = requests.get(allPages, timeout=30)
# Fail fast: crawling the links of an error page would silently produce a bogus page set
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Unique wiki-internal hrefs found on the index page. Deduplicating before
# fetching avoids requesting the same link several times.
# NOTE(review): only this one listing page is read; if Special:AllPages is
# paginated, the follow-up listing pages are not walked - confirm.
wikiHrefs = set()
for link in soup.find_all('a'):
    href = link.get('href')
    if href and href.startswith('/umiacs/index.php'):
        wikiHrefs.add(href)

# Create a set of unique pages on the wiki
pageset = set()

# Follow every link and keep the URL it finally resolves to, so that links
# which redirect to a page we already have collapse into one entry
for href in sorted(wikiHrefs):
    try:
        linkResponse = requests.get('https://wiki.umiacs.umd.edu' + href, allow_redirects=True, timeout=30)
    except requests.RequestException as err:
        # One unreachable link should not abort the whole crawl
        print(f"Skipping {href}: {err}")
        continue
    pageset.add(linkResponse.url)


Success
344


In [63]:
# This function uses GPT to describe the image in the context of the current page

def describeImage(context, encodedImage, timeout=120):
    """Ask the OpenAI chat completions API for a textual description of an image.

    Args:
        context: Text of the wiki page the image appears on. It is appended to
            the instruction prompt so the model can describe the image in context.
        encodedImage: Value placed in the request's ``image_url.url`` field. The
            caller in this notebook passes a ``data:image/...;base64,...`` URI.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The message content of the first choice in the API response.

    Raises:
        requests.RequestException: On a network failure or timeout.
        RuntimeError: If the API answers with an error status, a non-JSON body,
            or a body without any ``choices``.
    """
    backgroundInfo = '''
    I am training a large language model (LLM) using information from a wiki,
    including both text and images.
    My goal is to convert all images from the wiki into detailed textual
    descriptions that the LLM can use for training.
    Your task is to describe each image in comprehensive detail, capturing
    all essential visual elements such as objects,
    people, actions, text, colors, and spatial relationships, as well as any
    implicit meaning the image conveys.
    The description should be thorough enough that the model could learn all
    relevant information from the image
    without actually seeing it.

    Please avoid vague descriptions and focus on specific details,
    ensuring the description accurately reflects the visual content. Also avoid
    any hallucinations.
    After I provide the wiki text and multiple image, provide the image description in full.
    Describe each image fully in a maximum of 200 words.
    '''

    headers = {
      "Content-Type": "application/json",
      # Pass in our api key
      "Authorization": f"Bearer {userdata.get('OPEN_API_KEY')}"
    }

    payload = {
      "model": "gpt-4o-mini",
      "messages": [
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": f"{backgroundInfo}. Now here is the text for the wiki page: {context}"
            },
            {
              "type": "image_url",
              "image_url": {
                "url": encodedImage
              }
            },
          ]
        }
      ],
      "max_tokens": 500
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    try:
        body = response.json()
    except ValueError as err:
        raise RuntimeError(
            f"OpenAI returned a non-JSON response (HTTP {response.status_code})"
        ) from err

    # Error responses carry an "error" object instead of "choices"; surface it
    # instead of failing with an opaque KeyError on 'choices'.
    choices = body.get('choices') if isinstance(body, dict) else None
    if not response.ok or not choices:
        detail = body.get('error', body) if isinstance(body, dict) else body
        raise RuntimeError(f"OpenAI request failed (HTTP {response.status_code}): {detail}")

    return choices[0]['message']['content']

In [87]:
def _imageToDataUri(imgUrl, timeout):
  """Download an image and return it as a ``data:image/...;base64,...`` URI."""
  imgResponse = requests.get(imgUrl, timeout=timeout)
  # Fail here rather than base64-encoding an HTML error page as if it were an image
  imgResponse.raise_for_status()

  # Prefer the type the server reports; a file extension can be missing,
  # followed by a query string, or not a valid MIME subtype (e.g. "jpg").
  mimeType = imgResponse.headers.get('Content-Type', '').split(';')[0].strip().lower()
  if not mimeType.startswith('image/'):
    urlPath = requests.compat.urlparse(imgUrl).path
    guessedType = mimetypes.guess_type(urlPath)[0]
    if guessedType and guessedType.startswith('image/'):
      mimeType = guessedType
    else:
      # Last resort: use the bare file extension from the URL path
      mimeType = f"image/{urlPath.split('.')[-1]}"

  img_base64 = base64.b64encode(imgResponse.content).decode('utf-8')
  return f'data:{mimeType};base64,{img_base64}'


def getPageAsText(url, getImages = True, timeout = 60):
  """Fetch a web page and flatten it to annotated plain text.

  The page is downloaded and parsed, then rewritten in three passes before its
  text is extracted:

  1. Links that do not point into the wiki (and are not anchors or
     ``javascript`` links) are replaced by their text followed by a note
     naming the link target.
  2. Every heading (h1-h6) is prefixed with U+FE17 so the text can be chunked
     by heading later.
  3. If ``getImages`` is true, each ``<img>`` is replaced by the description
     returned from ``describeImage``. A failure on one image is printed and
     that image is left as it was.

  Args:
    url: Address of the page to fetch.
    getImages: Whether to replace images with generated descriptions.
    timeout: Seconds to wait for each HTTP request made by this function.

  Returns:
    A string of the form
    ``"\u3018The following url: <url> contains the following information: <text>\u3019"``
    so the start and end of each page can be located in a concatenated file.

  Raises:
    requests.RequestException: If the page itself cannot be fetched.
  """
  # Fetch the webpage
  response = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(response.text, 'html.parser')

  # Text is taken from <body>; fall back to the whole document when the parser
  # finds no <body> element instead of failing on None.
  container = soup.find('body')
  if container is None:
    container = soup

  # Find all a tags and add a note saying where they link to
  prefixes = ('/umiacs/index.php', '#', 'https://wiki.umiacs.umd.edu/umiacs/index.php', 'javascript')
  for link in soup.find_all('a'):
    href = link.get('href')
    if href and not href.lower().startswith(prefixes):
      p_tag = soup.new_tag('p')
      p_tag.string = link.getText() + f"(The prior text links to {href})"
      link.replace_with(p_tag)

  # Find all heading tags (h1-h6) and mark them so we can chunk by headers if needed
  for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
    # Insert the marker in front of the existing children. Reading tag.string
    # returns None for a heading with more than one child, which previously
    # replaced the heading text with the literal "None".
    tag.insert(0, NavigableString('\uFE17'))

  if getImages:
    # Find all image tags and convert the images to text
    for img in soup.find_all('img'):
      img_src = img.get('src')

      # Nothing to download for an <img> without a src
      if not img_src:
        continue

      # Ignore the footer image that appears on every page
      if img_src == '/umiacs/resources/assets/poweredby_mediawiki_88x31.png':
        continue

      # Resolve against the page URL; absolute URLs pass through unchanged
      img_url = requests.compat.urljoin(url, img_src)

      try:
        base64_src = _imageToDataUri(img_url, timeout)

        # The context is the page text as it stands now, which includes the
        # descriptions already substituted for earlier images
        imageAsText = describeImage(container.get_text(), base64_src)

        # Replace the <img> tag with a <p> tag holding the description from GPT
        p_tag = soup.new_tag('p')
        p_tag.string = imageAsText
        img.replace_with(p_tag)

      except Exception as e:
        # Best effort: report the failure and carry on with the next image
        print(f"Error processing image {img_url}: {e}")

  modified_html = container.get_text()

  # Return all the text surrounded by two unicode characters to make it easier to find where each page starts/ends
  return (f"\u3018The following url: {url} contains the following information: {modified_html}\u3019")

In [31]:
def base64ToFile(imageString):
  """Decode a marker-wrapped base64 image data URI and save it to disk.

  ``imageString`` must contain a ``data:image/<format>;base64,<payload>`` URI
  enclosed between the characters U+3016 and U+3017. The decoded bytes are
  written to ``output_image.<format>`` in the current working directory and a
  confirmation line is printed. Nothing is returned.
  """
  # Keep only what sits between the opening and closing marker characters
  afterOpeningMarker = imageString.split("\u3016")[1]
  dataUri = afterOpeningMarker.split("\u3017")[0]

  # The image format is the part of the header between 'data:image/' and ';'
  headerTail = dataUri.split('data:image/')[1]
  extension = headerTail.split(';')[0]

  # Everything after ';base64,' is the encoded image payload
  encodedPayload = dataUri.split(";base64,")[1]
  rawBytes = base64.b64decode(encodedPayload)

  # Write the decoded bytes out, using the image format as the file extension
  with open(f'output_image.{extension}', 'wb') as outFile:
      outFile.write(rawBytes)

  print("Image saved successfully!")


In [72]:
# Here we parse through all the pages on the wiki and write the info to a file

# This cannot parse dynamic pages, you have to use some other library like Selenium

# Mode 'w' starts from an empty file on every run. The encoding is explicit
# because each page's text is wrapped in non-ASCII marker characters, which the
# platform default encoding may not be able to represent.
with open('scrapedText.txt', 'w', encoding='utf-8') as file:
    for i, link in enumerate(pageset, start=1):
        print("Parsing Link " + str(i)  + " url is " + str(link))
        try:
            text = getPageAsText(link)
        except requests.RequestException as err:
            # A single unreachable page should not discard the rest of the run
            print(f"Skipping {link}: {err}")
            continue
        file.write(text)
        # Push each page to disk right away so progress survives an interruption
        file.flush()

Parsing Link 1url is https://wiki.umiacs.umd.edu/umiacs/index.php/OBJ/WebHosting
Parsing Link 2url is https://wiki.umiacs.umd.edu/umiacs/index.php/BarracudaSpamFirewall/Scoring
Parsing Link 3url is https://wiki.umiacs.umd.edu/umiacs/index.php/Singularity
Parsing Link 4url is https://wiki.umiacs.umd.edu/umiacs/index.php/PythonVirtualEnv
Parsing Link 5url is https://wiki.umiacs.umd.edu/umiacs/index.php/Iribe/ConferenceRooms/HuddleRoom
Parsing Link 6url is https://wiki.umiacs.umd.edu/umiacs/index.php/Main_Page
Parsing Link 7url is https://wiki.umiacs.umd.edu/umiacs/index.php/Network/VPN/Linux
Parsing Link 8url is https://wiki.umiacs.umd.edu/umiacs/index.php/Modules
Parsing Link 9url is https://wiki.umiacs.umd.edu/umiacs/index.php/ConferenceRooms/ATL3100C
Parsing Link 10url is https://wiki.umiacs.umd.edu/umiacs/index.php/CollaboratorAccount
Parsing Link 11url is https://wiki.umiacs.umd.edu/umiacs/index.php/Iribe/ConferenceRooms/Moderation
Parsing Link 12url is https://wiki.umiacs.umd.edu/u

In [81]:
# Additional pages that are scraped with image description enabled
externalLinksWithImage = [
    'https://wiki.umiacs.umd.edu/umiacs/index.php/BitLocker/PersonalUse',
    'https://umd.service-now.com/itsupport?id=kb_article_view&sysparm_article=KB0013664',
]

# Additional pages that are scraped as text only (image description disabled)
externalLinksWithoutImage = [
    'https://intranet.umiacs.umd.edu/',
    'https://gitlab.umiacs.umd.edu/docker/jekyll-builder/-/blob/master/README.md?ref_type=heads',
    'https://gitlab.umiacs.umd.edu/help/user/ssh.md',
    'https://wiki.umiacs.umd.edu/clip/index.php/Main_Page',
    'https://www.umiacs.umd.edu/about-us',
    'https://helpdesk.cs.umd.edu/faq/iribe/equipment-peripherals.html',
    'https://policies.umd.edu/miscellaneous-policies/university-of-maryland-policy-on-acceptable-use-of-information-technology-resources',
    'https://gitlab.umiacs.umd.edu/derek/gpudocker/-/blob/master/README.md?ref_type=heads',
    'https://umd.service-now.com/itsupport?id=kb_article_view&sysparm_article=KB0015077',
    'https://umd.service-now.com/itsupport/?id=kb_article_view&sysparm_article=KB0011227',
]

In [82]:
# Append the text-only external pages to the scrape file. The file is opened
# once, with an explicit encoding because each page's text is wrapped in
# non-ASCII marker characters.
with open('scrapedText.txt', 'a', encoding='utf-8') as file:
    for link in externalLinksWithoutImage:
        text = getPageAsText(link, getImages = False)
        file.write(text)

In [80]:
# Append the external pages whose images should be described to the scrape
# file. The file is opened once, with an explicit encoding because each page's
# text is wrapped in non-ASCII marker characters.
with open('scrapedText.txt', 'a', encoding='utf-8') as file:
    for link in externalLinksWithImage:
        text = getPageAsText(link)
        file.write(text)