<a href="https://colab.research.google.com/github/SjSterling/Cosmology/blob/main/Degree_Requiremts_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Major requirements code

In [None]:
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import deque
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time

# Majors-tab link lists that were already scraped, keyed by page URL with any
# "#fragment" removed.
cached_major_links = {}


def find_majors_links(url):
    """Return the absolute URLs of all links inside a page's "Majors" tab.

    A page is considered to have a Majors tab when it contains an anchor
    whose href is exactly ``#majorstextcontainer``; the links are then read
    from the ``<div id="majorstextcontainer">`` element.

    Args:
        url: Address of the catalog page to inspect.

    Returns:
        A list of absolute URLs (relative hrefs are resolved against ``url``).
        The list is empty when the page cannot be fetched (non-200 status),
        has no Majors tab, or the tab contains no links.

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            connection problems or when the 10 second timeout expires.
    """
    # The fragment is never sent to the server, so "page/" and "page/#header"
    # are the same document and can share a single cache entry.
    cache_key = url.partition("#")[0]
    if cache_key in cached_major_links:
        print("Majors Tab Found on (Cached):", url)
        return cached_major_links[cache_key]

    # Without a timeout a stalled connection would hang the crawl forever.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    if not soup.select('a[href="#majorstextcontainer"]'):
        return []

    print("Majors Tab Found on:", url)

    container = soup.find("div", {"id": "majorstextcontainer"})
    if container is None:
        # The tab anchor exists but its target div does not; treat the page
        # as having no majors instead of failing on None.find_all(...).
        return []

    majors_link_urls = [
        urljoin(url, link["href"]) for link in container.find_all("a", href=True)
    ]

    # Only non-empty results are cached, as before.
    if majors_link_urls:
        cached_major_links[cache_key] = majors_link_urls

    return majors_link_urls

import csv
import os  # Add this import for file path operations

import csv
import os

def _safe_csv_filename(table_name):
    """Build a CSV file name from a heading: keep letters, digits, ' ', '-', '_'."""
    stem = "".join(c for c in table_name if c.isalnum() or c in (" ", "-", "_"))
    # The extension is appended AFTER filtering; filtering "<name>.csv" as a
    # whole would strip the '.' and leave the file without an extension.
    return f"{stem}.csv"


def _row_to_record(row):
    """Map one <tr> onto a [code, title, hours] list based on its <td> count."""
    cells = [cell.text.strip() for cell in row.find_all("td")]
    if len(cells) >= 3:
        return cells[:3]
    if len(cells) == 2:
        # Two-cell rows carry no title: first cell is the code, second the hours.
        return [cells[0], "", cells[1]]
    if len(cells) == 1:
        return [cells[0], "", ""]
    # Rows without any <td> are still written, as an empty record.
    return ["", "", ""]


def extract_tables(url, folder_name="degree_requirements_knowledgebase"):
    """Save every qualifying course-list table on a page as a CSV file.

    Only ``<table class="sc_courselist">`` elements are considered. A table
    qualifies when the nearest preceding ``<h2>``/``<h3>`` heading contains
    one of the keywords "Requirements", "Major in" or "Track"; that heading
    (with "/" replaced by "-") becomes the file name. Each file starts with
    the header row ``Code, Title, Hours`` followed by one record per ``<tr>``.

    Args:
        url: Address of the page to scrape.
        folder_name: Directory the CSV files are written to; created when
            missing.

    Returns:
        None. Nothing is written when the page does not answer with
        status 200.

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            connection problems or when the 10 second timeout expires.
    """
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        return

    soup = BeautifulSoup(response.content, "html.parser")

    # Heading fragments that mark a table as a degree-requirement listing.
    keywords = ("Requirements", "Major in", "Track")

    for table in soup.find_all("table", class_="sc_courselist"):
        heading = table.find_previous(["h2", "h3"])
        heading_text = heading.text.strip() if heading else ""

        if not any(keyword in heading_text for keyword in keywords):
            print("Skipping table without a matching heading on:", url)
            continue

        # '/' would otherwise be dropped by the file-name filter; keep a
        # visible separator instead.
        table_name = heading_text.replace("/", "-")
        print("Table found on:", url)
        print("Table Name:", table_name)

        os.makedirs(folder_name, exist_ok=True)
        csv_filepath = os.path.join(folder_name, _safe_csv_filename(table_name))

        # NOTE(review): tables that share a heading map to the same file, so a
        # later table overwrites an earlier one.
        # Explicit UTF-8 so non-ASCII text in the table never depends on the
        # platform's default encoding.
        with open(csv_filepath, "w", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(["Code", "Title", "Hours"])
            csv_writer.writerows(_row_to_record(row) for row in table.find_all("tr"))

        print(f"Table data saved to {csv_filepath}")
        print("---")



# HTTP session that retries transient server errors (5xx) with back-off.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
retry_adapter = HTTPAdapter(max_retries=retries)
# start_url uses plain http://, so the adapter must be mounted for both
# schemes; mounting it on https:// alone leaves the crawl without retries.
session.mount('http://', retry_adapter)
session.mount('https://', retry_adapter)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

start_url = "http://catalog.valdosta.edu/undergraduate/academic-programs/"

# Breadth-first crawl over every page below start_url.
queue = deque([start_url])

# Pages that were processed without a request error.
visited_urls = set()

while queue:
    current_url = queue.popleft()

    if current_url in visited_urls:
        continue

    try:
        # Scrape the requirement tables of every major linked from this page.
        for majors_link in find_majors_links(current_url):
            extract_tables(majors_link)

        # Collect further in-scope pages to visit.
        response = session.get(current_url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            for link in soup.find_all("a", href=True):
                # Drop the "#fragment": it only addresses a spot inside the
                # same document, so ".../#header", ".../#content" etc. would
                # otherwise be fetched and scraped again as separate pages.
                full_url = urljoin(current_url, link["href"]).partition("#")[0]
                if full_url.startswith(start_url) and full_url not in visited_urls:
                    queue.append(full_url)

        visited_urls.add(current_url)
    except (requests.RequestException, ConnectionError) as e:
        print(f"Error fetching {current_url}: {e}")
        # Pause before the next request so the site does not block the crawler.
        # The failed page stays unvisited and is tried again only if another
        # page links to it.
        time.sleep(2)


Minor requirements code

In [None]:
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import deque
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time
import os

# Minors-tab link lists that were already scraped, keyed by page URL with any
# "#fragment" removed.
cached_minor_links = {}


def find_minor_links(url):
    """Return the absolute URLs of all links inside a page's "Minors" tab.

    A page is considered to have a Minors tab when it contains an anchor
    whose href includes ``#minorstextcontainer``; the links are then read
    from the ``<div id="minorstextcontainer">`` element.

    Args:
        url: Address of the catalog page to inspect.

    Returns:
        A list of absolute URLs (relative hrefs are resolved against ``url``).
        The list is empty when the page cannot be fetched (non-200 status),
        has no Minors tab, or the tab contains no links.

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            connection problems or when the 10 second timeout expires.
    """
    # The fragment is never sent to the server, so "page/" and "page/#header"
    # are the same document and can share a single cache entry.
    cache_key = url.partition("#")[0]
    if cache_key in cached_minor_links:
        print("Minors Tab Found on (Cached):", url)
        return cached_minor_links[cache_key]

    # Without a timeout a stalled connection would hang the crawl forever.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # Look for the tab anchor that points at the minors container.
    tab_anchors = soup.find_all(
        "a", href=lambda href: href and "#minorstextcontainer" in href
    )
    if not tab_anchors:
        return []

    print("Minors Tab Found on:", url)

    container = soup.find("div", {"id": "minorstextcontainer"})
    if container is None:
        # The tab anchor exists but its target div does not; treat the page
        # as having no minors instead of failing on None.find_all(...).
        return []

    tab_link_urls = [
        urljoin(url, link["href"]) for link in container.find_all("a", href=True)
    ]

    # Only non-empty results are cached.
    if tab_link_urls:
        cached_minor_links[cache_key] = tab_link_urls

    return tab_link_urls

def _safe_csv_filename(table_name):
    """Build a CSV file name from a heading: keep letters, digits, ' ', '-', '_'."""
    stem = "".join(c for c in table_name if c.isalnum() or c in (" ", "-", "_"))
    # The extension is appended AFTER filtering; filtering "<name>.csv" as a
    # whole would strip the '.' and leave the file without an extension.
    return f"{stem}.csv"


def _row_to_record(row):
    """Map one <tr> onto a [code, title, hours] list based on its <td> count."""
    cells = [cell.text.strip() for cell in row.find_all("td")]
    if len(cells) >= 3:
        return cells[:3]
    if len(cells) == 2:
        # Two-cell rows carry no title: first cell is the code, second the hours.
        return [cells[0], "", cells[1]]
    if len(cells) == 1:
        return [cells[0], "", ""]
    # Rows without any <td> are still written, as an empty record.
    return ["", "", ""]


def extract_tables(url, folder_name):
    """Save every qualifying course-list table on a page as a CSV file.

    Only ``<table class="sc_courselist">`` elements are considered. A table
    qualifies when the nearest preceding ``<h2>``/``<h3>`` heading contains
    one of the keywords "Requirements" or "Minor in"; that heading (with "/"
    replaced by "-") becomes the file name. Each file starts with the header
    row ``Code, Title, Hours`` followed by one record per ``<tr>``.

    Args:
        url: Address of the page to scrape.
        folder_name: Directory the CSV files are written to; created when
            missing.

    Returns:
        None. Nothing is written when the page does not answer with
        status 200.

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            connection problems or when the 10 second timeout expires.
    """
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        return

    soup = BeautifulSoup(response.content, "html.parser")

    # Heading fragments that mark a table as a minor-requirement listing.
    keywords = ("Requirements", "Minor in")

    for table in soup.find_all("table", class_="sc_courselist"):
        heading = table.find_previous(["h2", "h3"])
        heading_text = heading.text.strip() if heading else ""

        if not any(keyword in heading_text for keyword in keywords):
            print("Skipping table without a matching heading on:", url)
            continue

        # '/' would otherwise be dropped by the file-name filter; keep a
        # visible separator instead.
        table_name = heading_text.replace("/", "-")
        print("Table found on:", url)
        print("Table Name:", table_name)

        os.makedirs(folder_name, exist_ok=True)
        csv_filepath = os.path.join(folder_name, _safe_csv_filename(table_name))

        # NOTE(review): tables that share a heading map to the same file, so a
        # later table overwrites an earlier one.
        # Explicit UTF-8 so non-ASCII text in the table never depends on the
        # platform's default encoding.
        with open(csv_filepath, "w", newline="", encoding="utf-8") as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(["Code", "Title", "Hours"])
            csv_writer.writerows(_row_to_record(row) for row in table.find_all("tr"))

        print(f"Table data saved to {csv_filepath}")
        print("---")

# Root of the catalog section to crawl; only URLs below it are followed.
start_url = "http://catalog.valdosta.edu/undergraduate/academic-programs/"

# HTTP session that retries transient server errors (5xx) with back-off.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
retry_adapter = HTTPAdapter(max_retries=retries)
# start_url uses plain http://, so the adapter must be mounted for both
# schemes; mounting it on https:// alone leaves the crawl without retries.
session.mount('http://', retry_adapter)
session.mount('https://', retry_adapter)

# Browser-like User-Agent sent with the crawl requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# Breadth-first crawl over every page below start_url.
queue = deque([start_url])

# Pages that were processed without a request error.
visited_urls = set()

while queue:
    current_url = queue.popleft()

    if current_url in visited_urls:
        continue

    try:
        # Scrape the requirement tables of every minor linked from this page.
        for minor_link in find_minor_links(current_url):
            extract_tables(minor_link, "minor_requirements_knowledgebase")

        # Collect further in-scope pages to visit.
        response = session.get(current_url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            for link in soup.find_all("a", href=True):
                # Drop the "#fragment": it only addresses a spot inside the
                # same document, so ".../#header", ".../#content" etc. would
                # otherwise be fetched and scraped again as separate pages.
                full_url = urljoin(current_url, link["href"]).partition("#")[0]
                if full_url.startswith(start_url) and full_url not in visited_urls:
                    queue.append(full_url)

        visited_urls.add(current_url)
    except (requests.RequestException, ConnectionError) as e:
        print(f"Error fetching {current_url}: {e}")
        # Pause before the next request so the site is not hammered. The
        # failed page stays unvisited and is tried again only if another page
        # links to it.
        time.sleep(5)
