In [None]:
# COLLECTING COURSE NAMES AND CODES
"""
USM Course Scraper
------------------
This script scrapes course data from the University of Southern Maine's online catalog.
It navigates through multiple catalog pages, extracts each course's code, name, and
catalog link (plus the listing page it was found on), and stores them in a structured
CSV file located in the 'data' folder.

Key Features:
- Fetches and parses course listings from multiple pages.
- Deduplicates entries based on course name and code.
- Saves results to: ../data/usm_courses.csv
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import re
from typing import List, Dict, Any
import os

# Endpoint that every catalog request in this script is sent to.
base_url = "https://catalog.usm.maine.edu/content.php"

# Query parameters shared by all requests. The page selector,
# "filter[cpage]", is not stored here: it is set per request on a copy
# of this dict.
params_base = dict(catoid=3, navoid=80)
params_base.update({
    "filter[item_type]": 3,
    "filter[only_active]": 1,
    "filter[3]": 1,
})

# HTTP headers sent with every request (a desktop-browser User-Agent and an
# English language preference).
headers = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    )),
    "Accept-Language": "en-US,en;q=0.9",
}

# Detect the total number of listing pages.
# Page 2 is probed instead of page 1 because page 1 is an empty search form,
# so the pagination control is looked for on a page that has content.
print("Detecting total number of pages from a content page (Page 2)...")

test_params = params_base.copy()
test_params["filter[cpage]"] = 2

try:
    # Without a timeout a stalled connection would block the script forever.
    # A timeout raises requests.exceptions.Timeout, which is a
    # RequestException and is therefore handled by the except clause below.
    response = requests.get(
        base_url, headers=headers, params=test_params, timeout=30
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Search the page's whole text for the pagination phrase, e.g.
    # "Page 2 of 18". Matching against the joined text (rather than a single
    # text node) still finds the phrase when the markup splits it across
    # several tags.
    match = re.search(r"Page\s+\d+\s+of\s+(\d+)", soup.get_text(" "))

    if match:
        # The captured group is the total page count (the number after "of").
        total_pages = int(match.group(1))
    else:
        # Detection failed: fall back to a fixed page count. The scraping loop
        # stops early on the first page without course links, so
        # overestimating here is harmless.
        total_pages = 18
        print(f"could not find exact page count. Assuming {total_pages} pages for safe measure.")

except requests.exceptions.RequestException as e:
    # With zero pages the scraping loop below does not run at all.
    print(f"error during page detection: {e}. Aborting scrape.")
    total_pages = 0


print(f"Total pages detected: {total_pages}")

# Walk the listing pages and collect one record per course link.
all_courses: List[Dict[str, Any]] = []

# Page 1 is an empty search form, so start at page 2 whenever it exists.
start_page = 2 if total_pages >= 2 else 1
for page in range(start_page, total_pages + 1):
    print(f"Scraping course page {page}/{total_pages}...")

    params = params_base.copy()
    params["filter[cpage]"] = page

    try:
        # Without a timeout a stalled connection would block the loop forever.
        # A timeout raises requests.exceptions.Timeout, which is a
        # RequestException and is therefore handled right below.
        response = requests.get(
            base_url, headers=headers, params=params, timeout=30
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Courses collected from earlier pages are kept; later pages are skipped.
        print(f"HTTP Error scraping page {page}: {e}. Stopping pagination.")
        break

    soup = BeautifulSoup(response.text, "html.parser")

    # Course entries are the anchors whose href contains
    # 'preview_course_nopop.php'.
    course_links = soup.find_all("a", href=lambda href: href and "preview_course_nopop.php" in href)

    if not course_links:
        # An empty page is treated as the end of the listing, which also
        # bounds the run when total_pages is only a fallback guess.
        print(f"No courses found on page {page}. Assuming end of course listings.")
        break

    for a in course_links:
        text = a.get_text(strip=True)
        # Resolve the (possibly relative) href against the catalog URL.
        href = urljoin(base_url, a.get("href"))

        # Link text is expected to look like "<code> - <name>"; split on the
        # first " - " only, so names that contain " - " stay intact.
        code, separator, name = text.partition(" - ")
        if not separator:
            # No separator: keep the full text as the name and mark the code
            # as unknown.
            code, name = "N/A", text

        all_courses.append({
            "course_code": code.strip(),
            "course_name": name.strip(),
            "course_link": href,
            "page_number": page,
        })

    time.sleep(0.5)  # Be polite to the server :)

# Persist the scraped records, or report that nothing was collected.
if not all_courses:
    print("\n FAILURE: No course data was successfully scraped.")
else:
    print(f"\n Scraped {len(all_courses)} total courses across {total_pages} pages.")

    # Keep one row per (course_code, course_name) pair and renumber the
    # remaining rows from zero.
    df = (
        pd.DataFrame(all_courses)
        .drop_duplicates(subset=["course_code", "course_name"])
        .reset_index(drop=True)
    )

    # The CSV goes into a "data" folder one level above the current working
    # directory; the folder is created if it does not exist yet.
    output_filename = "usm_courses.csv"
    current_dir = os.getcwd()
    data_dir = os.path.join(current_dir, "..", "data")
    os.makedirs(data_dir, exist_ok=True)
    output_path = os.path.join(data_dir, output_filename)

    # index=False keeps the DataFrame's row index out of the file.
    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} unique courses.")

# Display the first 20 records for quick verification (Testing)
# display(df.head(20))


Detecting total number of pages from a content page (Page 2)...
could not find exact page count. Assuming 18 pages for safe measure.
Total pages detected: 18
Scraping course page 2/18...
Scraping course page 3/18...
Scraping course page 4/18...
Scraping course page 5/18...
Scraping course page 6/18...
Scraping course page 7/18...
Scraping course page 8/18...
Scraping course page 9/18...
Scraping course page 10/18...
Scraping course page 11/18...
Scraping course page 12/18...
Scraping course page 13/18...
Scraping course page 14/18...
Scraping course page 15/18...
Scraping course page 16/18...
Scraping course page 17/18...
Scraping course page 18/18...

 Scraped 1699 total courses across 18 pages.
Saved 1699 unique courses.
