In [1]:
# !/usr/bin/env python3

import time
import requests
from bs4 import BeautifulSoup
import logging

# 初始化日志记录器
logging.basicConfig(filename='course_monitor.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 创建一个流处理程序并将其添加到根日志记录器
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
logging.getLogger().addHandler(stream_handler)

def get_response_soup(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    
    response = requests.get(url, headers=headers)
    
    # 记录服务器返回的状态码和部分响应内容
    logging.info(f"Server response code: {response.status_code}")
    logging.info(f"Response preview: {response.text[:500]}...")  # 只预览前500个字符
    
    response.raise_for_status()  # Raise an error for bad responses

    soup = BeautifulSoup(response.text, 'html.parser')

    return soup

def format_courses_info(courses_info):
    formatted_output = []
    
    for course in courses_info:
        course_name = course['Course Name']
        class_name = course["Class Name"]
        seats_info = course['Seats Info']
        
        formatted_output.append(f"Course: {course_name}\n")
        
        for i in range(len(class_name)):
            formatted_output.append(f"{class_name[i]}: {seats_info[i]}\n")
        
        formatted_output.append("\n")  # Add an empty line for better separation
    
    return "".join(formatted_output)


def get_course_info(soup):
    course_divs = soup.select("div.course")

    courses_info = []

    for course_div in course_divs:
        # Extract course name
        course_name = course_div.select_one("h3").text.strip().replace("\n", "").replace("         ", " ")
        
        # Extract course statuses
        statuses = [status.text.strip() for status in course_div.select("span.status")]
        
        # Extract seats information
        seats_info = [seat_info.text.strip().replace('\n', ', ') for seat_info in course_div.select("td.class-availability div.class-cell")]

        # Extract class number
        class_numbers = [class_num.text.strip() for class_num in course_div.select("td.class-number div.class-cell")]

        class_location = [class_num.text.strip() for class_num in course_div.select("td.class-location div.class-cell")]
        class_date = [class_num.text.strip() for class_num in course_div.select("td.class-dates div.class-cell")]
        class_days = [class_num.text.strip() for class_num in course_div.select("td.class-days div.class-cell")]
        class_times = [class_num.text.strip() for class_num in course_div.select("td.class-times div.class-cell")]
        class_instructors = [class_num.text.strip() for class_num in course_div.select("td.class-instructors div.class-cell")]
        class_availability = [class_num.text.strip() for class_num in course_div.select("td.class-availability div.class-cell")]


        courses_info.append({
            "Course Full Name": course_name,
            "Class Name": class_numbers,
            "Class Location": class_location,
            "Class Date": class_date,
            "Class Days": class_days,
            "Class Instructors": class_instructors,
            "Class Availability": class_availability,
            "Statuses": statuses,
            "Seats Info": seats_info
        })

    return courses_info

In [2]:
CHECK_URLS = {
    # 'CSC110AB': "https://classes.sis.maricopa.edu/?keywords=CSC110AB&all_classes=true&terms%5B%5D=4236&institutions%5B%5D=SCC05&subject_code=&credit_career=B&credits_min=gte0&credits_max=lte9&start_hour=&end_hour=&startafter=&instructors=&as_sfid=AAAAAAVvO4TuRoGQ5STRwkkeOmCI-UNoiSkGCLjgGucYfO_0q6lCHelal_WHXp7jYLxeHOockS1E--az6QDqwAPNllcrXy9gsaST3ankBBESUe8GMvdLey3JLIM6ZgdSzLqzkS8T0ApOZU5QcjrPbsDMxPt6nBp0PVrRbmyW0jpYI3yTGQ%3D%3D&as_fid=1c384455b31d6492a305528cfadd9fd843e71efb",
    # 'MAT220': "https://classes.sis.maricopa.edu/?keywords=MAT220&all_classes=true&terms%5B%5D=4236&institutions%5B%5D=SCC05&subject_code=&credit_career=B&credits_min=gte0&credits_max=lte9&start_hour=&end_hour=&startafter=&instructors=&as_sfid=AAAAAAUz7hAUBAfxzDpPBOq-NYmfU15JF7iJtMskY7miFK4HpJs7iBcZnNwyPvwtDuP4akVKqs4BuU3b4XE2Vss8C0n5UKmcsd2Tg9I0K-I4gZ3eZ9slSLmPRvwiN0-eSsIVWITT7UNoLG8e8RifNyvOBCuN3ivICUe5GgmzDjY1MDjVFw%3D%3D&as_fid=022336d20fdc486f0c2bf51509376b77aa969c67",
    # 'ALL': 'https://classes.sis.maricopa.edu/?all_classes=true&terms%5B%5D=4236&institutions%5B%5D=SCC05&credit_career=B',
    "MATH": 'https://classes.sis.maricopa.edu/?keywords=mat&all_classes=true&terms%5B%5D=4244&institutions%5B%5D=SCC05&subject_code=&credit_career=B&credits_min=gte0&credits_max=lte9&start_hour=&end_hour=&startafter=&instructors=&as_sfid=AAAAAAX7Xw3TJ9zObwVUJ-3oOvN2QBd1zzbuqY6KSfOIEgsMqfmlD6_x6FLLPJyS8gF0NTB2hft9-qRLr2QDG3xK6xEO1tiBE2CM3Tqt1-0zZxr-IipIGARFAhcvLGmVOjeueHehrNhzGt2r4svel71Nr5fQzCtQLgqGQ_5PcDVIQMFcKw%3D%3D&as_fid=765d427272800033a48387228da3779071a948a9'

}


soup = get_response_soup(CHECK_URLS['MATH'])

Server response code: 200
Response preview: <!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">

              <link rel="stylesheet" href="/build/app.95047ca5.css">
    
  <link rel="stylesheet" href="/build/results.0601686f.css">
    <title>Find a Class</title>
            <!-- Google Tag Manager -->
        <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
                    new Date().getTime(),event:'gtm.js'});var f=d.getElements...


In [3]:
courses_info = get_course_info(soup)

In [4]:
import pandas as pd

In [5]:
df = pd.DataFrame.from_dict(courses_info)
course_df = df.explode(['Seats Info', 'Statuses'] + [_ for _ in df.columns if _.startswith('Class')])

course_df['Category'] = course_df['Course Full Name'].str[:3]
course_df['Course Name'] = course_df['Course Full Name'].str.extract(r':\s*(.*?)\s*–').to_numpy()

course_df = course_df.replace('\n', ' ', regex=True)

In [6]:
import re

def extract_dates(row):
    match = re.search(r'(\d{1,2}/\d{1,2})\s*–\s*(\d{1,2}/\d{1,2})', row['Class Date'])
    if match:
        start_date = match.group(1) + " " + row['Class Date'].split()[-1]
        end_date = match.group(2) + " " + row['Class Date'].split()[-1]
        return start_date, end_date
    else:
        return None, None

In [7]:
course_df['start_date'], course_df['end_date'] = zip(*course_df.apply(extract_dates, axis=1))

In [9]:
course_df.head()

Unnamed: 0,Course Full Name,Class Name,Class Location,Class Date,Class Days,Class Instructors,Class Availability,Statuses,Seats Info,Category,Course Name,start_date,end_date
0,MAT103: College Mathematics Prep – 3 credits,14059,Scottsdale Online Course,05/28 – 07/18 Summer 2024,,T. Baurichter,Open 16 of 24 Seats Available,Open,"Open, 16 of 24 Seats Available",MAT,College Mathematics Prep,05/28 2024,07/18 2024
1,MAT112: Mathematical Concepts and Applications...,14103,Scottsdale Online Course,05/28 – 07/18 Summer 2024,,A. Uda,Open 16 of 24 Seats Available,Open,"Open, 16 of 24 Seats Available",MAT,Mathematical Concepts and Applications,05/28 2024,07/18 2024
2,MAT114: College Algebra Prep – 4 credits,14061,Scottsdale Online Course,05/28 – 07/18 Summer 2024,,D. Nearing,Open 4 of 24 Seats Available,Open,"Open, 4 of 24 Seats Available",MAT,College Algebra Prep,05/28 2024,07/18 2024
2,MAT114: College Algebra Prep – 4 credits,14162,Scottsdale Online Course,06/10 – 08/01 Summer 2024,,J. Bohart,Open 11 of 24 Seats Available,Open,"Open, 11 of 24 Seats Available",MAT,College Algebra Prep,06/10 2024,08/01 2024
3,MAT140: College Mathematics – 5 credits,14063,Scottsdale Online Course,06/10 – 08/01 Summer 2024,,T. Baurichter,Open 21 of 24 Seats Available,Open,"Open, 21 of 24 Seats Available",MAT,College Mathematics,06/10 2024,08/01 2024


In [96]:
course_df.to_excel("~/Downloads/course_info.xlsx", index=False)