In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.chrome.options import Options
import os

In [2]:
# Path to the ChromeDriver executable handed to Service() when the browser is
# started. The value below is a placeholder; replace it before running.
chromeDriver = r"path to your drive"

In [3]:
# Function to close any popups (e.g., GDPR consent or ads)
def close_popups():
    """Best-effort dismissal of an overlay that exposes a 'Close' button.

    Waits up to 2 seconds on the module-level ``driver`` for a clickable
    ``<button>`` whose text contains 'Close' and clicks it. Any ordinary
    failure (no such button within the timeout, click rejected, ...) is
    reported with a message and otherwise ignored so the crawl can continue.

    Only ``Exception`` subclasses are swallowed: ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so a long-running crawl can still be stopped
    while this function is waiting.
    """
    try:
        # Close the GDPR consent popup if it exists
        close_button = WebDriverWait(driver, 2).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Close')]"))
        )
        close_button.click()
        print("Closed popup.")
    except Exception:
        # Deliberately best-effort: a missing popup is the normal case.
        print("No popup to close.")

# Function to get the HTML content of the professor's page with all comments loaded
def get_professor_page_with_all_comments(professor_id):
    """Load a professor's ratings page, expand every rating, and return its HTML.

    Uses the module-level ``driver``. After opening the page and dismissing
    popups via ``close_popups()``, the 'Load More Ratings' button is clicked
    repeatedly until waiting for it (2 seconds) or clicking it raises; that
    exception is printed and treated as "everything is loaded".

    Args:
        professor_id: Value interpolated into the ``tid`` query parameter.

    Returns:
        The page source as reported by the driver once the loop has ended.
    """
    driver.get(f"https://www.ratemyprofessors.com/ShowRatings.jsp?tid={professor_id}")

    # Fixed pause to give the initial page time to load
    time.sleep(2)

    # Overlays must be dismissed before the button below can be clicked
    close_popups()

    load_more_locator = (By.XPATH, "//button[contains(text(), 'Load More Ratings')]")
    keep_loading = True
    while keep_loading:
        try:
            # Block until the button is clickable (raises after 2 seconds)
            button = WebDriverWait(driver, 2).until(
                EC.element_to_be_clickable(load_more_locator)
            )
            # Bring the button into the viewport, then let the scroll settle
            driver.execute_script("arguments[0].scrollIntoView(true);", button)
            time.sleep(1)

            button.click()
            print("Clicked 'Load More Ratings' button.")

            # Give the next batch of comments time to load
            time.sleep(1)  # Increase this time if loading is slow
        except Exception as e:
            print("No more 'Load More Ratings' button or an error occurred:", e)
            keep_loading = False  # Button gone (or click failed): stop expanding

    # Snapshot of the page with whatever ratings were loaded
    return driver.page_source

In [4]:
# Function to parse meta items
def parse_meta_items(html):
    """Extract every (key, value) meta pair from a ratings page.

    Args:
        html: HTML text of the page.

    Returns:
        A flat list of ``(key, value)`` tuples, one per MetaItem ``<div>``
        found inside a CourseMeta ``<div>``. ``key`` is the item text before
        the first colon (e.g. "Attendance"); ``value`` is the text of the
        item's first ``<span>`` (e.g. "Not Mandatory"), or, when there is no
        ``<span>``, the text between the first and second colon, or ``None``
        when the item text has no colon at all.
    """
    document = BeautifulSoup(html, 'html.parser')
    pairs = []

    # Outer containers: one CourseMeta div per group of meta items
    for container in document.find_all('div', class_="CourseMeta__StyledCourseMeta-x344ms-0 fPJDHT"):
        # Inner MetaItem divs hold a single "Key: Value" entry each
        for entry in container.find_all('div', class_="MetaItem__StyledMetaItem-y0ixml-0 LXClX"):
            raw_text = entry.text

            # Key, e.g. "Attendance"
            label = raw_text.split(":")[0].strip()

            # Value, e.g. "Not Mandatory": prefer the <span>, else parse the text
            span = entry.find('span')
            if span:
                content = span.text.strip()
            elif ":" in raw_text:
                content = raw_text.split(":")[1].strip()
            else:
                content = None

            pairs.append((label, content))

    return pairs

# Function to parse comments
def parse_comments(html):
    """Return the stripped text of every comment ``<div>`` on the page.

    Args:
        html: HTML text of the page.

    Returns:
        A list of strings, one per matching comment element.
    """
    document = BeautifulSoup(html, 'html.parser')
    comment_nodes = document.find_all('div', class_="Comments__StyledComments-dzzyvm-0 gRjWel")
    return [node.get_text(strip=True) for node in comment_nodes]

# Function to parse dates
def parse_date(html):
    """Return the stripped text of every rating timestamp ``<div>`` on the page.

    Args:
        html: HTML text of the page.

    Returns:
        A list of date strings exactly as they appear in the page.
    """
    document = BeautifulSoup(html, 'html.parser')
    stamp_nodes = document.find_all('div', class_="TimeStamp__StyledTimeStamp-sc-9q2r30-0 bXQmMr RatingHeader__RatingTimeStamp-sc-1dlkqw1-4 iwwYJD")
    return [node.get_text(strip=True) for node in stamp_nodes]
    
def parse_course_name(html):
    """Return the stripped text of every course-name ``<div>`` on the page.

    Args:
        html: HTML text of the page.

    Returns:
        A list of course-name strings, one per matching rating header element.
    """
    document = BeautifulSoup(html, 'html.parser')
    header_nodes = document.find_all('div', class_="RatingHeader__StyledClass-sc-1dlkqw1-3 eXfReS")
    return [node.get_text(strip=True) for node in header_nodes]

# Function to create records and save to CSV
def create_record(meta_items, comments, dates, course_names, professor_id):
    """Combine the parsed page pieces into one row per comment and save a CSV.

    Args:
        meta_items: Flat list of ``(key, value)`` tuples for the whole page.
        comments: List of comment strings; one output row is written per entry.
        dates: List of date strings, matched to comments by position.
        course_names: List of course names, matched to comments by position.
        professor_id: Written to the ``LegacyId`` column and used in the
            output file name.

    Side effects:
        Creates the ``all_comments`` folder if needed, writes
        ``professor_<professor_id>_comments.csv`` into it, and prints the path.

    Notes:
        Meta items are split into per-rating groups by starting a new group
        whenever a key repeats, instead of assuming exactly five items per
        rating. A rating that shows fewer (or more) meta items therefore no
        longer steals or overwrites values from its neighbour. Groups are
        still matched to comments by position, so a rating with no meta items
        at all, or two consecutive ratings with disjoint keys, cannot be
        detected from the flat list.
    """
    default_keys = ("For Credit", "Attendance", "Would Take Again", "Grade", "Textbook")

    meta_dicts = []
    current = None
    seen_keys = set()
    for key, value in meta_items:
        # A repeated key means the items of the next rating have started.
        if current is None or key in seen_keys:
            current = dict.fromkeys(default_keys)
            seen_keys = set()
            meta_dicts.append(current)
        current[key] = value
        seen_keys.add(key)

    # Comments without meta information still get the default (empty) columns.
    while len(meta_dicts) < len(comments):
        meta_dicts.append(dict.fromkeys(default_keys))

    records = [
        {
            **meta_dicts[i],
            "Course Name": course_names[i] if i < len(course_names) else None,
            "Comment": comment,
            "Date": dates[i] if i < len(dates) else None,
            "LegacyId": professor_id,  # Keeps the professor's ID on every row
        }
        for i, comment in enumerate(comments)
    ]

    if records:
        df = pd.DataFrame(records)
    else:
        # No comments: still write the header so the file can be read back.
        df = pd.DataFrame(columns=[*default_keys, "Course Name", "Comment", "Date", "LegacyId"])

    output_folder = "all_comments"
    os.makedirs(output_folder, exist_ok=True)  # Create the folder if it does not exist

    # Build the full output path
    file_path = os.path.join(output_folder, f'professor_{professor_id}_comments.csv')
    df.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")

# Main function to process HTML
def process_html(html_content, professor_id=None):
    """Parse one already-downloaded ratings page and save it as a CSV.

    Args:
        html_content: HTML text of the page.
        professor_id: ID forwarded to ``create_record`` for the ``LegacyId``
            column and the output file name. Defaults to ``None`` so existing
            one-argument calls keep working; in that case the file is named
            ``professor_None_comments.csv``.

    Prints how many meta items, comments, dates and course names were found
    before the records are written.
    """
    # Parse meta items, comments, dates and course names
    meta_items = parse_meta_items(html_content)
    comments = parse_comments(html_content)
    dates = parse_date(html_content)
    course_names = parse_course_name(html_content)

    print(f"Meta Items: {len(meta_items)}")
    print(f"Comments: {len(comments)}")
    print(f"Dates: {len(dates)}")
    print(f"Course Names: {len(course_names)}")

    # Create records and save to CSV. create_record requires the professor ID
    # as its fifth argument; omitting it raised TypeError on every call.
    create_record(meta_items, comments, dates, course_names, professor_id)

def batch_process_professors(professor_ids, start_index=0):
    """Crawl and save the ratings of each professor from ``start_index`` on.

    Args:
        professor_ids: Sequence of professor IDs.
        start_index: Zero-based position of the first ID to process; earlier
            entries are skipped. The printed row number is one-based.

    For every ID the page is fetched, parsed, and written to CSV through
    ``create_record``. If no HTML comes back, a failure message is printed
    and the professor is skipped.
    """
    pending = professor_ids[start_index:]
    for position, professor_id in enumerate(pending, start=start_index):
        print(f"Starting to crawl data for professor ID: {professor_id} (Row {position + 1})")
        html = get_professor_page_with_all_comments(professor_id)
        if not html:
            print(f"Failed to retrieve data for professor ID: {professor_id}")
            continue

        # Arguments are evaluated left to right, i.e. in the same parse order
        create_record(
            parse_meta_items(html),
            parse_comments(html),
            parse_date(html),
            parse_course_name(html),
            professor_id,
        )


In [14]:
# Chrome command-line switches for the scraping session
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-notifications")

# Set up Selenium WebDriver.
# The options object must be passed explicitly; without `options=` the
# switches configured above are never applied to the browser.
service = Service(chromeDriver)
driver = webdriver.Chrome(service=service, options=chrome_options)


In [21]:
if __name__ == "__main__":
    # File with one professor ID per line (placeholder path; set before running)
    legacy_ids_path = "path to legacy ids csv"
    start_index = 31  # zero-based line to start from, in case a previous run stopped midway

    with open(legacy_ids_path, 'r', newline='', encoding='utf-8') as file:
        professor_ids = [line.strip() for line in file]

    try:
        batch_process_professors(professor_ids, start_index=start_index)
    finally:
        # Always release the browser, even when the crawl raises midway;
        # otherwise the Chrome window and driver process are left running.
        driver.quit()

Starting to crawl data for professor ID: 2838049 (Row 32)
No popup to close.
No more 'Load More Ratings' button or an error occurred: Message: 
Stacktrace:
	GetHandleVerifier [0x0058C203+27395]
	(No symbol) [0x00523E04]
	(No symbol) [0x00421B7F]
	(No symbol) [0x00462C65]
	(No symbol) [0x00462D3B]
	(No symbol) [0x0049EC82]
	(No symbol) [0x004839E4]
	(No symbol) [0x0049CB24]
	(No symbol) [0x00483736]
	(No symbol) [0x00457541]
	(No symbol) [0x004580BD]
	GetHandleVerifier [0x00843AB3+2876339]
	GetHandleVerifier [0x00897F7D+3221629]
	GetHandleVerifier [0x0060D674+556916]
	GetHandleVerifier [0x0061478C+585868]
	(No symbol) [0x0052CE44]
	(No symbol) [0x00529858]
	(No symbol) [0x005299F7]
	(No symbol) [0x0051BF4E]
	BaseThreadInitThunk [0x7685FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x776A809E+286]
	RtlGetAppContainerNamedObjectPath [0x776A806E+238]

Data saved to all_comments\professor_2838049_comments.csv
Starting to crawl data for professor ID: 2917827 (Row 33)
No popup to close.
No more

全自动.jpg

In [6]:
def check_folder_length(folder_path):
    """Return how many entries ``folder_path`` contains, creating it if absent.

    Args:
        folder_path: Directory to inspect.

    Returns:
        The number of names returned by ``os.listdir`` for the directory
        (files and sub-directories alike); 0 for a freshly created folder.
    """
    folder_missing = not os.path.exists(folder_path)
    if folder_missing:
        os.makedirs(folder_path)  # Create the folder when it does not exist yet
    entries = os.listdir(folder_path)
    return len(entries)


folder_path = "all_comments"  # Folder that receives one CSV per professor
target_length = 2399  # Target number of entries in the folder: the number of professors
output_folder = folder_path  # Kept for compatibility; same folder as above

# The ID list does not change between retries, so read it once up front.
# A missing or malformed file now fails immediately instead of being
# retried forever inside the loop.
professor_ids_df = pd.read_csv("All_Prof_Data.csv")
professor_ids = professor_ids_df["legacyId"].tolist()

# Keep crawling until the folder holds the expected number of entries.
# check_folder_length() creates the folder when it is missing, so no separate
# existence check is needed inside the loop.
while check_folder_length(folder_path) < target_length:
    try:
        # Resume after the professors that already have a CSV file.
        # NOTE: this assumes every processed professor produced exactly one CSV.
        files = os.listdir(folder_path)
        csv_files = [file for file in files if file.endswith('.csv')]
        start_index = len(csv_files)

        if start_index >= len(professor_ids):
            # Nothing left to crawl; without this guard the loop would spin
            # forever whenever the folder can never reach target_length.
            print("No professor IDs left to crawl, but the folder is still below the target length; stopping.")
            break

        # Pass the full list plus the offset so the printed row numbers
        # refer to positions in All_Prof_Data.csv.
        batch_process_professors(professor_ids, start_index=start_index)

    except Exception as e:
        # Report the error but keep going; the next iteration resumes from
        # the number of CSV files written so far.
        print(f"发生异常：{e}")
        time.sleep(1)  # Short pause before retrying
else:
    # Reached only when the loop condition became false (target met).
    print("文件夹长度已达到目标，程序结束。")

driver.quit()

Starting to crawl data for professor ID: 2107028 (Row 1)
Closed popup.
No more 'Load More Ratings' button or an error occurred: Message: 
Stacktrace:
	GetHandleVerifier [0x0070C203+27395]
	(No symbol) [0x006A3E04]
	(No symbol) [0x005A1B7F]
	(No symbol) [0x005E2C65]
	(No symbol) [0x005E2D3B]
	(No symbol) [0x0061EC82]
	(No symbol) [0x006039E4]
	(No symbol) [0x0061CB24]
	(No symbol) [0x00603736]
	(No symbol) [0x005D7541]
	(No symbol) [0x005D80BD]
	GetHandleVerifier [0x009C3AB3+2876339]
	GetHandleVerifier [0x00A17F7D+3221629]
	GetHandleVerifier [0x0078D674+556916]
	GetHandleVerifier [0x0079478C+585868]
	(No symbol) [0x006ACE44]
	(No symbol) [0x006A9858]
	(No symbol) [0x006A99F7]
	(No symbol) [0x0069BF4E]
	BaseThreadInitThunk [0x7685FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x776A809E+286]
	RtlGetAppContainerNamedObjectPath [0x776A806E+238]

Data saved to all_comments\professor_2107028_comments.csv
Starting to crawl data for professor ID: 194034 (Row 2)
No popup to close.
No more 'Load M