In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

In [2]:
# Setting up the Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL to scrape
url = 'https://www.ratemyprofessors.com/search/professors/675?q=*'

# Open the URL
driver.get(url)

# Handling the cookie banner
try:
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "button[class*='Buttons__Button'][class*='CCPAModal__StyledCloseButton']"))
    ).click()
except Exception as e:
    print("Failed to close cookie banner:", str(e))

# Scrape the data
data = []

while True:
    # Wait for data to load
    WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[class*='TeacherCard__StyledTeacherCard']"))
    )

    # Extract data
    professor_cards = driver.find_elements(By.CSS_SELECTOR, "a[class*='TeacherCard__StyledTeacherCard']")
    for card in professor_cards:
        name = card.find_elements(By.CSS_SELECTOR, "div[class*='CardName__StyledCardName']")[0].text.strip()
        rating = card.find_elements(By.CSS_SELECTOR, "div[class*='CardNumRating__CardNumRatingNumber']")[0].text.strip()
        department = card.find_elements(By.CSS_SELECTOR, "div[class*='CardSchool__Department']")[0].text.strip()
        difficulty = card.find_elements(By.CSS_SELECTOR, "div[class*='CardFeedback__CardFeedbackNumber']")[1].text.strip()

        data.append({
            "Professor Name": name,
            "Professor Rating": rating,
            "Professor Department": department,
            "Difficulty": difficulty
        })

    # Try to click the "Show More" button
    try:
        show_more_button = WebDriverWait(driver, 7).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[class*='Buttons__Button'][class*='PaginationButton__StyledPaginationButton']"))
        )
        show_more_button.click()
    except Exception:
        print("No more 'Show More' button to click or failed to click.")
        break

# Convert data into a DataFrame
df = pd.DataFrame(data)

# Close the browser
# driver.quit()

# Print the DataFrame
df

No more 'Show More' button to click or failed to click.


Unnamed: 0,Professor Name,Professor Rating,Professor Department,Difficulty
0,Julia Keefer,4.5,Film,2.8
1,Alan Corns,2.8,Economics,4.5
2,Hasia Diner,3.8,Social Science,3.2
3,Steve Hutkins,4.5,Interdisciplinary Studies,2
4,Tyler Volk,3.5,Science,3
...,...,...,...,...
1272,Robert Gurland,4.0,Philosophy,2
1273,Diego Comin,4.9,Economics,2.8
1274,Efe A. Ok,2.6,Economics,4.4
1275,Boyan Jovanovic,3.3,Economics,3.1


In [8]:
# Give appropriate data types to the columns
df['Professor Rating'] = df['Professor Rating'].astype(float)
df['Difficulty'] = df['Difficulty'].astype(float)
df['Professor Name'] = df['Professor Name'].str.title()
df['Professor Department'] = df['Professor Department'].str.title()

In [9]:
# Get mean rating by department
mean_rating_by_department = df.groupby('Professor Department')['Professor Rating'].mean().reset_index()
mean_rating_by_department = mean_rating_by_department.sort_values(by='Professor Rating', ascending=False)
mean_rating_by_department

Unnamed: 0,Professor Department,Professor Rating
5,Classics,4.8
33,Not Specified,4.7
8,Cultural Studies,4.7
9,East Asian Studies,4.7
3,Business,4.602941
26,Law,4.6
23,Italian,4.6
13,Ethnic Studies,4.5
22,Interdisciplinary Studies,4.5
18,Hebrew,4.4


In [10]:
# Get mean difficulty by department
mean_difficulty_by_department = df.groupby('Professor Department')['Difficulty'].mean().reset_index()
mean_difficulty_by_department = mean_difficulty_by_department.sort_values(by='Difficulty', ascending=False)
mean_difficulty_by_department

Unnamed: 0,Professor Department,Difficulty
36,Physics,4.6
41,Social Work,4.1
10,Economics,4.034615
5,Classics,3.8
26,Law,3.6
0,Accounting,3.6
45,Writing,3.571429
2,Biology,3.430769
30,Mathematics,3.4125
21,Humanities,3.367857


In [12]:
# create matrix of mean rating and difficulty by department
mean_rating_difficulty_by_department = mean_rating_by_department.merge(mean_difficulty_by_department, on='Professor Department')
mean_rating_difficulty_by_department

Unnamed: 0,Professor Department,Professor Rating,Difficulty
0,Classics,4.8,3.8
1,Not Specified,4.7,1.5
2,Cultural Studies,4.7,2.3
3,East Asian Studies,4.7,3.3
4,Business,4.602941,2.702941
5,Law,4.6,3.6
6,Italian,4.6,2.5
7,Ethnic Studies,4.5,2.7
8,Interdisciplinary Studies,4.5,2.0
9,Hebrew,4.4,2.5


In [11]:
# save the data to a CSV file
df.to_csv('1400_professors.csv', index=False)