In [None]:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

### NOTE: The scraping code below is too resource-intensive to run on my local machine, so I used another method to retrieve the data.

In [None]:
# Scrape professor cards from the search-results page below into a DataFrame.
# Flow: start headless Chrome -> try to dismiss the CCPA/cookie modal ->
# click "Show More" repeatedly to load further cards -> read each card's text.

# Tunables for the "Show More" pagination loop.
MAX_RETRIES = 5              # consecutive failed attempts tolerated before giving up
MAX_SHOW_MORE_CLICKS = 1200  # upper bound on successful 'Show More' clicks
SHOW_MORE_SELECTOR = "button[class*='Buttons__Button'][class*='PaginationButton__StyledPaginationButton']"

# Set up Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL to scrape
url = 'https://www.ratemyprofessors.com/search/professors/675?q=*'

data = []
try:
    # Open the URL
    driver.get(url)
    print("Opened the URL:", url)

    # Handle the cookie banner. This is best-effort: if the close button never
    # becomes clickable within 10 s, report it and continue scraping anyway.
    try:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[class*='Buttons__Button'][class*='CCPAModal__StyledCloseButton']"))
        ).click()
    except Exception as e:
        print("Failed to close cookie banner:", str(e))

    actions = webdriver.ActionChains(driver)

    # Click "Show More" until either the click budget is used up or
    # MAX_RETRIES attempts in a row fail (e.g. the button stops being clickable).
    retries = MAX_RETRIES
    clicked = MAX_SHOW_MORE_CLICKS
    while retries > 0 and clicked > 0:
        try:
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, SHOW_MORE_SELECTOR))
            )
            actions.move_to_element(show_more_button).click().perform()
            time.sleep(1)  # Wait for the content to load
            retries = MAX_RETRIES  # Reset retries after a successful click
            clicked -= 1
        except Exception:
            print("Retry clicking 'Show More'...")
            time.sleep(1)
            retries -= 1
            if retries == 0:
                # The loop condition ends the loop on the next check.
                print("Final attempt failed. Proceeding with data extraction.")

    # Scrape the data: one dict of raw text fields per professor card.
    # A card missing any of these elements raises IndexError, which propagates;
    # the `finally` below still shuts the browser down in that case.
    professor_cards = driver.find_elements(By.CSS_SELECTOR, "a[class*='TeacherCard__StyledTeacherCard']")
    for card in professor_cards:
        name = card.find_elements(By.CSS_SELECTOR, "div[class*='CardName__StyledCardName']")[0].text.strip()
        rating = card.find_elements(By.CSS_SELECTOR, "div[class*='CardNumRating__CardNumRatingNumber']")[0].text.strip()
        department = card.find_elements(By.CSS_SELECTOR, "div[class*='CardSchool__Department']")[0].text.strip()
        # Difficulty is read from the SECOND feedback number on the card
        # (presumably the first is a different metric — TODO confirm against the page).
        difficulty = card.find_elements(By.CSS_SELECTOR, "div[class*='CardFeedback__CardFeedbackNumber']")[1].text.strip()
        num_ratings = card.find_elements(By.CSS_SELECTOR, "div[class*='CardNumRating__CardNumRatingCount']")[0].text.strip()

        data.append({
            "Professor Name": name,
            "Professor Rating": rating,
            "Professor Department": department,
            "Difficulty": difficulty,
            "Number of Ratings": num_ratings
        })
finally:
    # Close the browser on every path, success or failure, so a crashed run
    # does not leave a headless Chrome process behind.
    driver.quit()

# Convert data to DataFrame (all values are still raw strings at this point)
df = pd.DataFrame(data)

# Display the DataFrame
df

In [None]:
# De-duplicate by professor name: the first row seen for each name stays and
# every later row with the same name is removed from df in place.
df.drop_duplicates(subset=["Professor Name"], keep="first", inplace=True)

In [None]:
# Normalize the scraped text columns:
#   - the two score columns are cast from strings to floats;
#   - the name and department columns are converted to Title Case.
# "Number of Ratings" is left untouched by this cell.
for numeric_col in ('Professor Rating', 'Difficulty'):
    df[numeric_col] = df[numeric_col].astype(float)

for text_col in ('Professor Name', 'Professor Department'):
    df[text_col] = df[text_col].str.title()

In [None]:
# Average professor rating per department, best-rated departments first.
mean_rating_by_department = (
    df.groupby('Professor Department')['Professor Rating']
    .mean()
    .reset_index()
    .sort_values(by='Professor Rating', ascending=False)
)
mean_rating_by_department

In [None]:
# Average difficulty score per department, hardest departments first.
mean_difficulty_by_department = (
    df.groupby('Professor Department')['Difficulty']
    .mean()
    .reset_index()
    .sort_values(by='Difficulty', ascending=False)
)
mean_difficulty_by_department

In [None]:
# Combine the two per-department summaries into a single table with one row per
# department and both the mean rating and the mean difficulty as columns.
# Uses the default inner join on the department name.
mean_rating_difficulty_by_department = pd.merge(
    mean_rating_by_department,
    mean_difficulty_by_department,
    on='Professor Department',
)
mean_rating_difficulty_by_department