<h1> Script to scrape CRITIC reviews from Rotten Tomatoes </h1>

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.common.exceptions import TimeoutException

def critic_reviews_for_movies(movie_names, num_reviews=100, num_pages=5):
    all_reviews = {}

    for movie_name in movie_names:
        url = f"https://www.rottentomatoes.com/m/{movie_name}/reviews"
        reviews_df = scrape_rotten_tomatoes_reviews(url, num_reviews=num_reviews, num_pages=num_pages)
        all_reviews[movie_name] = reviews_df

    return all_reviews

def scrape_rotten_tomatoes_reviews(url, num_reviews=100, num_pages=5):
    driver = webdriver.Chrome()
    driver.get(url)

    reviews_set = set()
    reviews_df = pd.DataFrame(columns=['Review'])
    total_reviews = 0

    for _ in range(num_pages):
        try:
            # Wait for the reviews to load
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'review_table')))
        except TimeoutException:
            # Handle the case where the 'review_table' is not found (e.g., not enough reviews)
            print(f"Movie does not have enough reviews on the page. Stopping scraping for {url}")
            break

        # Parse the page content
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        review_table = soup.find("div", class_="review_table")
        review_row = review_table.find_all("div", class_="review-row")

        for r in review_row:
            review_text = r.find("p", class_="review-text").text

            # Check if the review is not in the set (i.e., it's unique)
            if review_text not in reviews_set:
                reviews_set.add(review_text)
                reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)

        # Check if the total number of reviews has reached the target
        if len(reviews_set) >= num_reviews:
            break

        try:
            # Click the 'Load More' button to load more reviews
            load_more_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//rt-button[@data-qa="load-more-btn"]')))
            load_more_button.click()
        except TimeoutException:
            # Handle the case where 'load-more-btn' is not found (e.g., no more reviews to load)
            print(f"No more reviews to load for {url}")
            break

        # Add a delay to wait for the new reviews to load
        time.sleep(3)

    driver.quit()

    # Return the DataFrame
    return reviews_df


In [2]:
movie_names = ["mister_organ", "family_stone"] #just for testing

In [29]:
'''movie_names = [
    "oppenheimer_2023", "top_gun_maverick", "the_banshees_of_inisherin", "everything_everywhere_all_at_once", "turning_red",
    "happening", "the_batman", "fire_of_love", "marcel_the_shell_with_shoes_on", "till", "the_woman_king", "prey_2022",
    "the_northman", "x_2022", "the_duke", "no_bears", "navalny", "official_competition", "good_luck_to_you_leo_grande",
    "indiana_jones_and_the_dial_of_destiny", "past_lives", "love_actually", "the_exorcist_believer", "barbie",
    "killers_of_the_flower_moon", "thanksgiving_2023", "dream_scenario", "the_marvels", "poor_things",
    "how_the_grinch_stole_christmas", "the_wonderful_story_of_henry_sugar", "the_equalizer_3", "mad_max_fury_road",
    "hunter_killer", "no_hard_feelings_2023", "best_christmas_ever", "spider_man_across_the_spider_verse", "saltburn",
    "napoleon_2023", "the_holdovers", "the_hunger_games_the_ballad_of_songbirds_and_snakes", "wish_2023",
    "anatomy_of_a_fall", "uncharted_2022", "blackberry", "the_hunger_games_mockingjay_part_2",
    "captain_underpants_the_first_epic_movie", "marvels_the_avengers", "guy_ritchies_the_covenant", "carol",
    "john_wick_chapter_4", "better_watch_out", "avengers_endgame", "dune_2021", "the_suicide_squad", "cat_person",
    "five_nights_at_freddys", "the_hunger_games", "mission_impossible_dead_reckoning_part_one", "violent_night",
    "dumb_money", "talk_to_me_2023", "past_lives", "dune_2021", "the_holiday", "war_for_the_planet_of_the_apes",
    "four_christmases", "interstellar_2014", "blade_runner_2049", "elemental_2023", "godzilla_vs_kong", "lucy_2014",
    "influencer", "aftersun", "a_disturbance_in_the_force", "dungeons_and_dragons_honor_among_thieves",
    "ant_man_and_the_wasp_quantumania", "rustin_2023", "pig_2021", "bullet_train_2022", "thor_love_and_thunder",
    "please_dont_destroy_the_treasure_of_foggy_mountain", "leave_the_world_behind_2023", "cocaine_bear",
    "candyman_2021", "a_man_called_otto", "wind_river_2017", "a_haunting_in_venice", "reptile_2023", "godzilla_resurgence",
    "triple_frontier", "heart_of_stone_2023", "fingernails", "die_hard", "butchers_crossing", "superman_man_of_steel",
    "transformers_rise_of_the_beasts", "batman_v_superman_dawn_of_justice", "what_happens_later", "mister_organ",
    "family_stone", "relax_im_from_the_future"
]'''

In [3]:
all_critic_reviews = critic_reviews_for_movies(movie_names, num_reviews=100, num_pages=5)

  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)

  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)


No more reviews to load for https://www.rottentomatoes.com/m/mister_organ/reviews


  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)

  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)

  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)
  reviews_df = reviews_df.append({'Review': review_text}, ignore_index=True)

In [4]:
print(all_critic_reviews)

{'mister_organ':                                                Review
0   Farrier's gutsy exploration and ability to dre...
1   The enthralling mystery at the heart of Mister...
2   Mister Organ is the kind of documentary one ne...
3                       An unhinged character study. 
4   It feels taxing in some way to even watch the ...
5   This is someone who sucks the life out of you,...
6   David Farrier’s latest documentary Mister Orga...
7   It’s a gripping yarn fuelled by larger than li...
8   Another New Zealand-based exploration of weird...
9    It's less an examination of the psyche of one...
10  Mister Organ is a chilling work that will have...
11  At the risk of his sanity, Farrier has bottled...
12  Farrier doesn’t offer a tidy conclusion or man...
13  Mister Organ is unhinged chaos; Michael Organ ...
14  Farrier eloquently captures and transports the...
15  It's the kind of story that might need a trigg...
16  It’s a long, hard ninety minutes, even with so...
17  Mister 

In [6]:
import os

output_folder = '/Users/kajolgajjar/Downloads/'
output_file_path = os.path.join(output_folder, 'CRITIC_reviews.csv')

# Concatenate all DataFrames into a single DataFrame
all_critic_reviews_df = pd.concat(all_critic_reviews.values(), keys=all_critic_reviews.keys())

# Save the concatenated DataFrame to a CSV file
all_critic_reviews_df.to_csv(output_file_path, index=False, sep=',')

print(f"\nSampled data saved to {output_file_path}")


Sampled data saved to /Users/kajolgajjar/Downloads/CRITIC_reviews.csv
