In [28]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from langdetect import detect
import time
import csv
import random

# Setup Chrome in headless mode
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of movie URLs to scrape
movie_urls = [
    "https://www.metacritic.com/movie/inception/user-reviews",
    "https://www.metacritic.com/movie/interstellar/user-reviews",
    "https://www.metacritic.com/movie/the-dark-knight/user-reviews",
    "https://www.metacritic.com/movie/avatar/user-reviews",
    "https://www.metacritic.com/movie/titanic/user-reviews"
]

def scrape_reviews(url, max_pages=5):
    """
    Scrape user reviews from a given Metacritic movie URL.
    
    Args:
        url (str): The URL of the movie's user reviews page.
        max_pages (int): Maximum number of pages to scrape.
    
    Returns:
        list: List of tuples containing (Rating, Review Text, Sentiment).
    """
    positive_reviews = []
    negative_reviews = []
    page = 0

    while (len(positive_reviews) < 7500 or len(negative_reviews) < 7500) and page < max_pages:
        driver.get(f"{url}?page={page}")

        try:
            # Wait until reviews are loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[data-testid='product-review']"))
            )
        except:
            print(f"No reviews found on page {page}. Stopping.")
            break

        # Scroll down to load dynamic content
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Allow content to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Extract review elements
        review_containers = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='product-review']")
        
        for review in review_containers:
            try:
                # Extract review text from the correct class
                review_text_element = review.find_element(By.CSS_SELECTOR, "div.c-siteReview_quote span")
                review_text = review_text_element.text.strip()

                # Extract rating
                rating_element = review.find_element(By.CSS_SELECTOR, "div.c-siteReviewHeader_reviewScore")
                rating = rating_element.text.strip()

                # Ensure review is in English
                if review_text and detect(review_text) == "en":
                    sentiment = "Positive" if int(rating) >= 6 else "Negative"
                    
                    # Balance positive and negative reviews
                    if sentiment == "Positive" and len(positive_reviews) < 7500:
                        positive_reviews.append((rating, review_text, sentiment))
                    elif sentiment == "Negative" and len(negative_reviews) < 7500:
                        negative_reviews.append((rating, review_text, sentiment))

            except Exception as e:
                continue  # Skip errors and continue scraping

        page += 1

    return positive_reviews + negative_reviews

# **Scraping reviews from multiple movies**
final_reviews = []
random.shuffle(movie_urls)  # Shuffle URLs for diversity

for movie_url in movie_urls:
    print(f"Scraping: {movie_url}")
    final_reviews.extend(scrape_reviews(movie_url))
    print(f"Total reviews collected so far: {len(final_reviews)}")
    if len(final_reviews) >= 15000:
        break

# **Display a sample of extracted reviews**
print("\n--- Sample Extracted Reviews ---\n")
for idx, (rating, review, sentiment) in enumerate(final_reviews[:10]):
    print(f"Review {idx + 1}: [Rating: {rating}] [{sentiment}] {review}")
    print("-" * 100)

# **Close WebDriver**
driver.quit()


Scraping: https://www.metacritic.com/movie/interstellar/user-reviews
Total reviews collected so far: 4635
Scraping: https://www.metacritic.com/movie/inception/user-reviews
Total reviews collected so far: 8003
Scraping: https://www.metacritic.com/movie/avatar/user-reviews
Total reviews collected so far: 10083
Scraping: https://www.metacritic.com/movie/the-dark-knight/user-reviews
Total reviews collected so far: 13542
Scraping: https://www.metacritic.com/movie/titanic/user-reviews
Total reviews collected so far: 14552

--- Sample Extracted Reviews ---

Review 1: [Rating: 9] [Positive] One of my all time favorite movies. Definitely recommend it if you have not yet seen it. 10/10
----------------------------------------------------------------------------------------------------
Review 2: [Rating: 10] [Positive] This is by far my favorite movie the visuals and the acting is amazing. The story is amazing it felt simple while if you think more about the science and math involved in the simpl

In [30]:
# **Save extracted reviews to CSV**
csv_filename = "metacritic_reviews.csv"
with open(csv_filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Rating", "Review", "Sentiment"])  # Correct column headers
    writer.writerows(final_reviews)

print(f"\n✅ Successfully extracted and saved 15,000 reviews to {csv_filename}!")


✅ Successfully extracted and saved 15,000 reviews to metacritic_reviews.csv!
