#  <span style="font-family: Latin Modern Roman; font-size: 35px; font-weight: bold;"> Práctica 3. Extracción y Almacenamiento de Datos</span>

---

In [1]:
import os
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from pymongo import UpdateOne
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

## <span style="font-family: Latin Modern Roman; font-size: 25px;"> 1. Almacenar en una base de datos las 250 reseñas más relevantes de la película *Dune: Part Two* en *Rotten Tomatoes*. </span>

In [None]:
# Credentials are read from local text files under ./users_secrets so they are
# never written into the notebook itself.
current_dir = os.getcwd()
user1_path = os.path.join(current_dir, "users_secrets", "user.txt")
password1_path = os.path.join(current_dir, "users_secrets", "password.txt")

with open(user1_path, "r") as file:
    USER = file.read().strip()
with open(password1_path, "r") as file:
    PASSWORD = file.read().strip()

# Reserved characters such as '@', ':', '/' or '%' inside the user name or the
# password would corrupt the URI, so both parts are percent-escaped before
# being embedded.
# NOTE(review): assumes the secret files hold the raw (not already escaped)
# values - confirm.
MONGO_URI = (
    "mongodb+srv://"
    f"{quote_plus(USER)}:{quote_plus(PASSWORD)}"
    "@movie-reviews.xewcx.mongodb.net/"
    "?retryWrites=true&w=majority&appName=movie-reviews"
)
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))

# Database / collection handles used by the cells below.
db = client["rotten-tomatoes"]
collection = db["dune2-reviews"]

In [7]:
def fetch_reviews_from_rt(movie_url, max_reviews=250):
    """Scrape unique review texts from a Rotten Tomatoes reviews page.

    The page is opened in headless Chrome. After each parse of the rendered
    HTML the "load more" button is clicked through JavaScript so the next
    batch of reviews is appended to the page.

    Args:
        movie_url: URL of the reviews page to open.
        max_reviews: Maximum number of unique reviews to collect.

    Returns:
        list[str]: Unique, non-empty review texts in the order in which they
        appear on the page. The list is shorter than ``max_reviews`` when the
        page runs out of reviews or loading more of them fails.
    """
    options = webdriver.ChromeOptions()
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--log-level=3",
        "--disable-software-rasterizer",
    ):
        options.add_argument(flag)

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Dict keys de-duplicate like a set but keep insertion (page) order.
    reviews = {}
    # Consecutive parse rounds that produced nothing new; bounded so a button
    # that stays in the DOM without loading anything cannot spin forever.
    stale_rounds = 0
    max_stale_rounds = 3

    try:
        driver.get(movie_url)
        time.sleep(3)  # let the first batch of reviews render

        while len(reviews) < max_reviews:
            count_before = len(reviews)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for review_node in soup.find_all('p', class_='review-text'):
                review_text = review_node.get_text(strip=True)
                if review_text:  # skip empty paragraphs
                    reviews[review_text] = None

                if len(reviews) >= max_reviews:
                    break

            print(f"Fetched {len(reviews)} reviews")

            # Target reached: do not trigger another page load.
            if len(reviews) >= max_reviews:
                break

            if len(reviews) == count_before:
                stale_rounds += 1
                if stale_rounds >= max_stale_rounds:
                    print("No new reviews were loaded; stopping.")
                    break
            else:
                stale_rounds = 0

            try:
                load_more_button = driver.find_element(By.CSS_SELECTOR, '[data-qa="load-more-btn"]')
                # JavaScript click works even if the button is covered by an overlay.
                driver.execute_script("arguments[0].click();", load_more_button)
                time.sleep(2)  # wait for the next batch to be appended
            except NoSuchElementException:
                print("No more reviews available.")
                break
            except WebDriverException as exc:
                # Keep what was collected so far, but say why loading stopped.
                print(f"Stopping early, could not load more reviews: {exc}")
                break
    finally:
        # Always release the browser process, even if scraping raised.
        driver.quit()

    return list(reviews)

In [8]:
def insert_reviews_into_mongodb(reviews, movie_name = "Dune: Part Two"):
    """Store review texts in the module-level ``collection`` without duplicating them.

    Each review is written as ``{"movie": movie_name, "review": text}`` through
    an upsert keyed on both fields, so calling the function again with the
    same reviews leaves the collection unchanged instead of appending copies.

    Args:
        reviews: Iterable of review texts. Nothing is written when it is
            empty or ``None``.
        movie_name: Value stored in the ``movie`` field of every document.

    Returns:
        None. The number of newly stored reviews is printed.
    """
    if not reviews:
        print("No reviews to insert into MongoDB.")
        return

    upserts = [
        UpdateOne(
            {"movie": movie_name, "review": review},
            # Only takes effect when the upsert creates a new document.
            {"$setOnInsert": {"movie": movie_name, "review": review}},
            upsert=True,
        )
        for review in reviews
    ]
    # The operations are independent of each other, so ordering is not needed.
    result = collection.bulk_write(upserts, ordered=False)

    print(f"Successfully inserted {result.upserted_count} reviews into MongoDB")

    already_stored = len(upserts) - result.upserted_count
    if already_stored:
        print(f"Skipped {already_stored} reviews that were already stored")

In [None]:
# Scrape the reviews page for "Dune: Part Two", then persist what was collected.
rt_url = "https://www.rottentomatoes.com/m/dune_part_two/reviews"
dune_part_two_reviews = fetch_reviews_from_rt(movie_url=rt_url, max_reviews=250)
insert_reviews_into_mongodb(reviews=dune_part_two_reviews)

Fetched 20 reviews
Fetched 40 reviews
Fetched 60 reviews
Fetched 80 reviews
Fetched 100 reviews
Fetched 120 reviews
Fetched 140 reviews
Fetched 160 reviews
Fetched 180 reviews
Fetched 200 reviews
Fetched 220 reviews
Fetched 240 reviews
Fetched 250 reviews


In [9]:
# NOTE(review): same call as the last line of the previous cell. With a plain
# insert, running both cells stores every review a second time - keep only one
# of the two calls unless that is intended.
insert_reviews_into_mongodb(dune_part_two_reviews)

Successfully inserted 250 reviews into MongoDB


---

## <span style="font-family: Latin Modern Roman; font-size: 25px;"> Elena Conderana Medem y Sergio Cuenca Núñez </span>