#  <span style="font-family: Latin Modern Roman; font-size: 35px; font-weight: bold;"> Práctica 3. Extracción y Almacenamiento de Datos</span>

---

In [1]:
import os
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

## <span style="font-family: Latin Modern Roman; font-size: 25px;"> 1. Almacenar las 250 reseñas más relevantes de la película en *Rotten Tomatoes* en una base de datos. </span>

In [2]:
# Credentials are read from local text files under ./users_secrets so that
# they never appear in the notebook itself. The files are expected to hold
# the raw (unescaped) user name and password.
current_dir = os.getcwd()
user1_path = os.path.join(current_dir, "users_secrets/user.txt")
password1_path = os.path.join(current_dir, "users_secrets/password.txt")

with open(user1_path, "r") as file:
    USER = file.read().strip()
with open(password1_path, "r") as file:
    PASSWORD = file.read().strip()

# The user name and password must be percent-encoded before being embedded in
# the URI; otherwise characters such as '@', ':', '/' or '%' in a credential
# produce an invalid or mis-parsed connection string.
MONGO_URI = f"mongodb+srv://{quote_plus(USER)}:{quote_plus(PASSWORD)}@movie-reviews.xewcx.mongodb.net/?retryWrites=true&w=majority&appName=movie-reviews"
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))

# Handles used by the cells below: database "rotten-tomatoes",
# collection "conclave-reviews".
db = client["rotten-tomatoes"]
collection = db["conclave-reviews"]

In [3]:
def _parse_top_critic_row(row):
    """Build a Top Critic review dict from one ``review-row`` element.

    Returns None when the critic link, the publication link or the review
    text paragraph is missing from the row.
    """
    critic_name_tag = row.find('a', {'data-qa': 'review-critic-link'})
    publication_tag = row.find('a', {'data-qa': 'review-publication'})
    review_text_tag = row.find('p', class_='review-text')

    if not (critic_name_tag and publication_tag and review_text_tag):
        return None

    return {
        "type": "Top Critic",
        "name": critic_name_tag.get_text(strip=True),
        "publication": publication_tag.get_text(strip=True),
        "review": review_text_tag.get_text(strip=True)
    }


def _parse_audience_row(row):
    """Build a Verified Audience review dict from one ``audience-review-row`` element.

    Returns None when the reviewer name or the review text is missing.
    """
    person_name_tag = row.find('span', {'data-qa': 'review-name'})
    review_text_tag = row.find('p', {'data-qa': 'review-text'})

    if not (person_name_tag and review_text_tag):
        return None

    return {
        "type": "Verified Audience",
        "name": person_name_tag.get_text(strip=True),
        "review": review_text_tag.get_text(strip=True)
    }


# For each supported review_type: CSS class of a review row, the parser for
# that row, and the label used in the progress messages.
_REVIEW_SOURCES = {
    "top_critics": ('review-row', _parse_top_critic_row, "Top Critics"),
    "verified_audience": ('audience-review-row', _parse_audience_row, "Verified Audience"),
}


def fetch_reviews_from_rt(movie_url, review_type, max_reviews=250):
    """Scrape reviews from a Rotten Tomatoes reviews page with headless Chrome.

    The page is loaded once, and the element marked
    ``data-qa="load-more-btn"`` is clicked repeatedly. After each click the
    whole page source is parsed again, so rows already collected are skipped
    via a set of seen entries.

    Args:
        movie_url: URL of the reviews page to open.
        review_type: "top_critics" or "verified_audience". Any other value
            matches no rows, so an empty list is returned.
        max_reviews: Upper bound on the number of reviews collected.

    Returns:
        A list of dicts. Top Critic entries have the keys "type", "name",
        "publication" and "review"; Verified Audience entries have "type",
        "name" and "review".

    Scraping stops when ``max_reviews`` is reached, when a pass over the page
    yields no new review, or when the load-more button cannot be found or
    clicked. The browser is always closed, even if an error or a keyboard
    interrupt occurs mid-scrape.
    """
    source = _REVIEW_SOURCES.get(review_type)

    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--log-level=3")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    reviews = []
    seen_reviews = set()
    prev_review_count = 0

    # try/finally guarantees the Chrome process is shut down on every exit path.
    try:
        driver.get(movie_url)
        time.sleep(3)  # fixed wait for the initial page content

        while len(reviews) < max_reviews:
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            if source is not None:
                row_class, parse_row, label = source
                for row in soup.find_all('div', class_=row_class):
                    review = parse_row(row)
                    if review is None:
                        continue

                    # The page source contains every row loaded so far, so
                    # previously collected rows show up again on each pass.
                    review_entry = tuple(review.values())
                    if review_entry not in seen_reviews:
                        seen_reviews.add(review_entry)
                        reviews.append(review)

                    if len(reviews) >= max_reviews:
                        break

                print(f"Fetched {len(reviews)} {label} reviews...")

            # A pass that added nothing means clicking again will not help.
            if len(reviews) == prev_review_count:
                print("No new reviews found. Exiting...")
                break
            prev_review_count = len(reviews)

            try:
                load_more_button = driver.find_element(By.CSS_SELECTOR, '[data-qa="load-more-btn"]')
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", load_more_button)
            except WebDriverException:
                # Only Selenium failures end the scrape quietly; other errors
                # (including KeyboardInterrupt) propagate to the caller.
                print("No more reviews available. Stopping...")
                break
            time.sleep(2)  # fixed wait for the next batch to render
    finally:
        driver.quit()

    return reviews

In [4]:
def insert_reviews_into_mongodb(reviews, movie_name = "Conclave"):
    """Store scraped reviews in the module-level ``collection``.

    Each element of ``reviews`` is wrapped in a document of the form
    ``{"movie": movie_name, "review": <element>}`` and all documents are
    written with a single ``insert_many`` call. When ``reviews`` is empty
    (or otherwise falsy) nothing is written and a notice is printed instead.

    Args:
        reviews: Sequence of review entries to store.
        movie_name: Value placed under the "movie" key of every document.
    """
    if reviews:
        documents = []
        for entry in reviews:
            documents.append({"movie": movie_name, "review": entry})

        collection.insert_many(documents)
        print(f"Successfully inserted {len(reviews)} reviews into MongoDB")
    else:
        print("No reviews to insert into MongoDB.")

In [5]:
# Review pages to scrape: one per review type.
top_critics_url = "https://www.rottentomatoes.com/m/conclave/reviews?type=top_critics"
verified_audience_url = "https://www.rottentomatoes.com/m/conclave/reviews?type=verified_audience"

# Top Critics are scraped first, then Verified Audience; each call is capped
# at 250 reviews.
top_critics_reviews = fetch_reviews_from_rt(
    movie_url=top_critics_url,
    review_type="top_critics",
    max_reviews=250,
)
verified_audience_reviews = fetch_reviews_from_rt(
    movie_url=verified_audience_url,
    review_type="verified_audience",
    max_reviews=250,
)

Fetched 20 Top Critics reviews...
Fetched 40 Top Critics reviews...
Fetched 60 Top Critics reviews...
Fetched 69 Top Critics reviews...
Fetched 69 Top Critics reviews...
No new reviews found. Exiting...
Fetched 20 Verified Audience reviews...
Fetched 40 Verified Audience reviews...
Fetched 60 Verified Audience reviews...
Fetched 80 Verified Audience reviews...
Fetched 100 Verified Audience reviews...
Fetched 120 Verified Audience reviews...
Fetched 140 Verified Audience reviews...
Fetched 160 Verified Audience reviews...
Fetched 180 Verified Audience reviews...
Fetched 200 Verified Audience reviews...
Fetched 220 Verified Audience reviews...
Fetched 240 Verified Audience reviews...
Fetched 250 Verified Audience reviews...


In [6]:
# Store both scraped batches, Top Critics first; each call prints its own
# confirmation message.
for review_batch in (top_critics_reviews, verified_audience_reviews):
    insert_reviews_into_mongodb(review_batch)

Successfully inserted 69 reviews into MongoDB
Successfully inserted 250 reviews into MongoDB


---

## <span style="font-family: Latin Modern Roman; font-size: 25px;"> 2. Proponer 4 estrategias de marketing basadas en datos obtenidos de esas reseñas para incrementar la popularidad de la película. </span>

<div style="text-align: justify;">

- Decidir qué actores llevar a los actos promocionales en función de cómo los percibe la audiencia, y si funcionan mejor solos o juntos.
- Diseñar portadas distintas para Instagram, TikTok, etc. en función de lo que más se valora en las reseñas (acción, romance, ...).
- Si hay polémica en las reseñas (quién es mejor, qué debería haber ocurrido), aprovecharla para generar *engagement* con los espectadores.
- Comparar la audiencia verificada con los críticos para decidir en qué público centrarse: ¿a cuál de los dos grupos le gusta más la película?
- Identificar los medios en los que la película triunfa más entre los críticos y anunciarse en esos medios.
- Analizar la evolución en el tiempo de las reseñas para ver si el *streaming* ha afectado a la valoración y decidir si conviene abrirse a más plataformas.
- Generar nubes de palabras (*word clouds*) separadas por nota, empezando por las reseñas con mejores notas.

</div>

---

## <span style="font-family: Latin Modern Roman; font-size: 25px;"> Elena Conderana Medem y Sergio Cuenca Núñez </span>