In [None]:
import os
import random
import re
import time
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Captures the figure shown before " views": either a comma-grouped integer
# ("1,234,567") or a plain/abbreviated number ("950", "12K", "1.2M", "3.4B").
_VIEW_COUNT_RE = re.compile(r'(\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?[KMB]?) views')


def _parse_view_count(text):
    """Return the view-count token found in *text*, or "N/A" when there is none."""
    match = _VIEW_COUNT_RE.search(text)
    return match.group(1) if match else "N/A"


def get_trailer_and_views(movie_title, driver):
    """Search YouTube for "<movie_title> Trailer" and read a view count from the results.

    Args:
        movie_title: Title to search for. It is converted to text and
            URL-encoded, so characters such as ``&``, ``#`` or ``?`` stay part
            of the search query instead of breaking the URL.
        driver: Selenium WebDriver used to load and inspect the results page.

    Returns:
        The view count exactly as displayed (e.g. ``"1.2M"`` or ``"12,345"``),
        ``"N/A"`` when the located element's text contains no recognisable
        count, or ``"Error"`` when any step raises (the exception is printed,
        not propagated).
    """
    try:
        # Encode the whole query; spaces become "+" as the original URL used.
        query = quote_plus(f"{movie_title} Trailer")
        search_url = f"https://www.youtube.com/results?search_query={query}"
        print(f"Navegando a: {search_url}")
        driver.get(search_url)

        print("Esperando a que los resultados de búsqueda se carguen...")
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '(//*[@id="thumbnail"])[1]')))
        print("Resultados de búsqueda cargados.")

        handle_cookie_popup(driver)  # dismiss the consent dialog if it appears

        print("Esperando 1 segundos adicionales para asegurar la carga completa...")
        time.sleep(1)

        print("Obteniendo el conteo de vistas desde los resultados de búsqueda...")
        # Only matches a metadata span containing "views" that is followed by a
        # sibling span containing "years"; results without such an age label
        # are not matched by this locator.
        view_count_xpath = '//div[@id="metadata-line"]/span[contains(@class, "inline-metadata-item") and contains(text(), "views") and following-sibling::span[contains(text(), "years")]]'
        view_count_element = WebDriverWait(driver, 40).until(
            EC.presence_of_element_located((By.XPATH, view_count_xpath))
        )
        view_count = _parse_view_count(view_count_element.text)
        print(f"Vistas: {view_count}")

        return f"{view_count}"

    except Exception as e:
        # Best effort per title: report the failure and let the caller continue.
        print(f"Ocurrió un error: {e}")
        return "Error"

def handle_cookie_popup(driver):
    """Try to dismiss the cookie-consent dialog by clicking "Accept all".

    Waits up to 20 seconds for the consent container, scrolls it to the
    bottom, then waits up to 20 seconds for the "Accept all" button to become
    clickable and clicks it. Any failure along the way is printed and
    swallowed, so this function never raises and always returns ``None``.

    Args:
        driver: Selenium WebDriver showing the page that may contain the dialog.
    """
    popup_xpath = '//div[@id="content" and @class="style-scope ytd-consent-bump-v2-lightbox"]'
    accept_xpath = (
        '//button[@class="yt-spec-button-shape-next yt-spec-button-shape-next--filled '
        'yt-spec-button-shape-next--mono yt-spec-button-shape-next--size-m" '
        'and .//span[text()="Accept all"]]'
    )

    try:
        print("Esperando al pop-up de consentimiento de cookies...")
        wait = WebDriverWait(driver, 20)
        dialog = wait.until(EC.presence_of_element_located((By.XPATH, popup_xpath)))
        print("Pop-up de consentimiento de cookies encontrado.")

        # Scroll the dialog to its bottom so the button can be clicked.
        print("Desplazándose dentro del pop-up...")
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", dialog)
        print("Desplazamiento dentro del pop-up completo.")

        # Locate the button through its XPath and click it once it is clickable.
        print("Ubicando el botón 'Aceptar todo'...")
        button = wait.until(EC.element_to_be_clickable((By.XPATH, accept_xpath)))
        print("Botón 'Aceptar todo' ubicado.")
        print("Haciendo clic en el botón 'Aceptar todo'...")
        button.click()
        print("Botón 'Aceptar todo' clickeado.")
    except Exception as err:
        print("No se encontró el pop-up de consentimiento de cookies o no se pudo cerrar:", err)

# Default chromedriver location; override via the ``driver_path`` argument.
_DEFAULT_DRIVER_PATH = 'C:/Program Files/chromedriver/chromedriver-win64/chromedriver.exe'

# Default CSV that receives the progress snapshot after every title.
_DEFAULT_OUTPUT_PATH = 'Youtube and Instagram Data/data_final_movies_with_views.csv'

# Pool of user-agent strings; one is picked at random for each browser session.
_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36; Sofia Villamil / sofia.v1999@gmail.com'
]


def _build_chrome_options(user_agent):
    """Return fresh Chrome options: maximized window plus the given user agent."""
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument(f'user-agent={user_agent}')
    return options


def _save_progress(results, output_path):
    """Write the (title, view_count) rows to *output_path*, creating its folder if needed."""
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    results_df = pd.DataFrame(results, columns=['title', 'view_count'])
    results_df.to_csv(output_path, index=False)


def process_movie_titles(titles, driver_path=_DEFAULT_DRIVER_PATH, output_path=_DEFAULT_OUTPUT_PATH):
    """Look up the trailer view count of every title, one browser session per title.

    After each title the accumulated results are written to *output_path* so
    an interrupted run keeps what was already collected.

    Args:
        titles: Iterable of movie titles to search for.
        driver_path: Path to the chromedriver executable.
        output_path: CSV file that receives the progress snapshot.

    Returns:
        A list of ``(title, view_count)`` tuples in input order, where
        ``view_count`` is whatever ``get_trailer_and_views`` returned.
    """
    titles = list(titles)
    last_index = len(titles) - 1
    results = []

    for index, title in enumerate(titles):
        # Build new options for every session: adding the user-agent flag to a
        # shared options object would pile up duplicate arguments.
        chrome_options = _build_chrome_options(random.choice(_USER_AGENTS))

        # Start the Chrome driver for this title.
        service = Service(driver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            view_count = get_trailer_and_views(title, driver)
            results.append((title, view_count))

            # Save progress after each title to ensure data is not lost.
            _save_progress(results, output_path)
            print(f"Progreso guardado para: {title}")
        finally:
            driver.quit()

        # Delay between requests to mimic human behavior; pointless after the
        # final title, and done with the browser already closed.
        if index < last_index:
            time.sleep(random.uniform(5, 10))

    return results

# CSV that lists the movies to look up (must contain a 'title' column)
input_file_path = r'Youtube and Instagram Data/data_final_movies.csv'
df = pd.read_csv(input_file_path)

# Scrape only the first 10 titles
titles = df['title'].tolist()[:10]
results = process_movie_titles(titles)

# Write the final (title, view_count) table
output_file_path = r'Youtube and Instagram Data/data_final_movies_with_views.csv'
results_df = pd.DataFrame(results, columns=['title', 'view_count'])
results_df.to_csv(output_file_path, index=False)

print(f"Proceso completado. Resultados guardados en {output_file_path}")
