# Web scraping using Selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Configure Chrome for the session: maximized window and an en-US UI language
# (a later step looks the chart link up by its English text).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument('--lang=en-US')

# Start the browser. The options are passed by keyword: the first positional
# parameter of webdriver.Chrome is not `options` in every Selenium release
# (older ones expect the chromedriver executable path there).
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://www.imdb.com")

In [8]:
# Open the header navigation drawer by clicking its toggle button.
menu_button_xpath = r'//*[@id="imdbHeader-navDrawerOpen"]'
menu = driver.find_element(By.XPATH, menu_button_xpath)
menu.click()

In [9]:
# Find the "Top 250 Movies" entry in the page and navigate straight to the
# address it points at instead of clicking it.
link_populares = driver.find_element(By.LINK_TEXT, r'Top 250 Movies')
url = link_populares.get_attribute("href")
driver.get(url)

# The chart is rendered as a <ul>; every <li> inside it is collected as one
# list entry to be parsed later.
chart_list_xpath = r'//*[@id="__next"]/main/div/div[3]/section/div/div[2]/div/ul'
tag_ul = driver.find_element(By.XPATH, chart_list_xpath)
movies_list = tag_ul.find_elements(By.TAG_NAME, "li")

In [10]:
from dataclasses import dataclass
from typing import Optional


@dataclass
class Movie:
    """One movie scraped from the IMDb chart.

    The first five fields are read from the chart list entry. The last three
    are filled in afterwards from the movie's own page, so they default to
    None and may stay None when that page does not provide them.
    """

    title: str
    year: str
    rating: float
    movie_page_link: str
    poster_image_link: str
    # Filled from the movie page; None until (or unless) that succeeds.
    genders: Optional[list] = None
    popularity: Optional[str] = None
    directors: Optional[list] = None

In [11]:
import re

def populate_movie(imdb_li_tag):
    """Build a Movie from one list entry of the IMDb chart page.

    Args:
        imdb_li_tag: Selenium element for a single <li> of the chart list.

    Returns:
        A Movie with title, year, rating, page link and poster link set.
        genders, popularity and directors are left as None. rating is a
        float, or None when no number is found in the rating element.
    """
    title = imdb_li_tag.find_element(By.CLASS_NAME, "ipc-title__text").text

    # The second line of the entry's visible text is taken as the year.
    year = imdb_li_tag.text.split("\n")[1]

    # The rating element's text is "dirty": it also carries the vote count.
    # Take the first number in it rather than a fixed-width slice, so values
    # such as "10.0" are not truncated, and store it as a real float.
    rating_text = imdb_li_tag.find_element(By.CLASS_NAME, "ipc-rating-star").text
    rating_match = re.search(r"\d+(?:[.,]\d+)?", rating_text)
    if rating_match:
        rating = float(rating_match.group().replace(",", "."))
    else:
        rating = None

    movie_page_link = imdb_li_tag.find_element(
        By.CLASS_NAME, "ipc-title-link-wrapper"
    ).get_attribute("href")

    poster_image_link = imdb_li_tag.find_element(
        By.CLASS_NAME, "ipc-image"
    ).get_attribute("src")

    return Movie(
        title,
        year,
        rating,
        movie_page_link,
        poster_image_link,
        None,
        None,
        None,
    )

In [None]:
# Turn every chart <li> element into a Movie. The list is created first and
# then extended lazily, so entries parsed before a failure are still kept.
populated_movies_list = []
populated_movies_list.extend(populate_movie(li_tag) for li_tag in movies_list)

In [16]:
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)

# Errors that only mean "this piece of the page is not available". Anything
# else (including Ctrl-C) is allowed to propagate instead of being hidden.
_ELEMENT_UNAVAILABLE = (NoSuchElementException, StaleElementReferenceException)

# Visit each movie's own page and fill in the fields the chart list does not
# provide. Each lookup is best-effort: on failure the field keeps its
# previous value.
for movie in populated_movies_list:
    driver.get(movie.movie_page_link)
    try:
        # One entry per line of the chip list's visible text.
        movie.genders = driver.find_element(By.CLASS_NAME, "ipc-chip-list__scroller").text.split("\n")
    except _ELEMENT_UNAVAILABLE:
        pass
    try:
        movie.popularity = driver.find_element(By.XPATH,'//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[3]/a/span/div/div[2]/div[1]').text
    except _ELEMENT_UNAVAILABLE:
        pass
    try:
        # First metadata item; its first text line is skipped and the
        # remaining lines are stored as the directors.
        movie.directors = driver.find_element(By.CLASS_NAME, "ipc-metadata-list__item").text.split("\n")[1:]
    except _ELEMENT_UNAVAILABLE:
        pass

In [17]:
import json

# Write all movies as ONE JSON array so movies.json is a single valid JSON
# document (dumping each movie separately produces concatenated objects that
# json.load cannot read back). UTF-8 is set explicitly because
# ensure_ascii=False emits non-ASCII characters as-is.
with open('movies.json', 'w', encoding='utf-8') as file:
    json.dump(
        [movie.__dict__ for movie in populated_movies_list],  # __dict__ exposes a movie's fields as a dict
        file,
        ensure_ascii=False,
        indent=2,
    )

In [None]:
# Shut down the WebDriver session now that scraping is finished.
driver.quit()