In [1]:
# Loading all the libraries needed for running the code chunks below

import json
import matplotlib.pyplot as plt
import numpy as np
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException


In [9]:
def _text_at(elements, index):
    """Return ``elements[index].text``, or '' when ``elements`` is too short."""
    return elements[index].text if index < len(elements) else ''


def _second_span_text(class_name):
    """Return the text of the second <span> inside the first ``class_name`` element.

    Uses the module-level ``driver``. Returns '' when the container is missing
    or holds fewer than two spans (presumably the first span is a label and the
    second the value -- confirm against the page markup).
    """
    try:
        container = driver.find_element(By.CLASS_NAME, class_name)
    except NoSuchElementException:
        return ''
    return _text_at(container.find_elements(By.TAG_NAME, 'span'), 1)


def _dismiss_cookie_banner():
    """Click the 'onetrust-reject-all-handler' button if present and clickable.

    Presumably this is the cookie-consent banner, which can cover other
    controls; failing to dismiss it is not fatal.
    """
    try:
        driver.find_element(By.ID, 'onetrust-reject-all-handler').click()
    except (NoSuchElementException, ElementClickInterceptedException):
        pass


def _read_summary():
    """Return the movie summary text, or '' when it cannot be located.

    First tries to expand the summary and read the second 'blurb' element;
    if there is no expand control, the click is intercepted, or fewer than two
    blurbs exist, falls back to the second <span> of the summary container.
    """
    try:
        deck = driver.find_element(By.CLASS_NAME, 'summary_deck')
    except NoSuchElementException:
        return ''

    try:
        deck.find_element(By.CLASS_NAME, 'toggle_expand_collapse').click()
        blurbs = deck.find_elements(By.CLASS_NAME, 'blurb')
        if len(blurbs) > 1:
            return blurbs[1].text
    except (NoSuchElementException, ElementClickInterceptedException):
        pass

    return _text_at(deck.find_elements(By.TAG_NAME, 'span'), 1)


def parse_movie(link):
    """Load a movie page in the module-level ``driver`` and scrape its metadata.

    Every field is best-effort: an element that is missing from the page leaves
    the corresponding value as '' instead of raising, so one malformed page
    cannot abort a long scraping run.

    Args:
        link: URL of the movie page to open.

    Returns:
        ``(movie_data, titles)`` where ``movie_data`` is a dict with the keys
        'title', 'year', 'genre', 'summary', 'starring', 'director' and
        'runtime' ('genre' and 'starring' are lists of strings when found),
        and ``titles`` is a list holding the page title if an <h1> was found.
        Returns ``('', '')`` when the page has no element with class 'genres';
        the scraping loop in this notebook checks for that value to skip the page.
    """
    movie_data = {'title':'','year':'', 'genre':'', 'summary':'', 'starring':'', 'director':'', 'runtime':''}
    driver.get(link)
    # Lookups for elements that are absent wait up to this many seconds before failing.
    driver.implicitly_wait(1)
    titles = []

    try:
        heading = driver.find_element(By.TAG_NAME, 'h1')
        movie_data['title'] = heading.text
        titles.append(heading.text)
    except NoSuchElementException:
        pass

    try:
        movie_data['year'] = driver.find_element(By.CLASS_NAME, 'release_year').text
    except NoSuchElementException:
        pass

    try:
        genre_spans = driver.find_element(By.CLASS_NAME, 'genres').find_elements(By.TAG_NAME, 'span')
    except NoSuchElementException:
        # Sentinel understood by the caller: skip this page.
        return '',''
    # The first two spans are skipped (presumably label/wrapper spans -- confirm).
    movie_data['genre'] = [span.text for span in genre_spans[2:]]

    # Dismiss the banner before clicking the summary's expand control.
    _dismiss_cookie_banner()
    movie_data['summary'] = _read_summary()

    try:
        cast_links = driver.find_element(By.CLASS_NAME, 'summary_cast').find_elements(By.TAG_NAME, 'a')
        movie_data['starring'] = [anchor.text for anchor in cast_links]
    except NoSuchElementException:
        pass

    movie_data['director'] = _second_span_text('director')
    movie_data['runtime'] = _second_span_text('runtime')

    return movie_data, titles

In [12]:
# Main code chunk
#
# Builds one URL per line of movie_titles.txt, scrapes each with parse_movie(),
# then writes the collected records to metacritic_data2.json and the collected
# titles to titles2.txt.

# No positional driver path: newer Selenium 4 releases reject it, and older
# releases already default to looking up 'chromedriver' on PATH.
driver = webdriver.Chrome()
main_link = 'https://www.metacritic.com/movie/'

with open('../scripts/new_names/movie_titles.txt', 'r') as f:
    movie_titles = [line.strip() for line in f]

urls = [main_link + title for title in movie_titles]

start = time.time()

data = []
titles = []
try:
    for link in urls:
        data_l, titles_l = parse_movie(link)
        # parse_movie returns ('', '') when the page has no 'genres' element.
        if titles_l == '':
            continue
        data.append(data_l)
        # titles_l is a list, so `titles` ends up as a list of lists.
        titles.append(titles_l)
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and the finally guarantees that happens even if a page raises.
    driver.quit()

end = time.time()
print('Parsing time:', end-start)

with open("metacritic_data2.json", "w", encoding='utf-8') as jsonfile:
    json.dump(data, jsonfile, ensure_ascii=False, indent = 2)

# Explicit UTF-8, matching the JSON file, so non-ASCII titles cannot fail on
# platforms whose default encoding is narrower.
with open('titles2.txt', 'w', encoding='utf-8') as f:
    f.write(str(titles))

Parsing time: 5655.88806271553


In [13]:
# Each entry appended to `titles` by the scraping loop is itself a list of
# title strings. Flatten first; otherwise every CSV cell would hold a Python
# list repr such as "['Title']" instead of the title text.
flat_titles = []
for entry in titles:
    if isinstance(entry, str):
        flat_titles.append(entry)
    else:
        flat_titles.extend(entry)

# Writes all titles as a single CSV row.
with open("titles2.csv", "w", newline="", encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(flat_titles)

In [14]:
# Number of pages that produced a record; pages skipped by the scraping loop are not counted.
print(len(titles))

965


In [15]:
# Total number of URLs built from the titles file, i.e. pages that were attempted.
print(len(urls))

1126
