In [33]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

import glob
import pandas as pd
import time
import requests
import lxml.html

from collections import defaultdict
from tqdm import tqdm
from bs4 import BeautifulSoup
from lxml import etree

In [19]:
# Command-line flags passed to Chrome for the Selenium session.
# The '--headless' flag is intentionally left disabled.
chrome_flags = [
    '--no-sandbox',
    '--window-size=1420,1080',
    # '--headless',
    '--disable-gpu',
]

options = webdriver.ChromeOptions()
for flag in chrome_flags:
    options.add_argument(flag)

# Start the browser; `driver` is what the (commented-out) Selenium scraper uses.
driver = webdriver.Chrome(options=options)

In [15]:
# Load every "train*.csv" file in the working directory into one DataFrame.
#
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the row order of `df` reproducible across machines.
all_files = sorted(glob.glob("train*.csv"))

print(f"Found files: {', '.join(all_files)}")

# Fail early with a clear message instead of the opaque
# "No objects to concatenate" error that pd.concat raises on an empty list.
if not all_files:
    raise FileNotFoundError("No files matching 'train*.csv' in the working directory")

# One DataFrame per input file, in the same order as `all_files`.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack the parts and discard the "Unnamed: 0" column (a saved index column,
# when present); errors="ignore" keeps this working for files exported
# without that column.
df = (
    pd.concat(li, axis=0, ignore_index=True)
    .drop(columns="Unnamed: 0", errors="ignore")
)

Found files: train-1.csv, train-2.csv, train-3.csv, train-4.csv, train-5.csv, train-6.csv, train-7.csv, train-8.csv


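The commented-out cell below is an earlier version of the scraper that uses Selenium. It is rather slow, so the `requests`/`lxml` version in the cell after it is used instead.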

In [None]:
# --- Disabled Selenium variant of the scraper (kept for reference only) ---
# For each IMDb id in df["tconst"] it opens the title's critic-reviews page,
# clicks through to the Metacritic page (which opens in a second window),
# clicks the 'onetrust-accept-btn-handler' button if it is present (presumably
# a cookie-consent banner), opens the "details" link and records the genres,
# languages and summary text.
#
# NOTE(review): this would not run as-is:
#   * `data` is only created in the following cell, so it must exist first.
#   * `find_element_by_xpath` was removed in Selenium 4.3; current code needs
#     `driver.find_element(By.XPATH, ...)`.
#
# for movie in tqdm(df["tconst"]):
#     driver.get(f"https://www.imdb.com/title/{movie}/criticreviews")
    
#     try:
#         driver.find_element_by_xpath("//a[contains(@href, 'metacritic.com/movie')]").click()
#     except NoSuchElementException:
#         continue
    
#     driver.switch_to.window(driver.window_handles[1])
#     try:
#         driver.find_element_by_xpath("//button[@id = 'onetrust-accept-btn-handler']").click()
#         time.sleep(0.5)
#     except NoSuchElementException:
#         pass
    
#     driver.find_element_by_xpath("//a[contains(@href, 'details')]").click()
    
#     data["movie"].append(movie)
#     data["genres"].append(driver.find_element_by_xpath("//tr[@class = 'genres']").text)
#     data["language"].append(driver.find_element_by_xpath("//tr[@class = 'languages']").text)
#     data["overview"].append(driver.find_element_by_xpath("//div[@class = 'summary']").text)

#     driver.close()
#     driver.switch_to.window(driver.window_handles[0])    

In [98]:
# Scrape genres, languages and overview for every title that has a Metacritic page.
#
# For each IMDb id in df["tconst"]:
#   1. load the IMDb critic-reviews page and look for a metacritic.com/movie link;
#   2. if one exists, load "<link>/details" and extract the fields via XPath.
# Titles without a Metacritic link are skipped. Titles whose pages cannot be
# fetched or parsed are recorded in `failed_movies` instead of aborting the run.
#
# Result: `data` maps "movie", "genres", "language" and "overview" to parallel
# lists (the last three hold the raw list of text nodes returned by XPath).
data = defaultdict(list)

# NOTE(review): not referenced anywhere in this cell; kept so the name stays defined.
htmlparser = etree.HTMLParser()

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Seconds to wait on a server before giving up. requests has no default
# timeout, so one stalled connection would otherwise hang this multi-hour loop.
REQUEST_TIMEOUT = 30

# (movie id, error description) for every title that failed, for inspection/retry.
failed_movies = []


def _fetch_html(session, url):
    """Download ``url`` and return the root element of the parsed HTML.

    The body is read in full through ``response.content``, so the connection
    goes back to the session's pool and network problems surface as
    ``requests.RequestException``. An unparseable (e.g. empty) body raises a
    subclass of ``lxml.etree.LxmlError``.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    return lxml.html.fromstring(response.content)


# A single Session reuses connections and sends the browser-like User-Agent on
# every request (IMDb as well as Metacritic).
with requests.Session() as session:
    session.headers.update(headers)

    for movie in tqdm(df["tconst"]):
        try:
            # Find movie on IMDB
            tree = _fetch_html(session, f"https://www.imdb.com/title/{movie}/criticreviews")

            # Find the metacritic link
            metacritic = tree.xpath("//a[contains(@href, 'metacritic.com/movie')]/@href")

            # No metacritic link -> nothing to collect for this title
            if not metacritic:
                continue

            # Drop the query string; strip a trailing "/" so we never request "//details"
            link = metacritic[0].split("?")[0].rstrip("/")

            # Go to the details
            tree3 = _fetch_html(session, f"{link}/details")
        except (requests.RequestException, etree.LxmlError) as error:
            # Keep going: losing one title is better than losing hours of progress.
            failed_movies.append((movie, repr(error)))
            continue

        # Store relevant data
        data["movie"].append(movie)
        data["genres"].append(tree3.xpath("//tr[@class = 'genres']/td[@class = 'data']/span/text()"))
        data["language"].append(tree3.xpath("//tr[@class = 'languages']/td[@class = 'data']/span/text()"))
        data["overview"].append(tree3.xpath("//div[@class = 'summary']/span[not(@class)]/text()"))

if failed_movies:
    print(f"Skipped {len(failed_movies)} titles because of request/parse errors (see failed_movies)")

100%|████████████████████████████████████| 7959/7959 [2:08:47<00:00,  1.03it/s]


In [101]:
# Turn the collected columns into a table and write it to disk.
# The default integer index is written as the first (unnamed) column.
metacritic_table = pd.DataFrame(data)
metacritic_table.to_csv("Metacritic.csv")