### Web scraping code for Metacritic "Movies of All Time" (Top 1000 films)

In [184]:
# The website URL: https://www.metacritic.com/

from selenium import webdriver
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.common.exceptions import TimeoutException 
from selenium.common.exceptions import NoSuchElementException 
from selenium.common.exceptions import ElementClickInterceptedException 
from time import sleep
from tqdm import tqdm, trange
from datetime import datetime
from bs4 import BeautifulSoup 
import requests 
import pandas as pd

In [185]:
s = Service("C:/Users/Владислав/Desktop/ChromeDriver/chromedriver.exe")
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-notifications")
#chrome_options.add_argument("--headless") # This is an option for those people, who want to scrape info with the 
# possibility of invisible browser
browser = webdriver.Chrome(service = s, options = chrome_options)
browser.maximize_window()
browser.get("https://www.metacritic.com/browse/movies/score/metascore/all/filtered?sort=desc")
browser.implicitly_wait(10)

start = datetime.now() # It might be interesting to measure the time of scraping the URL_links

page = browser.page_source
soup = BeautifulSoup(page)

all_URL_links = []

# To scrape all the URL_links we are going to use Selenium and BeautifulSoup
# Selenium will help to click the next button in order to change the page of the movie rating 
# Simultaneously each page of Metacritic will be analyzed by Requests library and BeautifulSoup for extracting 
# relevant URL_links

for link in soup.find_all("td", class_ = "clamp-summary-wrap"):
    all_URL_links.append("https://www.metacritic.com/" + link.find_all("a")[1].get("href"))

while True:
    try:
        WebDriverWait(browser, 30).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR,
                                        "#main_content > div.browse.movie.new_releases > div.content_after_header > div > div.next_to_side_col > div > div.marg_top1 > div > div > div.page_flipper > span.flipper.next > a"))
        )
        next_page = browser.find_element(By.CSS_SELECTOR,
                                         "#main_content > div.browse.movie.new_releases > div.content_after_header > div > div.next_to_side_col > div > div.marg_top1 > div > div > div.page_flipper > span.flipper.next > a")
        next_page.click()
        sleep(3)
        
        page = browser.page_source
        soup = BeautifulSoup(page)
        for link in soup.find_all("td", class_ = "clamp-summary-wrap"):
            all_URL_links.append("https://www.metacritic.com/" + link.find_all("a")[1].get("href"))

    except TimeoutException:
        print("Timeout!")
        break
    except NoSuchElementException:
        print("NoSuchElement!")
        break
    except ElementClickInterceptedException:
        print("ElementClickIntercepted!")
        break 

end = datetime.now() - start # The code will show the final time of scraping the URL_links
print(f"All links were successfully scraped for the following time {end}.")
print(f"The total number of URL_links are {len(all_URL_links)}.")
all_URL_links[:20] # The slice will reveal the first 20 links

Timeout!
All links were successfully scraped for the following time 0:36:34.583635.
The total number of URL_links are 14795.


['https://www.metacritic.com//movie/the-godfather',
 'https://www.metacritic.com//movie/citizen-kane',
 'https://www.metacritic.com//movie/rear-window',
 'https://www.metacritic.com//movie/casablanca',
 'https://www.metacritic.com//movie/boyhood',
 'https://www.metacritic.com//movie/three-colors-red',
 'https://www.metacritic.com//movie/vertigo',
 'https://www.metacritic.com//movie/notorious-1946',
 'https://www.metacritic.com//movie/singin-in-the-rain',
 'https://www.metacritic.com//movie/city-lights',
 'https://www.metacritic.com//movie/moonlight-2016',
 'https://www.metacritic.com//movie/intolerance',
 'https://www.metacritic.com//movie/pinocchio-1940',
 'https://www.metacritic.com//movie/touch-of-evil',
 'https://www.metacritic.com//movie/the-lady-vanishes-1938',
 'https://www.metacritic.com//movie/the-treasure-of-the-sierra-madre',
 'https://www.metacritic.com//movie/pans-labyrinth',
 'https://www.metacritic.com//movie/some-like-it-hot',
 'https://www.metacritic.com//movie/north-b

In [279]:
# In total, 14795 URL_links (for 14795 films respectively) were scraped
# As it is going to take a lot of time to extract all the relevent information through each link, 
# it might be better to limit the scraping scale (for example, we can scrape the info for the first best 1000 films)

Film_titles = [] # The list will store film titles
MetaScore = [] # The list will store scores/rating on behalf of critics
UserScore = [] # The list will store scores/rating on behalf of users
Release_year = [] # The list will store film production years
Film_genre = [] # The list will store film genres
Film_directors = [] # The list will store names of people, who directed films
Runtime = [] # The list will store the film duration (in minutes)

# As it is highly possible that some errors/exceptions may be encountered, the extended slice will be taken (not 1000 films but 1200 films)

for link in tqdm(all_URL_links[:1200]):
    url = link
    try:
        myPage = requests.get(url, headers = headers, allow_redirects = False) # The code to avoid multiple redirections
        soup = BeautifulSoup(myPage.content, "html.parser")
        title = soup.h1.get_text().strip()
        Film_titles.append(title) # Film titles
    except AttributeError:
        Film_titles.append(None) 
        
    try:
        meta_score = soup.find_all("a", class_ = "metascore_anchor")[0].get_text().strip()
        MetaScore.append(meta_score) # Score on behalf of critics
    except IndexError:
        MetaScore.append(None) 
        
    try:
        user_score = soup.find_all("a", class_ = "metascore_anchor")[1].get_text().strip()
        UserScore.append(user_score) # Score on behalf of users
    except IndexError:
        UserScore.append(None)
        
    try: 
        year = soup.find_all("span", class_ = "release_year lighter")[0].get_text().strip()
        Release_year.append(year) # Production year
    except IndexError:
        Release_year.append(None)
        
    try: 
        genre = soup.find_all("div", class_ = "genres")[0].get_text().strip()
        # In this case the input info should be cleaned:
        genre = genre.replace("Genre(s):\n\n", "").replace(" ", "").split(",")[0] # As there are different genres for each film,
        # for better understanding it will be much more convenient to pick out only one, the very first genre
        Film_genre.append(genre) # Film genre
    except IndexError:
        Film_genre.append(None)
        
    try: 
        director = soup.find_all("div", class_ = "director")[0].get_text().strip()
        # In this case the input info should be cleaned:
        director = director.replace("Director:\n", "")
        Film_directors.append(director) # Film directors
    except IndexError:
        Film_directors.append(None)
        
    try: 
        duration = soup.find_all("div", class_ = "runtime")[0].get_text().strip()
        # In this case the input info should be cleaned:
        duration = duration.replace("Runtime:\n", "").replace(" min", "")
        Runtime.append(duration) # Film duration
    except IndexError:
        Runtime.append(None)
        
# Progress bar (tqdm) shows the time of scraping all the info in different categories

100%|██████████████████████████████████████████████████████████████████████████████| 1200/1200 [16:11<00:00,  1.24it/s]


In [280]:
aggregation_list = list(zip(Film_titles, Film_genre, Release_year, MetaScore, UserScore, Film_directors, Runtime))
# To create the DataFrame, first of all, we merge different-category elements as a zip_list
aggregation_list[:5]

[('The Godfather',
  'Drama',
  '1972',
  '100',
  '9.2',
  'Francis Ford Coppola',
  '175'),
 (None, None, None, None, None, None, None),
 ('Rear Window', 'Mystery', '1954', '100', '8.7', 'Alfred Hitchcock', '112'),
 ('Casablanca', 'Drama', '1943', '100', '8.8', 'Michael Curtiz', '102'),
 ('Boyhood', 'Drama', '2014', '100', '7.4', 'Richard Linklater', '165')]

In [281]:
with open(r"Metacritic Dataset - Top 1000 movies.csv", mode = "w", encoding="utf-8") as file:
    for a, b, c, d, e, f, g in aggregation_list:
        try:
            file.write(a + ";" + b + ";" + c + ";" + d + ";" + e + ";" + f + ";" + g + "\n")
        except TypeError: # Is is necessary for eliminating None values
            pass

# The column names are the following:        
columns = ["Title", "Genre", "Year", "Metascore", "Userscore", "Director","Duration"]

df = pd.read_csv("Metacritic Dataset - Top 1000 movies.csv", delimiter = ";", names = columns)
df

Unnamed: 0,Title,Genre,Year,Metascore,Userscore,Director,Duration
0,The Godfather,Drama,1972.0,100,9.2,Francis Ford Coppola,175
1,Rear Window,Mystery,1954.0,100,8.7,Alfred Hitchcock,112
2,Casablanca,Drama,1943.0,100,8.8,Michael Curtiz,102
3,Boyhood,Drama,2014.0,100,7.4,Richard Linklater,165
4,Three Colors: Red,Drama,1994.0,100,8.5,Krzysztof Kieslowski,99
...,...,...,...,...,...,...,...
1142,Isle of Dogs,Adventure,2018.0,82,8.0,Wes Anderson,101
1143,Derek DelGaudio’s In & Of Itself,Documentary,2021.0,82,7.9,Frank Oz,90
1144,Mountain,Documentary,2018.0,82,5.5,Jennifer Peedom,74
1145,Certain Women,Drama,2016.0,82,6.1,Kelly Reichardt,107


In [286]:
# Some data cleaning is needed:
# 1 - Some None values still remain in the DataFrame
# 2 - 'Year' column consists of float numbers, not integers, let's correct the data type
# 3 - Some Userscores are unknown - "tbd" meaning "To be determined", so it is necessary to exclude such values as well

df = df.dropna() # Removal of None values
df["Year"] = df["Year"].apply(int) # Data type alteration
df.drop(df[df["Userscore"] == "tbd"].index, inplace = True) # Removal of "tbd" values
df["Userscore"] = df["Userscore"].apply(float) # Data type alteration (object --> float)

In [284]:
df

Unnamed: 0,Title,Genre,Year,Metascore,Userscore,Director,Duration
0,The Godfather,Drama,1972,100,9.2,Francis Ford Coppola,175
1,Rear Window,Mystery,1954,100,8.7,Alfred Hitchcock,112
2,Casablanca,Drama,1943,100,8.8,Michael Curtiz,102
3,Boyhood,Drama,2014,100,7.4,Richard Linklater,165
4,Three Colors: Red,Drama,1994,100,8.5,Krzysztof Kieslowski,99
...,...,...,...,...,...,...,...
1141,My Summer of Love,Drama,2005,82,8.8,Pawel Pawlikowski,86
1142,Isle of Dogs,Adventure,2018,82,8.0,Wes Anderson,101
1143,Derek DelGaudio’s In & Of Itself,Documentary,2021,82,7.9,Frank Oz,90
1144,Mountain,Documentary,2018,82,5.5,Jennifer Peedom,74


In [287]:
df.dtypes
# All data types are adequate now

Title         object
Genre         object
Year           int64
Metascore      int64
Userscore    float64
Director      object
Duration       int64
dtype: object

In [288]:
# The volume of DataFrame will be 1000 rows (Top 1000 films)
df = df.head(1000)
df

Unnamed: 0,Title,Genre,Year,Metascore,Userscore,Director,Duration
0,The Godfather,Drama,1972,100,9.2,Francis Ford Coppola,175
1,Rear Window,Mystery,1954,100,8.7,Alfred Hitchcock,112
2,Casablanca,Drama,1943,100,8.8,Michael Curtiz,102
3,Boyhood,Drama,2014,100,7.4,Richard Linklater,165
4,Three Colors: Red,Drama,1994,100,8.5,Krzysztof Kieslowski,99
...,...,...,...,...,...,...,...
1119,Sorry We Missed You,Drama,2020,82,8.0,Ken Loach,101
1121,Down and Out in Beverly Hills,Comedy,1986,82,7.7,Paul Mazursky,103
1122,Steve Jobs,Biography,2015,82,7.1,Danny Boyle,122
1123,Marley,Biography,2012,82,8.0,Kevin Macdonald,144


In [289]:
# The final step is to preserve the amended and cleaned DataFrame:
df.to_csv("Metacritic Dataset - Top 1000 movies.csv")