### Web scraping code for IMDb 'Top 250 movies'

In [27]:
# The website URL: https://www.imdb.com/?ref_=nv_home

from selenium import webdriver 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException 
from selenium.common.exceptions import NoSuchElementException 
from selenium.common.exceptions import ElementClickInterceptedException 
import requests 
from bs4 import BeautifulSoup
from time import sleep

In [28]:
s = Service("C:/Users/Владислав/Desktop/ChromeDriver/chromedriver.exe")
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-notifications")
#chrome_options.add_argument("--headless") # This is an option for those people, who want to scrape info with the 
# possibility of invisible browser
browser = webdriver.Chrome(service = s, options = chrome_options)
browser.maximize_window()
url = "https://www.imdb.com/?ref_=nv_home"
browser.get(url)
browser.implicitly_wait(10)
sleep(3)
menu_button = browser.find_element(By.CSS_SELECTOR, "#imdbHeader-navDrawerOpen--desktop")
menu_button.click() # We are going to click the menu button in order to have the view on all categories
sleep(1) # We give 1 second for the categories to be loaded
# After that, we are going to click the category "Top 250 movies"
rating_button = browser.find_element(By.CSS_SELECTOR, "#imdbHeader > div.ipc-page-content-container.ipc-page-content-container--center.navbar__inner > aside > div > div.drawer__panelContent > div > div:nth-child(1) > span > div > div > ul > a:nth-child(3) > span")
rating_button.click()
sleep(5) # We give 5 seconds for the page to be loaded
# Finally, we followed the link
# So it is high time to scrape the entire rating "Top 250 movies" according to IMDb
# We are going to scrape the following film details: Title, Rating, Year, Genre and Directors

URL_links = [] # First of all, it is planned to store the URL_links of rating films within the list
page = browser.page_source
soup = BeautifulSoup(page)
for link in soup.find_all("td", class_ = "titleColumn"):
    # These URL_links are not complete, this is why it's necessary to add the first URL_segment:
    URL_links.append("https://www.imdb.com/" + link.find("a").get("href"))
    
URL_links[:10]

['https://www.imdb.com//title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2F5DJT5X5XZCQCCRYD55&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1',
 'https://www.imdb.com//title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2F5DJT5X5XZCQCCRYD55&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2',
 'https://www.imdb.com//title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2F5DJT5X5XZCQCCRYD55&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3',
 'https://www.imdb.com//title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2F5DJT5X5XZCQCCRYD55&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4',
 'https://www.imdb.com//title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=2F5DJT5X5XZCQCCRYD55&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5',
 'htt

In [29]:
len(URL_links) # The length of the container is 250, so it can be noted that URL_links of all 250 films were 
# successfully scraped

250

In [30]:
from tqdm import tqdm, trange

film_title = [] # The list will store 250 best film titles
production_year = [] # The list will store years, when 250 best films were produced
film_genre = [] # The list will store genres of 250 best films
film_rating = [] # The list will store scores of 250 best films 
film_directors = [] # The list will store names of film directors

# The next step is to iterate over URl-links and find all the relevant information for aforementioned categories
# through requests library and BeautifulSoup

for url in tqdm(URL_links):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    try:
        title = soup.find_all("div", class_ = "sc-dae4a1bc-0 gwBsXc")[0].get_text().strip()
        title = title.replace("Original title: ", "")
        film_title.append(title)
    except IndexError:
        i = soup.find_all("div", class_ = "sc-94726ce4-1 iNShGo")[0].get_text().strip()
        j = soup.find_all("div", class_ = "sc-94726ce4-3 eSKKHi")[0].get_text().strip()
        title = i.replace(j, "")
        film_title.append(title)
    year = soup.find_all("span", class_ = "sc-8c396aa2-2 itZqyK")[0].get_text().strip()
    production_year.append(year)
    genre = soup.find_all("ul", class_ = "ipc-inline-list ipc-inline-list--show-dividers baseAlt")[0].get_text().strip()
    film_genre.append(genre)
    score = soup.find_all("span", class_ = "sc-7ab21ed2-1 jGRxWM")[0].get_text().strip()
    film_rating.append(score)
    director = soup.find_all("a", class_ = "ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link")[0].get_text().strip()
    film_directors.append(director)

100%|████████████████████████████████████████████████████████████████████████████████| 250/250 [06:06<00:00,  1.47s/it]


In [31]:
print(len(film_title)) # Titles of 250 best films were successfully scraped
film_title[:20]

250


['The Shawshank Redemption',
 'The Godfather',
 'The Dark Knight',
 'The Godfather: Part II',
 '12 Angry Men',
 "Schindler's List",
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Il buono, il brutto, il cattivo',
 'Forrest Gump',
 'Fight Club',
 'Inception',
 'The Lord of the Rings: The Two Towers',
 'The Empire Strikes Back',
 'The Matrix',
 'Goodfellas',
 "One Flew Over the Cuckoo's Nest",
 'Se7en',
 'Shichinin no samurai']

In [32]:
print(len(production_year)) # Production years of 250 best films were successfully scraped
production_year[:20]

250


['1994',
 '1972',
 '2008',
 '1974',
 '1957',
 '1993',
 '2003',
 '1994',
 '2001',
 '1966',
 '1994',
 '1999',
 '2010',
 '2002',
 '1980',
 '1999',
 '1990',
 '1975',
 '1995',
 '1954']

In [33]:
print(len(film_genre)) # Genres of 250 best films were successfully scraped
film_genre[:20]

250


['Drama',
 'Crime',
 'Action',
 'Crime',
 'Crime',
 'Biography',
 'Action',
 'Crime',
 'Action',
 'Adventure',
 'Drama',
 'Drama',
 'Action',
 'Action',
 'Action',
 'Action',
 'Biography',
 'Drama',
 'Crime',
 'Action']

In [34]:
print(len(film_rating)) # Scores of 250 best films were successfully scraped
film_rating[:20]

250


['9.3',
 '9.2',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '8.9',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.6',
 '8.6']

In [35]:
print(len(film_directors)) # Directors of 250 best films were successfully scraped
film_directors[:20]

250


['Frank Darabont',
 'Francis Ford Coppola',
 'Christopher Nolan',
 'Francis Ford Coppola',
 'Sidney Lumet',
 'Steven Spielberg',
 'Peter Jackson',
 'Quentin Tarantino',
 'Peter Jackson',
 'Sergio Leone',
 'Robert Zemeckis',
 'David Fincher',
 'Christopher Nolan',
 'Peter Jackson',
 'Irvin Kershner',
 'Lana Wachowski',
 'Martin Scorsese',
 'Milos Forman',
 'David Fincher',
 'Akira Kurosawa']

In [36]:
aggregation_list = list(zip(film_title, production_year, film_genre, film_rating, film_directors))
aggregation_list[:20] # All results are presented as a combined list of categories

[('The Shawshank Redemption', '1994', 'Drama', '9.3', 'Frank Darabont'),
 ('The Godfather', '1972', 'Crime', '9.2', 'Francis Ford Coppola'),
 ('The Dark Knight', '2008', 'Action', '9.0', 'Christopher Nolan'),
 ('The Godfather: Part II', '1974', 'Crime', '9.0', 'Francis Ford Coppola'),
 ('12 Angry Men', '1957', 'Crime', '9.0', 'Sidney Lumet'),
 ("Schindler's List", '1993', 'Biography', '9.0', 'Steven Spielberg'),
 ('The Lord of the Rings: The Return of the King',
  '2003',
  'Action',
  '9.0',
  'Peter Jackson'),
 ('Pulp Fiction', '1994', 'Crime', '8.9', 'Quentin Tarantino'),
 ('The Lord of the Rings: The Fellowship of the Ring',
  '2001',
  'Action',
  '8.8',
  'Peter Jackson'),
 ('Il buono, il brutto, il cattivo',
  '1966',
  'Adventure',
  '8.8',
  'Sergio Leone'),
 ('Forrest Gump', '1994', 'Drama', '8.8', 'Robert Zemeckis'),
 ('Fight Club', '1999', 'Drama', '8.8', 'David Fincher'),
 ('Inception', '2010', 'Action', '8.8', 'Christopher Nolan'),
 ('The Lord of the Rings: The Two Towers

In [26]:
import pandas as pd 

# The next stage is to create the dataset itself through Pandas 
with open(r"IMDb Dataset - Top 250 movies.csv", mode = "w", encoding="utf-8") as file:
    for a, b, c, d, e in aggregation_list: # 'For loop' to iterate over all categories within the aggregation_list
        file.write(a + ";" + b + ";" + c + ";" + d + ";" + e + "\n")
        
columns = ["Title", "Year", "Genre", "Rating", "Director"]
df = pd.read_csv("IMDb Dataset - Top 250 movies.csv", delimiter = ";", names = columns)
df.head(20)

Unnamed: 0,Title,Year,Genre,Rating,Director
0,The Shawshank Redemption,1994,Drama,9.3,Frank Darabont
1,The Godfather,1972,Crime,9.2,Francis Ford Coppola
2,The Dark Knight,2008,Action,9.0,Christopher Nolan
3,The Godfather: Part II,1974,Crime,9.0,Francis Ford Coppola
4,12 Angry Men,1957,Crime,9.0,Sidney Lumet
5,Schindler's List,1993,Biography,9.0,Steven Spielberg
6,The Lord of the Rings: The Return of the King,2003,Action,9.0,Peter Jackson
7,Pulp Fiction,1994,Crime,8.9,Quentin Tarantino
8,The Lord of the Rings: The Fellowship of the Ring,2001,Action,8.8,Peter Jackson
9,"Il buono, il brutto, il cattivo",1966,Adventure,8.8,Sergio Leone
