In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests import get
import numpy as np
from time import sleep
from random import randint
import re

In [2]:
# create empty lists for each variable
titles = []
release = []
runtime = []
directors = []
actors = []
votes = []
grosses = []
rating = []
audience_rating = []
genres = []

pages = np.arange(1, 501, 50)  # allows the scraper to iterate through multiple pages (each page = 50 entries)

for page in pages:   # this loop iterates through multiple web pages 
    # gives the starting web page, and then adds in the elements needed for the crawling program to move onto the URL of the next page
    # for a different year, simply edit the required year within the URL
    page = requests.get('https://www.imdb.com/search/title/?title_type=feature&year=2020-01-01,2020-12-31&sort=num_votes,desc&start=' + str(page) + '&ref_adv')
    bs_movie = BeautifulSoup(page.text, 'html.parser')
    movies = bs_movie.find_all('div', class_='lister-item mode-advanced')
    
    sleep(randint(2,10)) # delays requests to the website by a random amount of seconds so that our scraper doesn't overwhelm the site

    # this function is used in the following function to retrieve the text values (opposed to numerical values)
    def text_value(movie, tag, class_=None):
        if movie.find(tag, class_):
            return movie.find(tag, class_).text
        else:
            return

    # this function is used to find data which may be nested within several layers of generic HTML tags
    # for example, the Runtime and Genres are under the same 'span' class, but different 'a'tags
    # this function finds the correct tags and extracts the wanted value for the particular attribute
    def extract_attribute(soup, tag_1, class_1='', tag_2='', class_2='',
                      text_attribute=True, order=None, nested=False):
        movies = soup.findAll('div', class_='lister-item-content')
        data_list = []
        for movie in movies:
            if text_attribute:
                if nested:
                    data_list.append(nested_text_value(movie, tag_1, class_1, tag_2, class_2, order))
                else:
                    data_list.append(text_value(movie, tag_1, class_1))
            else:
                data_list.append(numeric_value(movie, tag_1, class_1, order))

        return data_list

    # this loop grabs the needed attributes from each entry on a particular page
    for movie in movies:
        title = movie.find('h3', href='').find('a').text
        titles.append(title)
    
        year = movie.find('span', class_='lister-item-year text-muted unbold').text
        year = re.sub(r"[()]", " ", year)
        release.append(year)
    
        minutes = extract_attribute(movie, 'span', 'runtime')
        minutes = str(minutes)[2:-2]
        runtime.append(minutes)
    
        director = movie.find('p', class_='').find_all('a')[0].text
        directors.append(director)
    
        actors.append([a.text for a in movie.find('p', class_='').find_all('a')[1:]])
        
        nv = movie.find_all('span', attrs = {'name':'nv'})
    
        vote = nv[0].text
        votes.append(vote)
    
        gross = nv[1].text if len(nv) > 1 else '-'
        grosses.append(gross)
    
        imdb_rating = movie.find('strong').text
        rating.append(imdb_rating)
    
        l_rating = extract_attribute(movie, 'span', 'certificate')
        l_rating = str(l_rating)[2:-2]
        audience_rating.append(l_rating)
    
        genre = extract_attribute(movie, 'span', 'genre')
        genre = str(genre)[4:-2]
        genres.append(genre)


In [3]:
# create a dictionary to map the lists to columns within a dataframe
df_dict = {'Titles': titles, 'Release': release, 'Runtime': runtime, 'Director': directors, 
          'Actors': actors, 'Votes': votes, 'Earnings': grosses, 'IMDB rating': rating,
          'MPAA rating': audience_rating, 'Genres': genres}
df = pd.DataFrame(df_dict)
df

Unnamed: 0,Titles,Release,Runtime,Director,Actors,Votes,Earnings,IMDB rating,MPAA rating,Genres
0,Tenet,2020,150 min,Christopher Nolan,"[John David Washington, Robert Pattinson, Eliz...",164374,$53.80M,7.7,PG-13,"Action, Sci-Fi"
1,Birds of Prey: And the Fantabulous Emancipatio...,2020,109 min,Cathy Yan,"[Margot Robbie, Rosie Perez, Mary Elizabeth Wi...",155244,$84.16M,6.1,R,"Action, Adventure, Comedy"
2,The Invisible Man,I 2020,124 min,Leigh Whannell,"[Elisabeth Moss, Oliver Jackson-Cohen, Harriet...",153099,$64.91M,7.1,R,"Drama, Horror, Mystery"
3,Extraction,2020,116 min,Sam Hargrave,"[Chris Hemsworth, Bryon Lerum, Ryder Lerum, Ru...",150770,-,6.7,R,"Action, Thriller"
4,Bad Boys for Life,2020,124 min,Adil El Arbi,"[Bilall Fallah, Will Smith, Martin Lawrence, V...",121983,$206.31M,6.6,R,"Action, Comedy, Crime"
...,...,...,...,...,...,...,...,...,...,...
495,Winter,2020,76 min,Sergey Chernikov,"[Nikita Abdulov, Aleksandr Ablyazov, Aleksey D...",839,-,7.1,on,Drama
496,Kaali Khuhi,2020,90 min,Terrie Samundra,"[Sanjeeda Sheikh, Shabana Azmi, Satyadeep Misr...",839,-,3.4,TV-14,"Drama, Horror, Mystery"
497,Psy 3: W imie zasad,2020,126 min,Wladyslaw Pasikowski,"[Boguslaw Linda, Marcin Dorocinski, Cezary Paz...",838,-,6.3,on,"Action, Crime, Drama"
498,Before the Fire,2020,90 min,Charlie Buhler,"[Jenna Lyng Adams, Jackson Davis, Spencer Berr...",836,-,3.8,on,"Drama, Thriller"


In [4]:
df.to_csv(r'C:/Users/Loyd/Desktop/Web Analytics/Project Files/2020_movies.csv', index = False)