In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import lxml

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# written under that directory are stored in Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Inspect the Web page**

From the TMdb popular movies web page, we need to parse each movie's name, user rating and URL. Then, from the individual movie pages, we need to parse the release date, genres, runtime and director name.


All of the information on a web page is encoded in HTML (HyperText Markup Language), which has a basic structure and order. We use HTML tags and their attributes to parse the page and locate specific details in it.


On any web page, Chrome users can right-click on the page and choose “Inspect” to examine the HTML code behind it. A panel will appear, either at the bottom or on the right side of the window (depending on the settings), with a long list of nested HTML tags. To find the tag associated with the information needed, select the detail (e.g. the movie name), right-click and choose “Inspect” again; the corresponding element will be highlighted in blue. You can then click through the HTML tags to identify the one that holds the item of interest, here the movie name.

**Load the Webpage using Requests**

The landing page of TMdb movies page consists of a list of popular movies. We can click on each of the movie items and navigate to the individual movie page to get more details on each movie.

Each page contains 20 movies. From the landing page, we will parse the list of movies, user ratings, and movie URLs. In the browser, further pages are reached by clicking the ‘Load More’ button; in code, we request them directly by adding a `?page=N` query parameter to the listing URL.

In [None]:
def get_movies_page(movies_url, timeout=30):
    """
    Download a web page using `requests` and parse it with BeautifulSoup.

    Args:
        movies_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        The page parsed into a BeautifulSoup document (using the `lxml` parser).

    Raises:
        Exception: If the response status code is not 200.
    """
    # Access the webpage using `requests`; connection errors and timeouts
    # propagate from `requests.get` unchanged.
    response = requests.get(movies_url, timeout=timeout)
    # Check if the request was successful
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(movies_url))
    # Parse the response text using BeautifulSoup
    return BeautifulSoup(response.text, 'lxml')

We can use **h2.a.text.strip()** to retrieve the name of each movie from its `h2` heading.


In [None]:
def get_movies_names(doc):
    """
    Extract the movie names from a parsed listing page.

    Args:
        doc: Parsed listing page (BeautifulSoup document).

    Returns:
        A list with the stripped link text of every `<h2>` heading found inside
        the first `div` with class `page_wrapper`, in page order. Headings that
        do not contain a link are skipped, using the same rule as
        `get_movies_urls` so that names and URLs stay aligned.

    Raises:
        AttributeError: If the page has no `div` with class `page_wrapper`.
    """
    # Restricting the search to the page wrapper leaves out headings that
    # appear elsewhere on the page.
    headings = doc.find('div', {'class': 'page_wrapper'}).find_all('h2')
    # `h2.a` is None for a heading without a link; skip it instead of crashing.
    return [h2.a.text.strip() for h2 in headings if h2.a is not None]

The movie ratings are stored in the `data-percent` attribute of each `div` tag with the class `user_score_chart`.

In [None]:
def get_movies_rating(doc):
    """
    Collect the user rating of every movie on a parsed listing page.

    Each rating is read from the `data-percent` attribute of a `div` with the
    class `user_score_chart`; the values are returned exactly as they appear
    in the markup, in page order.
    """
    score_divs = doc.find_all('div', {'class': 'user_score_chart'})
    # One entry per score chart found on the page
    return [score_div.attrs['data-percent'] for score_div in score_divs]

In [None]:
def get_movies_urls(doc):
    """
    Extract the movie page links from a parsed listing page.

    Args:
        doc: Parsed listing page (BeautifulSoup document).

    Returns:
        A list of absolute URLs, one per `<h2>` heading inside the first `div`
        with class `page_wrapper`, in page order. Headings that do not contain
        a link are skipped, using the same rule as `get_movies_names` so that
        names and URLs stay aligned.

    Raises:
        AttributeError: If the page has no `div` with class `page_wrapper`.
    """
    base_url = 'https://www.themoviedb.org'
    headings = doc.find('div', {'class': 'page_wrapper'}).find_all('h2')
    movies_urls = []
    for heading in headings:
        link = heading.a
        # `heading.a` is None for a heading without a link; skip it instead of crashing.
        if link is None:
            continue
        # The href is site-relative, so prefix it with the site root.
        movies_urls.append(base_url + link['href'])
    return movies_urls

In [None]:
def _find_director(doc):
    """
    Return the director's name from a parsed movie page, or '' if none is found.
    """
    # NOTE(review): assumes the featured crew is rendered as `li.profile`
    # entries holding the person's link plus a `p.character` role line such as
    # "Director, Writer" -- confirm against the live page markup.
    for entry in doc.find_all('li', class_='profile'):
        role = entry.find('p', class_='character')
        link = entry.find('a')
        if role is None or link is None:
            continue
        jobs = [job.strip() for job in role.text.split(',')]
        if 'Director' in jobs:
            return link.text.strip()

    # Fallback: the previous lookup, i.e. the first line of text in the first
    # scroller block. In the recorded results this produced cast names
    # (e.g. "Brad Pitt"), so it is only used when no crew entry is found.
    scrollers = doc.find_all('div', {'class': 'scroller_wrap should_fade is_fading'})
    if not scrollers:
        return ''
    return scrollers[0].text.strip().partition("\n")[0]


def get_movies_info(doc):
    """
    Extract release date, genres, runtime and director from a parsed movie page.

    Args:
        doc: Parsed movie page (BeautifulSoup document).

    Returns:
        A tuple `(release_date, genre, runtime, director)`:
        release_date -- stripped text of the `span.release` element in the
                        facts block, or '' when absent;
        genre        -- list with the stripped text of every link inside
                        `span.genres` (empty list when absent);
        runtime      -- stripped text of the `span.runtime` element in the
                        facts block, or '' when absent;
        director     -- name of the first crew member credited as "Director",
                        or '' when nothing can be found.

    Missing elements yield empty values instead of raising, so that one
    incomplete movie page does not abort a whole scraping run.
    """
    facts = doc.find('div', class_='facts')

    def span_text(css_class):
        # Text of a <span> inside the facts block, or '' if it is missing.
        span = facts.find('span', class_=css_class) if facts is not None else None
        return span.text.strip() if span is not None else ''

    release_date = span_text('release')
    runtime = span_text('runtime')

    genres_span = doc.find('span', class_='genres')
    genre = []
    if genres_span is not None:
        genre = [link.text.strip() for link in genres_span.find_all('a')]

    return release_date, genre, runtime, _find_director(doc)

In [None]:
# Quick check: download a single movie page and show the tuple
# (release_date, genre, runtime, director) that get_movies_info extracts from it.
get_movies_info(get_movies_page("https://www.themoviedb.org/movie/798544"))

('(JP)', ['Drama,', 'Mystery,'], ['Fantasy', '40m'], 'Chiaki Kobayashi')

In [None]:
def get_all_movies_details(urls):
    """
    Download every movie page in `urls` and collect its details.

    Args:
        urls: Iterable of movie page URLs.

    Returns:
        A tuple of four lists `(genres, release_dates, runtimes, directors)`,
        each holding one string per URL, in the same order as `urls`.
    """
    genres = []
    release_dates = []
    runtimes = []
    directors = []

    # Loop through all the urls of the movies
    for url in urls:
        movie_doc = get_movies_page(url)
        # get_movies_info returns release_date, genre, runtime, director.
        release_date, genre, runtime, director = get_movies_info(movie_doc)
        # Genres arrive as a list of names; join them with ", " so that
        # multi-word genres such as "Science Fiction" remain distinguishable.
        genres.append(genre if isinstance(genre, str) else ", ".join(genre))
        release_dates.append(release_date)
        # The runtime is already a string such as "1h 51m"; joining a string
        # would insert a space between every character, so only join when a
        # list of tokens is received.
        runtimes.append(runtime if isinstance(runtime, str) else " ".join(runtime))
        directors.append(director)

    return genres, release_dates, runtimes, directors

In [None]:
# Quick check: fetch the listing page, collect its movie URLs and show the
# details gathered from each of those movie pages.
get_all_movies_details(get_movies_urls(get_movies_page('https://www.themoviedb.org/movie')))

(['Horror, Thriller, Romance',
  'Horror, Thriller',
  'Science Fiction',
  'Action, Comedy, Thriller',
  'Action, Adventure, Fantasy',
  'Thriller',
  'Fantasy, Comedy, Family',
  'Drama, Action, Thriller',
  'Thriller, Mystery',
  'Science Fiction, Adventure, Action',
  'Action, Fantasy, Adventure',
  'Fantasy, Adventure, Family',
  'Animation, Action, Science Fiction',
  'Action, Thriller, Drama',
  'Crime, Drama',
  'Adventure, Family, TV Movie',
  'Fantasy, Action, Comedy',
  'Thriller, Adventure, Horror',
  'Horror, Thriller, Mystery',
  'Romance, Drama'],
 ['10/14/2022',
  '07/27/2022',
  '01/06/2022',
  '08/05/2022',
  '06/24/2022',
  '08/12/2022',
  '09/30/2022',
  '09/09/2022',
  '09/23/2022',
  '08/12/2022',
  '10/21/2022',
  '09/08/2022',
  '10/18/2022',
  '09/23/2022',
  '10/05/2022',
  '10/06/2022',
  '07/08/2022',
  '08/19/2022',
  '09/15/2022',
  '09/07/2022'],
 ['1h 51m',
  '1h 39m',
  '1h 38m',
  '2h 7m',
  '2h 23m',
  '1h 47m',
  '1h 47m',
  '1h 37m',
  '1h 47m',
  '

In [None]:
def scrape_movies(num_pages=7):
    """
    Scrape the TMdb popular movies listing together with each movie's detail page.

    Args:
        num_pages: Number of listing pages to scrape, starting at page 1.
            The default of 7 matches the pages 1-7 that were scraped before
            this became a parameter.

    Returns:
        A pandas DataFrame with one row per movie and the columns
        `name`, `rating`, `genre`, `release_date`, `runtime`, `director`, `url`.

    Raises:
        Exception: If a page cannot be loaded (raised by `get_movies_page`).
        ValueError: If a listing page yields different numbers of names,
            ratings and URLs, which would pair values from different movies.
    """
    # Define lists for all the movie attributes
    all_names = []
    all_ratings = []
    all_genres = []
    all_release_dates = []
    all_runtimes = []
    all_directors = []
    all_urls = []

    for page_number in range(1, num_pages + 1):
        movies_url = "https://www.themoviedb.org/movie?page=%d" % page_number
        # Reuse the shared download-and-parse helper instead of repeating it here
        doc = get_movies_page(movies_url)

        names = get_movies_names(doc)
        ratings = get_movies_rating(doc)
        urls = get_movies_urls(doc)
        # Check the listing before downloading any detail page: rows are built
        # by position, so unequal lengths would silently misalign the columns.
        if not (len(names) == len(ratings) == len(urls)):
            raise ValueError(
                'Inconsistent listing on {}: {} names, {} ratings, {} urls'.format(
                    movies_url, len(names), len(ratings), len(urls)))

        genres, release_dates, runtimes, directors = get_all_movies_details(urls)

        # Append each movie attribute to respective lists
        all_names += names
        all_ratings += ratings
        all_genres += genres
        all_release_dates += release_dates
        all_runtimes += runtimes
        all_directors += directors
        all_urls += urls

    # Defining a dictionary to store the movie information
    movies_dict = {
        'name': all_names,
        'rating': all_ratings,
        'genre': all_genres,
        'release_date': all_release_dates,
        'runtime': all_runtimes,
        'director': all_directors,
        'url': all_urls
    }
    return pd.DataFrame(movies_dict)

In [None]:
# Run the full scrape and display the resulting DataFrame as the cell output.
scrape_movies()

Unnamed: 0,name,rating,genre,release_date,runtime,director,url
0,Halloween Ends,69.0,"Horror, Thriller, Romance",10/14/2022,1h 51m,Jamie Lee Curtis,https://www.themoviedb.org/movie/616820
1,Orphan: First Kill,68.0,"Horror, Thriller",07/27/2022,1h 39m,Isabelle Fuhrman,https://www.themoviedb.org/movie/760161
2,Project Gemini,55.0,Science Fiction,01/06/2022,1h 38m,Egor Koreshkov,https://www.themoviedb.org/movie/575322
3,Bullet Train,75.0,"Action, Comedy, Thriller",08/05/2022,2h 7m,Brad Pitt,https://www.themoviedb.org/movie/718930
4,Fullmetal Alchemist: The Final Alchemy,64.0,"Action, Adventure, Fantasy",06/24/2022,2h 23m,Ryosuke Yamada,https://www.themoviedb.org/movie/960704
...,...,...,...,...,...,...,...
135,365 Days: This Day,59.0,"Romance, Drama",04/27/2022,1h 51m,Anna-Maria Sieklucka,https://www.themoviedb.org/movie/829557
136,Harry Potter and the Philosopher's Stone,79.0,"Adventure, Fantasy",11/16/2001,2h 32m,Daniel Radcliffe,https://www.themoviedb.org/movie/671
137,Ivy + Bean: The Ghost That Had to Go,50,"Family, Comedy",09/02/2022,1h 2m,Keslee Blalock,https://www.themoviedb.org/movie/1014676
138,After,72.0,"Romance, Drama",04/12/2019,1h 46m,Josephine Langford,https://www.themoviedb.org/movie/537915


In [None]:
# Scrape the movies and store the result as a CSV file in Google Drive.
df = scrape_movies()

from pathlib import Path
# Use the absolute path under the Drive mount point (/content/drive). A relative
# path depends on the current working directory, and mkdir(parents=True) would
# silently create a local folder instead of writing to Drive.
filepath = Path('/content/drive/MyDrive/Colab Notebooks/myfile.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath)