In [1]:
# THIS CELL CONNECTS GOOGLE COLAB TO GOOGLE DRIVE.
# IGNORE IT IF YOU ARE NOT WORKING ON GOOGLE COLAB.

#from google.colab import drive

#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# THEMOVIEDB API

# This API provides a dataset about movies with information such as title,
# language, overview, revenue, etc. For details, see the website
# https://www.themoviedb.org/documentation/api.
# At the time of writing, 17-11-2019, this API is FREE.
import os

DISCOVER_BASE_URL = 'https://api.themoviedb.org/3/discover/movie'
MOVIE_DETAILS_BASE_URL = 'https://api.themoviedb.org/3/movie'

# An API key can easily be generated by creating an account on THEMOVIEDB.
# Prefer supplying it through the TMDB_API_KEY environment variable so the
# credential does not have to live in the notebook. The literal is kept only
# as a fallback so that existing runs keep working; an empty variable also
# falls back to it.
API_KEY = os.environ.get('TMDB_API_KEY') or '72528b6684dc6c97ba6a70e28f0d0cbe'

In [0]:
# IMPORT LIBRARY

import requests
import pandas as pd
import time

In [4]:
# TESTING API

# The API response contains the index of the current page, the results (a list
# of movies), the total number of results and the total number of pages.
# The length of the results is fixed: 20 movies in each page.

print('Testing api...')

# Create url
url = f'{DISCOVER_BASE_URL}?api_key={API_KEY}'

# Use the REQUESTS library to request the resource.
# The timeout keeps this cell from hanging forever on a stalled connection.
res = requests.get(url, timeout=30)

# Check the HTTP status BEFORE parsing the body: an error response is not
# guaranteed to carry the JSON document that the checks below expect.
assert res.ok, f'Request failed with HTTP status {res.status_code}'

data = res.json()

# Checking data
assert data['page'] == 1
assert isinstance(data['total_pages'], int)
assert isinstance(data['total_results'], int)
assert isinstance(data['results'], list)

print('It worked!!!')

Testing api...
It worked!!!


In [0]:
# WRITING FUNCTIONS TO GET MOVIE DATA

def collect_movie_details(base_url, api_key, movie_id, fields = None, sleep_time = 1, max_retries = None, timeout = 30):
  """ Collects the data of one movie that is provided by api.

  Rate-limit responses (HTTP 429), server errors (HTTP 5xx) and network
  failures are retried after sleeping 'sleep_time' seconds. Every other error
  response (for example a wrong api key or an unknown movie id) is raised
  immediately, because repeating the same request could never succeed.

  Parameters:
  ----------
  base_url: str
    Base url of this api.
    Example: api.example.com.
  api_key: str
    Api key is generated by user account
  movie_id: int
    Id of movie which is indexed by server.
  fields: list
    List of movie informations that you want from this api.
    Example: ['title', 'overview', 'revenue']
    In default, function will take all informations of movie.
  sleep_time: int
    Number of seconds to sleep program when triggering the limit of api.
    Default is 1.
  max_retries: int or None
    Maximum number of retries after a retryable failure.
    Default is None, which keeps retrying until the request succeeds.
  timeout: float
    Number of seconds to wait for the server before the attempt counts as
    a network failure. Default is 30.

  Returns:
  ----------
  dict
    A dictionary of movie data which consist of all information that you fill
    in 'fields' parameter and default value is None if information is not found.

  Raises:
  ----------
  requests.HTTPError
    If the server answers with a non-retryable error status, or with a
    retryable one after 'max_retries' retries were used up.
  requests.ConnectionError, requests.Timeout
    If the network still fails after 'max_retries' retries were used up.
  """

  url = f'{base_url}/{movie_id}?api_key={api_key}'
  failures = 0

  while True:
    # Getting data
    try:
      res = requests.get(url, timeout=timeout)
    except (requests.ConnectionError, requests.Timeout):
      # Transient network problem: retry unless the budget is exhausted
      failures += 1
      if max_retries is not None and failures > max_retries:
        raise
      time.sleep(sleep_time)
      continue

    if res.ok:
      data = res.json()
      if fields is None:
        return data
      # .get() gives None for a missing field, as promised in the docstring
      return {field: data.get(field) for field in fields}

    failures += 1
    # Only a rate limit (429) or a server-side error (5xx) can go away by waiting
    retryable = res.status_code == 429 or res.status_code >= 500
    out_of_retries = max_retries is not None and failures > max_retries
    if not retryable or out_of_retries:
      # The message deliberately leaves out the url, which contains the api key
      raise requests.HTTPError(
        f'Request for movie {movie_id} failed with HTTP status {res.status_code}',
        response=res,
      )

    time.sleep(sleep_time)


def collect_all_movies(base_url, api_key, fields = None, sleep_time = 1, details_base_url = None, max_retries = None, timeout = 30):
  """ Collects all movies that are provided by api.

  The listing is walked page by page, and the details of every listed movie
  are requested with 'collect_movie_details'. Rate-limit responses (HTTP 429),
  server errors (HTTP 5xx) and network failures on the listing are retried
  after sleeping 'sleep_time' seconds.

  Parameters:
  ----------
  base_url: str
    Base url of this api.
    Example: api.example.com
  api_key: str
    Api key is generated by user account.
  fields: list
    List of movie informations that you want from this api.
    Example: ['title', 'overview', 'revenue']
    In default, function will take all information of movie.
  sleep_time: int
    Number of seconds to sleep program when triggering the limit of api.
    Default is 1.
  details_base_url: str or None
    Base url used to request the details of each movie.
    Default is None, which uses MOVIE_DETAILS_BASE_URL.
  max_retries: int or None
    Maximum number of consecutive retries of one listing page.
    Default is None, which keeps retrying until the request succeeds.
  timeout: float
    Number of seconds to wait for the server before the attempt counts as
    a network failure. Default is 30.

  Returns:
  ----------
  list
    A list of movies.
    In which each object consist of all information that you fill in 'fields' parameter
    and default value is None if information is not found.
    The list is partial (and a warning is issued) when the api refuses a
    later page after some movies were already collected.

  Raises:
  ----------
  requests.HTTPError
    If a listing page fails with a non-retryable status before any movie was
    collected, or with a retryable one after 'max_retries' retries.
  requests.ConnectionError, requests.Timeout
    If the network still fails after 'max_retries' retries were used up.
  """
  import warnings  # local import: only needed for the partial-result warning

  if details_base_url is None:
    details_base_url = MOVIE_DETAILS_BASE_URL

  # List of movies
  all_movies = []

  page_index = 1
  total_pages = None  # unknown until the first page has been read
  failures = 0        # consecutive failures of the current page

  while total_pages is None or page_index <= total_pages:
    # Getting data
    url = f'{base_url}?api_key={api_key}&page={page_index}'
    try:
      res = requests.get(url, timeout=timeout)
    except (requests.ConnectionError, requests.Timeout):
      # Transient network problem: retry unless the budget is exhausted
      failures += 1
      if max_retries is not None and failures > max_retries:
        raise
      time.sleep(sleep_time)
      continue

    if not res.ok:
      failures += 1
      # Only a rate limit (429) or a server-side error (5xx) can go away by waiting
      retryable = res.status_code == 429 or res.status_code >= 500
      if retryable and (max_retries is None or failures <= max_retries):
        time.sleep(sleep_time)
        continue
      if all_movies and not retryable:
        # The api refuses this page although earlier pages worked; keep what
        # was collected instead of retrying forever or throwing it away.
        warnings.warn(
          f'Stopped at page {page_index}: HTTP status {res.status_code}; '
          f'returning the {len(all_movies)} movies collected so far.'
        )
        break
      # The message deliberately leaves out the url, which contains the api key
      raise requests.HTTPError(
        f'Request for page {page_index} failed with HTTP status {res.status_code}',
        response=res,
      )

    failures = 0
    data = res.json()
    total_pages = data['total_pages']
    results = data['results']

    if not results:
      # An empty page means there is nothing left to collect
      break

    # Forward 'fields' and 'sleep_time' so the caller's choices reach every details request
    all_movies += [
      collect_movie_details(details_base_url, api_key, movie['id'], fields=fields, sleep_time=sleep_time)
      for movie in results
    ]
    page_index += 1

  return all_movies

In [0]:
#COLLECTING ALL MOVIES

# Getting movies from api
movies = collect_all_movies(DISCOVER_BASE_URL, API_KEY)

# Converting list of movies to pandas dataframe
movies_df = pd.DataFrame(movies)

# Standardize data
# Each of the columns below is read as a list of objects with a 'name' key;
# the list is flattened into one comma-separated string of names.
def join_names(elements):
  """Joins the 'name' of every object in a list; a non-list value is returned unchanged."""
  if isinstance(elements, list):
    return ', '.join(element['name'] for element in elements)
  return elements

for column in ('genres', 'production_companies', 'production_countries', 'spoken_languages'):
  # Skip columns that are absent (for example when no movie was collected)
  if column in movies_df.columns:
    movies_df[column] = movies_df[column].apply(join_names)

# Rebind instead of mutating in place; errors='ignore' keeps the cell working
# when the column is not present.
movies_df = movies_df.drop(columns='belongs_to_collection', errors='ignore')

In [0]:
# STORING DATA

# Write the movie table to disk as tab-separated text, without the index column.
movies_df.to_csv(
  'all_movies.csv',
  sep='\t',
  index = False,
)

In [0]:
# STORING DATA TO GOOGLE DRIVE
# IGNORE IT IF YOU ARE NOT WORKING ON GOOGLE COLAB

#!cp 'all_movies.csv' 'drive/My Drive/Data Science Course'