# Importing Libraries

In [1]:
import json, os, math, time
import pandas as pd
import numpy as np
import tmdbsimple as tmdb
from tqdm import tqdm_notebook

# File Goals
This file is going to retrieve the budget, revenue, and MPAA rating from TMDB using an api call for the data in the basics file. We will pull the data for 2000 and 2001 as a proof of concept, and save the results in a separate csv.gz file

In [2]:
# Loading my API Key for TMDB
with open('/Users/Ray/.secret/tmdb_api.json', 'r') as f:
    key = json.load(f)

key.keys()

dict_keys(['api-key'])

In [3]:
tmdb.API_KEY = key['api-key']

In [4]:
# Checking/creating the data folder
folder = "Data/"
os.makedirs(folder, exist_ok = True)
os.listdir(folder)

['.ipynb_checkpoints',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz']

I see the 3 files from the previous notebook in the folder

# Functions for API Calls

## Get Movie

In [5]:
# Defining a function to pull movie info with rating
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    info = movie.info()
    releases = movie.releases()
    for c in releases['countries']:
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']
    return info

In [6]:
# Testing the function with The Avengers
get_movie_with_rating("tt0848228")

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 110.144,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [7]:
# Testing the function with The Notebook
get_movie_with_rating('tt0332280')

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 54.214,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/5ThIuO93vsk47oexKTSdfKEr7EC.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

## Write Json

In [8]:
def write_json(new_data, filename):
    
    with open(filename, 'r+') as file:
        file_data = json.load(file)
        
        if (type(new_data)==list) & (type(file_data)==list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        
        file.seek(0)
        
        json.dump(file_data, f)

# Putting together the loop

In [9]:
# Pulling the basics file to parse through ID's and StartYears
basics = pd.read_csv('Data/title_basics.csv.gz')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [10]:
# Defining my years to get
years = [2000,2001]
# Empty list to hold api call errors
errors = []

In [11]:
# Start of Outer Loop
for year in tqdm_notebook(years, desc = 'Years', position = 0):
    
    # Defining the json file to store results in
    json_file = f'{folder}tmdb_api_results_{year}.json'
    file_exists = os.path.isfile(json_file)
    # Creating the file if it doesn't exist
    if file_exists == False:
        with open(json_file, 'w') as f:
            json.dump([{'imdb_id':0}],f)
    
    # Creating a dataframe filtered by the year
    df = basics.loc[basics['startYear'] == year].copy()
    # Taking the ID's from the dataframe
    movie_ids = df['tconst'].copy()
    
    # Storing the previous results in a dataframe 
    previous_results = pd.read_json(json_file)
    # Choosing only Id's that are NOT in the previous results
    ids_to_get = movie_ids[~movie_ids.isin(previous_results['imdb_id'])]
    
    # Start of Inner loop
    for movie_id in tqdm_notebook(ids_to_get, desc = f'Movies from {year}',
                                 position = 1, leave = True):
        try:
            # Using the prebuilt function to pull the attributes
            temp = get_movie_with_rating(movie_id)
            # Append results to existing file using the prebuit function
            write_json(temp, json_file)
            
            time.sleep(.02)
        
        except Exception as e:
            errors.append([movie_id, e])
    
    # Saving the data
    final_df = pd.read_json(json_file)
    final_df.to_csv(f'{folder}final_tmdb_data_{year}.csv.gz', compression = 'gzip',
                   index = False)

print(f'[info] Total errors: {len(errors)}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for year in tqdm_notebook(years, desc = 'Years', position = 0):


Years:   0%|          | 0/2 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie_id in tqdm_notebook(ids_to_get, desc = f'Movies from {year}',


Movies from 2000:   0%|          | 0/1448 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie_id in tqdm_notebook(ids_to_get, desc = f'Movies from {year}',


Movies from 2001:   0%|          | 0/1571 [00:00<?, ?it/s]

KeyboardInterrupt: 