# API CALLS and Save results

In [1]:
# Install tmdbsimple (only need to run once)
!pip install tmdbsimple



In [2]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, json, time
from tqdm.notebook import tqdm_notebook

In [3]:
#Load TMDB API Key & Add to tmdbsimple
with open('/Users/kellyji/.secret/tmdb_api.json','r') as f:
    login=json.load(f)

login.keys()

dict_keys(['api-key', 'API Read Access Token'])

In [4]:
# Import tmdbsimple and setting the API keys
import tmdbsimple as tmdb
tmdb.API_KEY = login['api-key']

In [5]:
FOLDER='Data/'
os.listdir(FOLDER)

['rating.csv',
 'basics.csv',
 'movieERD.png',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'moviesqlscript.mwb',
 'title-akas-us-only.csv',
 'moviesqlscript.sql',
 '.ipynb_checkpoints']

In [6]:
# Define customized functions
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    
    movie_info = movie.info()
    releases = movie.releases()
    
    for c in releases['countries']:
        if c['iso_3166_1' ] =='US':
            movie_info['certification'] = c['certification']
    return movie_info


def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        file_data = json.load(file)
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        file.seek(0)
        json.dump(file_data, file)

In [7]:
# Load in the cleaned Title Basics
basics=pd.read_csv('Data/basics.csv')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [9]:
# Check API function
get_movie_with_rating('tt0848228')

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 153.756,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [10]:
get_movie_with_rating('tt0332280')

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 89.271,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

- Confirmed API function is working properly.

In [43]:
YEAR = 2001
errors = []
file_exists = os.path.isfile(JSON_FILE_2001)
JSON_FILE_2001 = f'{FOLDER}tmdb_api_results_{YEAR}.json'

if file_exists == False:
    print(f'Creating{JSON_FILE_2001} for API results for year = {YEAR}.')
    with open (JSON_FILE_2001, 'w') as f:
        json.dump([{'imdb_id':0}],f)

else:
    print(f'The file {JSON_FILE_2001} already exists.')
    


The file Data/tmdb_api_results_2001.json already exists.


In [45]:
#filtering for movies from selected startYear
df2001 = basics.loc[basics['startYear']==YEAR].copy()
movie_ids = df2001['tconst']
movie_ids

0        tt0035423
10       tt0114447
14       tt0116916
18       tt0118589
19       tt0118652
           ...    
84623    tt9071078
85100    tt9212730
85135    tt9228234
85957    tt9555974
86015    tt9578462
Name: tconst, Length: 1576, dtype: object

In [39]:
# Load existing data from json into a dataframe called 'previous_df'
previous_df = pd.read_json(JSON_FILE_2001)
previous_df

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/tJLV3BAlHOgscVOrA99Wnb2gAef.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.325,1243.0,PG-13
2,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.0,3.0,
3,tt0116916,0.0,/rFpHBidSlhjflmnLu7BZilyKeQR.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,73549.0,en,The Dark Mist,...,0.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Dark Mist,0.0,3.5,2.0,PG
4,tt0118589,0.0,/9NZAirJahVilTiDNCHLFcdkwkiy.jpg,,22000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,10696.0,en,Glitter,...,5271666.0,104.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"In music she found her dream, her love, herself.",Glitter,0.0,4.405,132.0,PG-13
5,tt0118652,0.0,/mWxJEFRMvkG4UItYJkRDMgWQ08Y.jpg,,1000000.0,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,17140.0,en,The Attic Expeditions,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,His search for peace of mind... will leave his...,The Attic Expeditions,0.0,5.156,32.0,R
6,tt0119004,0.0,/r7bxHKEzgrURm9qhEaAayIMG4Xi.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,37857.0,en,Don's Plum,...,6297.0,108.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Tonight's Special - Group Therapy,Don's Plum,0.0,5.3,74.0,
7,tt0120166,0.0,/havCE85OV7FUMWzqAZ9x31XRRCA.jpg,,0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",,50944.0,en,The Sorcerer's Apprentice,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Sorcerer's Apprentice,0.0,4.6,11.0,NR
8,tt0120624,0.0,/gaVOAGPq2LRvwmtUI6dLiBpnql3.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,50230.0,pt,Camarate,...,0.0,100.0,"[{'english_name': 'Portuguese', 'iso_639_1': '...",Released,,Camarate,0.0,7.286,7.0,
9,tt0120681,0.0,/xo2S7gRwCvWdVqM0Swv37yA2rzw.jpg,,35000000.0,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,768.0,en,From Hell,...,74558115.0,122.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Only the legend will survive.,From Hell,0.0,6.686,2609.0,R


In [40]:
#filter out any ids that are alrady in the JSON_FILE_2001
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

In [44]:
# Loop through movie_ids with a tqdm progress bar
for movie_id in tqdm_notebook(movie_ids_to_get, f'Movies from {YEAR}'):
    try:
        temp= get_movie_with_rating(movie_id)
        write_json(temp,JSON_FILE_2001)
        time.sleep(0.02)

    except Exception as e:
        errors.append([movie_id,e])

Movies from 2001:   0%|          | 0/1548 [00:00<?, ?it/s]

In [46]:
print(f'-Total errors: {len(errors)}')

-Total errors: 217


In [48]:
# Save the results to a csv.gz file
final_year_df1 = pd.read_json(JSON_FILE_2001)
csv_fname= f'{FOLDER}final_tmdb_data_{YEAR}.csv.gz'
final_year_df1.to_csv(csv_fname, compression='gzip', index=False)

In [84]:
final_year_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360 entries, 0 to 1359
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                1360 non-null   object 
 1   adult                  1359 non-null   float64
 2   backdrop_path          757 non-null    object 
 3   belongs_to_collection  100 non-null    object 
 4   budget                 1359 non-null   float64
 5   genres                 1359 non-null   object 
 6   homepage               1359 non-null   object 
 7   id                     1359 non-null   float64
 8   original_language      1359 non-null   object 
 9   original_title         1359 non-null   object 
 10  overview               1359 non-null   object 
 11  popularity             1359 non-null   float64
 12  poster_path            1232 non-null   object 
 13  production_companies   1359 non-null   object 
 14  production_countries   1359 non-null   object 
 15  rele

---

In [70]:
# Set the year to filter for 
YEAR1 = 2002
# Create an empty list for saving errors
errors =[]

In [52]:
# Define the JSON file to store results for the year
JSON_FILE_2002 = f'{FOLDER}tmdb_api_results_{YEAR1}.json'
file_exists = os.path.isfile(JSON_FILE_2002)

if file_exists == False:
    print(f'Creating {JSON_FILE_2002} for API results for year={YEAR1}.')
    with open (JSON_FILE_2002, 'w') as f:
        json.dump([{'imdb_id':0}],f)

else:
    print(f'The file {JSON_FILE_2002} already exists.')

Creating Data/tmdb_api_results_2002.json for API results for year=2002.


In [59]:
# Filtering for movies from selected startYear
df2002=basics.loc[basics['startYear']==YEAR1].copy()
movie_ids_2002 = df2002['tconst']
movie_ids_2002

4        tt0096056
23       tt0118926
32       tt0119980
44       tt0120679
51       tt0120804
           ...    
82658    tt8474326
83742    tt8825252
84317    tt8993336
86310    tt9683502
86850    tt9874290
Name: tconst, Length: 1572, dtype: object

In [79]:
len(movie_ids_2002)

1572

In [62]:
# Load existing data from json into a dataframe called 'previous_df2002'
previous_df1 = pd.read_json(JSON_FILE_2002)
previous_df1

Unnamed: 0,imdb_id
0,0


In [85]:
movie_ids_to_get_2002 = movie_ids_2002[~movie_ids_2002.isin(previous_df1['imdb_id'])]
movie_ids_to_get_2002

4        tt0096056
23       tt0118926
32       tt0119980
44       tt0120679
51       tt0120804
           ...    
82658    tt8474326
83742    tt8825252
84317    tt8993336
86310    tt9683502
86850    tt9874290
Name: tconst, Length: 1572, dtype: object

In [89]:
for movie_id in tqdm_notebook(movie_ids_to_get_2002, f'Movies from {YEAR1}'):
    try:
        temp = get_movie_with_rating(movie_id)
        write_json(temp,JSON_FILE_2002)
        time.sleep(0.02)

    except Exception as e:
        errors.append([movie_id,e])

Movies from 2002:   0%|          | 0/1572 [00:00<?, ?it/s]

In [94]:
final_year_df2 = pd.read_json(JSON_FILE_2002)

In [95]:
final_year_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                1296 non-null   object 
 1   adult                  1295 non-null   float64
 2   backdrop_path          786 non-null    object 
 3   belongs_to_collection  122 non-null    object 
 4   budget                 1295 non-null   float64
 5   genres                 1295 non-null   object 
 6   homepage               1295 non-null   object 
 7   id                     1295 non-null   float64
 8   original_language      1295 non-null   object 
 9   original_title         1295 non-null   object 
 10  overview               1295 non-null   object 
 11  popularity             1295 non-null   float64
 12  poster_path            1182 non-null   object 
 13  production_companies   1295 non-null   object 
 14  production_countries   1295 non-null   object 
 15  rele

In [96]:
csv_fname1 = f'{FOLDER}final_tmdb_data_{YEAR1}.csv.gz'
final_year_df2.to_csv(csv_fname1, compression='gzip', index=False)

---

## Combine All API results

In [97]:
import glob
tmdb_files = sorted(glob.glob('Data/final_tmdb_data*.csv.gz'))
tmdb_files

['Data/final_tmdb_data_2001.csv.gz', 'Data/final_tmdb_data_2002.csv.gz']

In [98]:
# Combine with concat to load all files
df=pd.concat([pd.read_csv(f, lineterminator='\n') for f in tmdb_files])

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2656 entries, 0 to 1295
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2656 non-null   object 
 1   adult                  2654 non-null   float64
 2   backdrop_path          1543 non-null   object 
 3   belongs_to_collection  222 non-null    object 
 4   budget                 2654 non-null   float64
 5   genres                 2654 non-null   object 
 6   homepage               213 non-null    object 
 7   id                     2654 non-null   float64
 8   original_language      2654 non-null   object 
 9   original_title         2654 non-null   object 
 10  overview               2596 non-null   object 
 11  popularity             2654 non-null   float64
 12  poster_path            2414 non-null   object 
 13  production_companies   2654 non-null   object 
 14  production_countries   2654 non-null   object 
 15  rele

In [100]:
#Save final merged csv as 'tmdb_results_combined.csv.gz'
fname = f'{FOLDER}tmdb_results_combined.csv.gz'
df.to_csv(fname, compression='gzip',index=False)