# Project 2 Part 3A: Extract from TMDB (core)

*Christina Brockway*

## Business Problem:
-  Produce a MySQL database of movie data to analyze what makes a movie successful.
-  Provide recommendations to stakeholder on how to make a successful movie.

### Issues, Need More Data:
-  Use TMDB database for financial data
-  Extract Budget, Revenue, and MPAA Rating (Certification)
-  Test API using 2001 and 2002 movies
-  Save each year separately

#### Imports

In [1]:
#Import packages
import os, time, json
import tmdbsimple as tmdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm_notebook

In [2]:
#Folder that holds the input files and every file this notebook writes
FOLDER = 'MovieData/'
#Pass the path string itself: wrapping it in braces builds a set, which
#os.makedirs and os.listdir reject with a TypeError
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints', 'basics_data.csv.gz', 'ratings_data.csv.gz']

#### Load API Key

In [3]:
#Read the TMDB credentials from the current user's home directory rather than
#a machine-specific absolute path, so the notebook is not tied to one account
with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    login = json.load(f)
#Display only the key names so the secret value never appears in the output
login.keys()

dict_keys(['api_key'])

In [4]:
#Hand the loaded key to tmdbsimple through its module-level API_KEY setting
tmdb.API_KEY = login['api_key']

#### Define Functions and variables

In [5]:
#Release years whose movies will be requested from TMDB
GET_YEARS = [2001, 2002]

#Starts empty; failed API calls are recorded here
errors = list()

In [6]:
def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB details and attach its US certification.

    Args:
        movie_id: Identifier passed straight to ``tmdb.Movies`` (this notebook
            supplies IMDB ids such as "tt0848228").

    Returns:
        dict: The dictionary returned by ``movie.info()`` with an added
        ``'certification'`` key. The key is always present so every record
        has the same schema; it is ``None`` when the release data contains
        no US entry.

    Raises:
        Any exception raised by ``movie.info()`` or ``movie.releases()`` is
        propagated unchanged; callers decide how to handle failed lookups.
    """
    #Get movie object using movie_id
    movie = tmdb.Movies(movie_id)
    #Save the dictionaries
    movie_info = movie.info()
    releases = movie.releases()

    certification = None
    #Loop through countries for only US
    for country in (releases.get('countries') or []):
        if country.get('iso_3166_1') != 'US':
            continue
        us_cert = country.get('certification')
        #A country can appear more than once; do not let a blank entry
        #overwrite a rating that was already found
        if us_cert or certification is None:
            certification = us_cert

    movie_info['certification'] = certification
    return movie_info


In [7]:
def write_json(new_data, filename):
    """Append a record, or a list of records, to the JSON list stored in a file.

    The file is read in full, updated in memory, and rewritten from the start.

    Args:
        new_data: A single record, or a list of records. A list is merged
            into the stored list item by item; anything else is appended as
            one element.
        filename: Path of an existing JSON file whose top-level value is a list.

    Raises:
        FileNotFoundError: If ``filename`` does not exist (opened with 'r+').
        json.JSONDecodeError: If the file does not contain valid JSON.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/
    """
    with open(filename, 'r+') as file:
        #Load existing data
        file_data = json.load(file)
        #choose to extend or append
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        #rewind so the new content replaces the old from the first byte
        file.seek(0)
        #convert back to json
        json.dump(file_data, file)
        #drop leftover bytes when the new text is shorter than the old
        #(e.g. the file was pretty-printed); otherwise the JSON is corrupted
        file.truncate()

#### Confirm API Function works

In [8]:
#Two known ids used as a quick check that the API helper works
test = ["tt0848228", "tt0332280"]
#Fetch each movie and collect the returned dictionaries
results = [get_movie_with_rating(movie_id) for movie_id in test]
#Show the collected records as a table
pd.DataFrame(results)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.711,29282,PG-13
1,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,...,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.881,10691,PG-13


### Load in Data

In [9]:
#Load the filtered title basics table; its 'startYear' and 'tconst' columns
#are used by the extraction loop to pick the movies for each year
basics = pd.read_csv("data/basics-filtered.csv")
#Preview the first two rows
basics.head(2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama


### Create Inner and Outer Loops

In [10]:
##OUTER LOOP - one pass per year in GET_YEARS
for YEAR in tqdm_notebook(GET_YEARS, desc='YEARS', position=0):

    #Path of the json file that accumulates this year's API results
    JSON_MOVIE = f'{FOLDER}tmdb_api_results {YEAR}.json'
    #Check if file exists
    file_exists = os.path.isfile(JSON_MOVIE)

    if file_exists:
        print(f'{JSON_MOVIE} already exists.')
    else:
        #Seed a new file with a placeholder record so it holds a valid json
        #list that can be loaded and appended to
        print(f'Creating json file for API results for {YEAR}')
        with open(JSON_MOVIE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    #Rows of basics whose startYear matches the current YEAR
    df = basics[basics['startYear'] == YEAR].copy()
    #Ids of the movies to request for this year
    movie_ids = df['tconst'].copy()

    #Results already saved to the json file by earlier runs
    previous_df = pd.read_json(JSON_MOVIE)

    #Keep only the ids that are not in the file yet, so a re-run resumes
    needed_mids = movie_ids.loc[~movie_ids.isin(previous_df['imdb_id'])]

    #INNER LOOP - one API request per remaining movie id
    for movie_id in tqdm_notebook(
        needed_mids,
        desc=f'Movies from {YEAR}',
        position=1,
        leave=True,
    ):
        try:
            temp = get_movie_with_rating(movie_id)
            #Save each result immediately so progress survives an interruption
            write_json(temp, JSON_MOVIE)
            #Short pause between requests
            time.sleep(0.02)
        except Exception as e:
            #Record the failure and move on to the next id
            errors.append([movie_id, e])

    #errors is not reset between years, so this is a running total
    print(f' - Total Errors: {len(errors)}')

    #Reload everything saved for the year and export it as a compressed csv
    final_year_df = pd.read_json(JSON_MOVIE)
    final_year_df.to_csv(
        f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
        index=False,
        compression='gzip',
    )

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Creating json file for API results for 2001


Movies from 2001:   0%|          | 0/1576 [00:00<?, ?it/s]

 - Total Errors: 219
Creating json file for API results for 2002


Movies from 2002:   0%|          | 0/1572 [00:00<?, ?it/s]

 - Total Errors: 497
