# TMDB Get Data

Workflow: 4  

Goal: Start from the Reelgood movie titles and fetch TMDB data for every one of them.  

Result: The file `moviedb_movie.csv` is created.  

In [1]:
import os
import time
import random as RD
import pandas as PD

In [2]:
# Show the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to display e.g. both .head() and .shape.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# moviedb_helper lives in ../models rather than next to this notebook, so that
# folder must be on sys.path for the import below to resolve.

import sys

MODELS_DIR = '../models'
if MODELS_DIR not in sys.path:
    # Guarded so re-running this cell does not pile up duplicate entries.
    sys.path.append(MODELS_DIR)
import moviedb_helper as MH

## Get Data For a Sample of Movies

In [4]:
# Load the Reelgood title list that drives the TMDB lookups below, then show
# its first rows and its (rows, columns) shape as a sanity check.
URL_FILE = '../data/reelgood_url.csv'
url_df = PD.read_csv(filepath_or_buffer=URL_FILE)
url_df.head(n=5)
url_df.shape

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
1,"10,000 BC",2008,10000-bc-2008
2,1000 Rupee Note,2016,1000-rupee-note-2016
3,1000 To 1,2014,1000-to-1-2014
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,100-and-a-tshirt-a-documentary-about-zines-in-...


(18496, 3)

In [5]:
# Fetch TMDB data for a sample of the Reelgood titles and time the run, so the
# cost of processing the whole list can be estimated afterwards.
SAMPLE_SIZE = 100       # leading rows of url_df to fetch; len(url_df) for a full run
PROGRESS_EVERY = 500    # print a progress line whenever the row index hits a multiple

client = MH.MovieDBHelper()
movie_ls = []     # one get_movie_data() result per successful lookup
failed_ls = []    # (title, year, error text) for every lookup that raised
t0 = time.time()

for idx, row in url_df[:SAMPLE_SIZE].iterrows():
    if idx % PROGRESS_EVERY == 0:
        print(f'row: {idx}')

    title = row['title']
    year = row['year']
    try:
        movie_dx = client.get_movie_data(title, year)
    except Exception as ex:
        # Best effort: one failed lookup must not abort the whole batch.
        # Report it and remember it so it can be inspected or retried later.
        print(ex)
        failed_ls.append((title, year, str(ex)))
    else:
        movie_ls.append(movie_dx)

t1 = time.time()
print(f'movies: {len(movie_ls)}')
print(f'time: {(t1-t0)/60:.2f} mins')

No movies found: $100 and a T-Shirt: A Documentary About Zines in the Northwest (2004).
movies: 99
time: 2.41 mins


In [6]:
# Projected time for the full list = (seconds per fetched movie) * (number of titles).
# NOTE: the rate divides by successful lookups only, so time spent on lookups
# that failed slightly inflates the estimate.

if movie_ls:
    secs_per_movie = (t1 - t0) / len(movie_ls)
    print(f'projected time: {secs_per_movie * url_df.shape[0] / 60 / 60:.2f} hrs')
else:
    # Nothing was fetched, so there is no rate to extrapolate from.
    print('projected time: n/a (no movies fetched)')

projected time: 7.50 hrs


In [7]:
# Turn the collected per-movie records into one table, then preview the first
# rows and print the column dtypes / non-null counts.
movie_df = PD.DataFrame(data=movie_ls)
movie_df.head(n=5)
movie_df.info()

Unnamed: 0,title,original_title,year,rating,companies,country,language,runtime,crew,cast,genres,collection,synopsis,budget,gross,tmdb_score,tmdb_votes,tmdb_id,imdb_id
0,1,1,2013,,"Exclusive Media, Flat-Out Films",United States of America,English,112.0,"Paul Crowder, Mark Monroe, Michael Shevloff","Niki Lauda, Michael Schumacher, Lewis Hamilton",Documentary,,Set in the golden era of Grand Prix Racing '1'...,,,7.4,59,217316,tt2518788
1,"10,000 BC","10,000 BC",2008,,"Centropolis Entertainment, Legendary Entertain...",United States of America,English,109.0,"Roland Emmerich, Sarah Bradshaw, Tom Karnowski","Steven Strait, Camilla Belle, Cliff Curtis","Adventure, Action, Drama, Fantasy",,A prehistoric epic that follows a young mammot...,105000000.0,266000000.0,5.3,1765,7840,tt0443649
2,1000 Rupee Note,Ek Hazarachi Note,2016,,,India,Marathi (Marāṭhī),89.0,"Shrihari Sathe, Shrikant Bojewar",Sandeep Pathak,Drama,,Poor Parobudhi receives a thousand rupee note ...,,,6.9,7,318654,tt2937158
3,1000 To 1,1000 To 1,2014,,,United States of America,English,99.0,Michael Levine,"David Henrie, Hannah Marks, Luke Kleintank",Drama,,Cory Weissman is a college basketball player w...,,,5.9,19,268245,tt2391950
4,100% arabica,100% arabica,1997,,,,French,,Mahmoud Zemmouri,,Drama,,"The movie takes place in a poverty-stricken, r...",,,6.0,1,99800,tt0156245


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 19 columns):
title             99 non-null object
original_title    99 non-null object
year              99 non-null object
rating            0 non-null object
companies         58 non-null object
country           71 non-null object
language          99 non-null object
runtime           89 non-null float64
crew              99 non-null object
cast              90 non-null object
genres            99 non-null object
collection        11 non-null object
synopsis          99 non-null object
budget            18 non-null float64
gross             8 non-null float64
tmdb_score        99 non-null float64
tmdb_votes        99 non-null int64
tmdb_id           99 non-null int64
imdb_id           98 non-null object
dtypes: float64(4), int64(2), object(13)
memory usage: 14.8+ KB


In [8]:
# Write the fetched movie table to disk without the index column, then show
# the size of the resulting file (bytes / 1e6, rounded to two decimals).
save_path = r'../data/moviedb_movie.csv'
movie_df.to_csv(save_path, index=False)
size_mb = os.path.getsize(save_path) / 1e6
f'{round(size_mb, 2)} mb'

'0.05 mb'