# TMDB Get Data

Workflow: 4  

Goal: To start with the reelgood movie titles and get TMDB data for them all.   

Result: The file `moviedb_movie.csv` is created.  

In [1]:
import os
import time
import random as RD
import pandas as PD

In [2]:
# Show the value of every top-level expression in a cell, not only the last one
# (this is why cells below can end with several bare expressions such as .head() and .shape).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Add ../models to the module search path so that moviedb_helper can be imported from there.
# NOTE(review): the original remark was "can't reload script in a different folder" —
# presumably reloading the helper after edits does not work with this layout; confirm.

import sys
sys.path.append('../models')
import moviedb_helper as MH 

## Get Data For All Reelgood Movies 

In [4]:
# Load the Reelgood movie list; its 'title' and 'year' columns are what the TMDB lookup loop reads.
URL_FILE = '../data/reelgood_movie.csv'
url_df = PD.read_csv(URL_FILE)
# Quick sanity check: first rows and (rows, columns)
url_df.head()
url_df.shape

Unnamed: 0,title,year,duration,rating,country,poster,genres,tags,imdb_score,rt_score,synopsis,services
0,1,2013,112.0,13+ (PG-13),America,https://img.reelgood.com/content/movie/9b61df6...,"Documentary, Mystery, Thriller, Science-Fictio...","Car, Racing",8.0,,Set in the golden era of Grand Prix Racing '1'...,"{""reelgood"": ""1-2013"", ""amazon"": ""B0751Q3J8P""}"
1,"10,000 BC",2008,109.0,13+ (PG-13),South Africa,https://img.reelgood.com/content/movie/84ff615...,"Action & Adventure, Drama, Fantasy, History","Egypt, Animal, Fighting, Hunting",5.1,8%,A prehistoric epic that follows a young mammot...,"{""reelgood"": ""10000-bc-2008"", ""netflix"": ""7006..."
2,1000 Rupee Note,2016,89.0,,,https://img.reelgood.com/content/movie/08248c1...,Drama,,7.3,80%,Poor Parobudhi receives a thousand rupee note ...,"{""reelgood"": ""1000-rupee-note-2016"", ""netflix""..."
3,1000 To 1,2014,99.0,,,https://img.reelgood.com/content/movie/dcc9187...,"Drama, Biography",,6.9,,Cory Weissman is a college basketball player w...,"{""reelgood"": ""1000-to-1-2014"", ""amazon"": ""B07D..."
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,51.0,,,https://img.reelgood.com/content/movie/d2a26a5...,Documentary,,6.3,,A cultural analysis of what causes zine maker...,"{""reelgood"": ""100-and-a-tshirt-a-documentary-a..."


(18498, 12)

In [5]:
# Look up every Reelgood title on TMDB and collect the returned records in movie_ls.
# A lookup that raises is printed and skipped, so one failing title does not stop the run.
client = MH.MovieDBHelper()
movie_ls = []
t0 = time.time()

for idx, row in url_df.iterrows():
    # progress marker every 500 rows
    if not idx % 500:
        print(f'row: {idx}')

    title, year = row['title'], row['year']
    try:
        # two-step lookup: resolve (title, year) to a TMDB id, then fetch the record for that id
        tmdb_id = client.get_movie_id(title, year)
        movie_dx = client.get_movie_by_id(tmdb_id)
    except Exception as ex:
        print(ex)
    else:
        movie_ls.append(movie_dx)

# report how many records were collected and the wall-clock time in hours
t1 = time.time()
print(f'movies: {len(movie_ls)}')
print(f'time: {(t1-t0)/60/60:.2f} hrs')

row: 0
No movie found: $100 and a T-Shirt: A Documentary About Zines in the Northwest (2004).


KeyboardInterrupt: 

In [None]:
# Projected total run time = (elapsed time / movies fetched so far) * total number of rows.
# Left commented out; it needs t0, t1 and a non-empty movie_ls from the fetch loop.

#print(f'projected time: {(t1-t0)/len(movie_ls)*url_df.shape[0]/60/60:.2f} hrs')

In [None]:
# Turn the records collected in movie_ls into a DataFrame and inspect its contents and dtypes.
movie_df = PD.DataFrame(movie_ls)
movie_df.head()
movie_df.info()

In [None]:
# Write the fetched TMDB data to CSV (without the index column) and
# show the size of the written file in megabytes.
save_path = r'../data/moviedb_movie.csv'
movie_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

## Fix Up The File

In [35]:
# turns out the full file has many errors not present in the subsample file
# Int64 is the nullable int type

movie_df = PD.read_csv('../data/moviedb_movie_1.csv', dtype={'title': str})
movie_df.head(2)
movie_df.info()

Unnamed: 0,title,original_title,year,companies,country,language,run_time,crew,cast,poster,genres,collection,synopsis,budget,gross,score,votes,tmdb_id,imdb_id
0,1,1,2013.0,"Exclusive Media, Flat-Out Films",United States of America,English,112,"Paul Crowder, Mark Monroe, Michael Shevloff","Niki Lauda, Michael Schumacher, Lewis Hamilton",/4uIPXX8DjTsCzUAdtMKHTpojYLq.jpg,Documentary,,Set in the golden era of Grand Prix Racing '1'...,,,7.4,59.0,217316.0,tt2518788
1,"10,000 BC","10,000 BC",2008.0,"Centropolis Entertainment, Legendary Entertain...",United States of America,English,109,"Roland Emmerich, Sarah Bradshaw, Tom Karnowski","Steven Strait, Camilla Belle, Cliff Curtis",/rnGR3EHkL4ryhQd50XBrtRrV8nq.jpg,"Adventure, Action, Drama, Fantasy",,A prehistoric epic that follows a young mammot...,105000000.0,266000000.0,5.3,1766.0,7840.0,tt0443649


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197766 entries, 0 to 197765
Data columns (total 19 columns):
title             197766 non-null object
original_title    33925 non-null object
year              33925 non-null float64
companies         11603 non-null object
country           13587 non-null object
language          33925 non-null object
run_time          32765 non-null object
crew              32977 non-null object
cast              16017 non-null object
poster            33553 non-null object
genres            16124 non-null object
collection        1189 non-null object
synopsis          33794 non-null object
budget            2865 non-null float64
gross             1910 non-null float64
score             17683 non-null float64
votes             17683 non-null float64
tmdb_id           17683 non-null float64
imdb_id           17471 non-null object
dtypes: float64(6), object(13)
memory usage: 28.7+ MB


In [36]:
# Drop rows that are exact duplicates of an earlier row, then check the remaining row count.
movie_df = movie_df.drop_duplicates()
movie_df.shape

(17675, 19)

In [37]:
# Keep only the rows that have a TMDB id.
# .notna() is the direct form of `.isna()==False`; .copy() makes the result an independent
# frame so that assigning columns to it later does not raise pandas' SettingWithCopyWarning.
movie_df = movie_df.loc[movie_df['tmdb_id'].notna()].copy()
movie_df.shape

(17668, 19)

In [50]:
# Cast the numeric columns to integer types in one pass.
# 'Int64' (capital I) is pandas' nullable integer dtype, so columns with missing values
# (e.g. budget, gross) can still be stored as integers; 'year' is cast to a plain int.
int_dtypes = {'year': 'int'}
int_dtypes.update({col: 'Int64' for col in ['run_time', 'budget', 'gross', 'votes', 'tmdb_id']})
movie_df = movie_df.astype(int_dtypes)

movie_df.head()
movie_df.info()

Unnamed: 0,title,original_title,year,companies,country,language,run_time,crew,cast,poster,genres,collection,synopsis,budget,gross,score,votes,tmdb_id,imdb_id
0,1,1,2013,"Exclusive Media, Flat-Out Films",United States of America,English,112.0,"Paul Crowder, Mark Monroe, Michael Shevloff","Niki Lauda, Michael Schumacher, Lewis Hamilton",/4uIPXX8DjTsCzUAdtMKHTpojYLq.jpg,Documentary,,Set in the golden era of Grand Prix Racing '1'...,,,7.4,59,217316,tt2518788
1,"10,000 BC","10,000 BC",2008,"Centropolis Entertainment, Legendary Entertain...",United States of America,English,109.0,"Roland Emmerich, Sarah Bradshaw, Tom Karnowski","Steven Strait, Camilla Belle, Cliff Curtis",/rnGR3EHkL4ryhQd50XBrtRrV8nq.jpg,"Adventure, Action, Drama, Fantasy",,A prehistoric epic that follows a young mammot...,105000000.0,266000000.0,5.3,1766,7840,tt0443649
2,1000 Rupee Note,Ek Hazarachi Note,2016,,India,Marathi (Marāṭhī),89.0,"Shrihari Sathe, Shrikant Bojewar",Sandeep Pathak,/pNNxwXAReV4kh7TCZGqBrl9I72v.jpg,Drama,,Poor Parobudhi receives a thousand rupee note ...,,,6.9,7,318654,tt2937158
3,1000 To 1,1000 To 1,2014,,United States of America,English,99.0,Michael Levine,"David Henrie, Hannah Marks, Luke Kleintank",/sGKWrPbYykh6H943XQODN5N8wxi.jpg,Drama,,Cory Weissman is a college basketball player w...,,,5.9,19,268245,tt2391950
4,100% arabica,100% arabica,1997,,,French,,Mahmoud Zemmouri,,/f5oXyQ1EZqb6gGlKZzxDoI161dQ.jpg,Drama,,"The movie takes place in a poverty-stricken, r...",,,6.0,1,99800,tt0156245


<class 'pandas.core.frame.DataFrame'>
Int64Index: 17668 entries, 0 to 197765
Data columns (total 19 columns):
title             17668 non-null object
original_title    17668 non-null object
year              17668 non-null int64
companies         11589 non-null object
country           13571 non-null object
language          17667 non-null object
run_time          16507 non-null Int64
crew              16720 non-null object
cast              16002 non-null object
poster            17296 non-null object
genres            16109 non-null object
collection        1189 non-null object
synopsis          17537 non-null object
budget            2863 non-null Int64
gross             1908 non-null Int64
score             17668 non-null float64
votes             17668 non-null Int64
tmdb_id           17668 non-null Int64
imdb_id           17456 non-null object
dtypes: Int64(5), float64(1), int64(1), object(12)
memory usage: 2.8+ MB


In [51]:
# Write the cleaned table to CSV (without the index column) and
# show the size of the written file in megabytes.
save_path = r'../data/moviedb_movie.csv'
movie_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'9.65 mb'