In [1]:
# Necessary imports
import pandas as pd
import seaborn as sns

In [2]:
import glob
# Use glob to get all filepaths that match the pattern (*=wildcard)
tmdb_files = sorted(glob.glob("Data/final_tmdb_data*.csv.gz"))
tmdb_files

['Data\\final_tmdb_data_2001.csv.gz',
 'Data\\final_tmdb_data_2002.csv.gz',
 'Data\\final_tmdb_data_2010.csv.gz']

In [3]:
# Use read_csv in a list comprehension and combine with concat to load all files
merged = pd.concat([pd.read_csv(f) for f in tmdb_files] )
merged

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0096056,0.0,/95U3MUDXu4xSCmVLtWgargRipDi.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,109809.0,en,Crime and Punishment,...,0.0,126.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Crime and Punishment,0.0,5.385,13.0,
2,tt0118926,0.0,/qR3Dk3ctnrrxkAI6I472RhamIbu.jpg,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,20689.0,en,The Dancer Upstairs,...,5227348.0,132.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"An honest man caught in a world of intrigue, p...",The Dancer Upstairs,0.0,6.200,51.0,
3,tt0119980,0.0,,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,563364.0,en,Random Shooting in LA,...,0.0,91.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Random Shooting in LA,0.0,0.000,0.0,
4,tt0120679,0.0,/s04Ds4xbJU7DzeGVyamccH4LoxF.jpg,,12000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",https://www.miramax.com/movie/frida,1360.0,en,Frida,...,56298474.0,123.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Prepare to be seduced.,Frida,0.0,7.422,1915.0,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2734,tt7851834,0.0,/c4x2XPngdiW8Vmc6EmmTXAgzViJ.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,166919.0,ko,여의도,...,0.0,88.0,"[{'english_name': 'Korean', 'iso_639_1': 'ko',...",Released,,A Friend In Need,0.0,5.000,3.0,
2735,tt8090084,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,516206.0,en,Goodbye Dolly,...,0.0,46.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Goodbye Dolly,0.0,0.000,0.0,
2736,tt8160720,0.0,,,0.0,[],,273502.0,en,Moist Fury,...,0.0,65.0,[],Released,,Moist Fury,0.0,6.000,2.0,
2737,tt9164254,1.0,,,0.0,[],,775304.0,en,"Text, Lies and Video",...,0.0,103.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Download Deception. Upload Revenge.,"Text, Lies and Video",0.0,0.000,0.0,NC-17


In [4]:
# Saving dataframe to folder as csv.gz
merged.to_csv('Data/tmdb_results_combined.csv.gz', index=False)

In [5]:
# As we're moving data to sequel, we need its dtypes for a dictionary
from sqlalchemy.types import *

In [6]:
# Create a schema dictionary using SQLAlchemy datatype objects
merged_dtypes_dict = {'imdb_id': CHAR(10), 
                       'revenue': FLOAT(),
                       'budget': FLOAT(),
                       'certification': CHAR(10)}

In [7]:
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote
# Create connection string using credentials following this format
# connection = "dialect+driver://username:password@host:port/database"
connection = "mysql+pymysql://root:root@localhost/movies"

In [8]:
engine = create_engine(connection)
database_exists(connection)

True

In [9]:
# create a connection to the database with the engine
conn = engine.connect()

In [10]:
# Making new dataframe to put to sequel with only columns we desire
merged_new = merged.copy()
merged_new = merged_new[['imdb_id', 'revenue', 'budget', 'certification']]
merged_new.head()

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0096056,0.0,0.0,
2,tt0118926,5227348.0,0.0,
3,tt0119980,0.0,0.0,
4,tt0120679,56298474.0,12000000.0,R


In [11]:
merged_new.to_sql('tmdb_data', conn, dtype = merged_dtypes_dict, if_exists = 'replace', index = False)

13355

In [19]:
# How many movies had at least some valid financial information (values > 0 for budget OR revenue)?
q = '''SELECT tb.primary_title, tm.revenue
        FROM title_basics AS tb
        RIGHT JOIN tmdb_data AS tm 
        ON tb.tmdb_data_imdb_id = tm.imdb_id;
        '''
pd.read_sql(q, conn)

Unnamed: 0,primary_title,revenue
0,,
1,,0.0
2,,5227350.0
3,,0.0
4,,56298500.0
...,...,...
13350,,0.0
13351,,0.0
13352,,0.0
13353,,0.0


In [13]:
q = '''SELECT * FROM tmdb_data
        LIMIT 5;'''
pd.read_sql(q, conn)

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0096056,0.0,0.0,
2,tt0118926,5227350.0,0.0,
3,tt0119980,0.0,0.0,
4,tt0120679,56298500.0,12000000.0,R


In [14]:
q = '''SELECT * FROM title_basics
        LIMIT 5;'''
pd.read_sql(q, conn)

Unnamed: 0,tconst,primary_title,start_year,runtime,ratings_tconst,tmdb_data_imdb_id
0,tt0035423,Kate & Leopold,2001.0,118,,
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,,
2,tt0069049,The Other Side of the Wind,2018.0,122,,
3,tt0088751,The Naked Monster,2005.0,100,,
4,tt0096056,Crime and Punishment,2002.0,126,,


In [15]:
# How many movies are there in each of the certification categories (G/PG/PG-13/R)?


In [16]:
# What is the average revenue per certification category?


In [17]:
# What is the average budget per certification category?
