In [320]:
import pandas as pd
import gzip as gz
import sqlite3
from zipfile import ZipFile

In [321]:
# Unpack the zipped SQLite database into data/imdb_unzipped.
imdb = 'data/im.db.zip'
# The handle is deliberately NOT called `zip`: a top-level name in a notebook
# stays bound after the `with` block and would shadow the builtin zip() for
# every later cell.
with ZipFile(imdb, 'r') as archive:
    archive.extractall('data/imdb_unzipped')

# Connection to the extracted database; later cells query through `conn`.
conn = sqlite3.connect('data/imdb_unzipped/im.db')

# sqlite_master is SQLite's schema table: one row per table/index, so this
# shows which tables the database contains.
q = """
SELECT *
FROM sqlite_master
"""
pd.read_sql(q, conn)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,movie_basics,movie_basics,2,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
1,table,directors,directors,3,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
2,table,known_for,known_for,4,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
3,table,movie_akas,movie_akas,5,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
4,table,movie_ratings,movie_ratings,6,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,table,persons,persons,7,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,table,principals,principals,8,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,table,writers,writers,9,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [322]:
# Preview the whole movie_basics table, sorted by start_year
q = """
SELECT *
FROM movie_basics
ORDER BY start_year
"""
pd.read_sql(sql=q, con=conn)

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0146592,Pál Adrienn,Pál Adrienn,2010,136.0,Drama
1,tt0154039,So Much for Justice!,Oda az igazság,2010,100.0,History
2,tt0162942,Children of the Green Dragon,A zöld sárkány gyermekei,2010,89.0,Drama
3,tt0230212,The Final Journey,The Final Journey,2010,120.0,Drama
4,tt0312305,Quantum Quest: A Cassini Space Odyssey,Quantum Quest: A Cassini Space Odyssey,2010,45.0,"Adventure,Animation,Sci-Fi"
...,...,...,...,...,...,...
146139,tt6149054,Fantastic Beasts and Where to Find Them 5,Fantastic Beasts and Where to Find Them 5,2024,,"Adventure,Family,Fantasy"
146140,tt3095356,Avatar 4,Avatar 4,2025,,"Action,Adventure,Fantasy"
146141,tt10300398,Untitled Star Wars Film,Untitled Star Wars Film,2026,,Fantasy
146142,tt5637536,Avatar 5,Avatar 5,2027,,"Action,Adventure,Fantasy"


In [323]:
# Keep only movies whose start_year lies in the 10-year window 2012-2021
# (BETWEEN includes both endpoints). original_title is left out of the
# selection and primary_title is exposed as 'Movie Title'.
q = (
    "\n"
    "SELECT movie_id, primary_title AS 'Movie Title', start_year, runtime_minutes, genres\n"
    "FROM movie_basics\n"
    "WHERE start_year BETWEEN 2012 AND 2021\n"
    "ORDER BY start_year \n"
)
movie_clean = pd.read_sql(sql=q, con=conn)

In [324]:
# Genre labels used in movie_clean, in alphabetical order.
# Each label later becomes one indicator column.
list_of_genres = [
    "Action", "Adventure", "Animation", "Biography", "Comedy",
    "Crime", "Documentary", "Drama", "Family", "Fantasy",
    "Game-Show", "History", "Horror", "Music", "Musical",
    "Mystery", "News", "Reality-TV", "Romance", "Sci-Fi",
    "Sport", "Talk-Show", "Thriller", "War", "Western",
]

In [325]:
# Load the gzip-compressed movie budgets CSV (read as latin1).
with gz.open('data/tn.movie_budgets.csv.gz') as f:
    budgets = pd.read_csv(f, encoding='latin1')

# Strip the '$' and ',' characters from the money columns and convert them
# to numbers. regex=False forces a literal replacement: with regex matching,
# "$" is the end-of-string anchor, and whether a one-character pattern is
# treated literally has varied between pandas versions.
cash_columns = ['production_budget', 'domestic_gross', 'worldwide_gross']
for cash_column in cash_columns:
    digits_only = (
        budgets[cash_column]
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
    )
    budgets[cash_column] = pd.to_numeric(digits_only)

# Convert to datetime so the column can be compared against dates below
budgets['release_date'] = pd.to_datetime(budgets['release_date'])

# Create column to evaluate profitability: worldwide gross minus budget
budgets['net_earnings'] = budgets['worldwide_gross'] - budgets['production_budget']

# Keep only releases from 2012-01-01 onward.
# NOTE(review): only a lower bound is applied here, while the movie query
# elsewhere in this notebook also caps start_year at 2021 - confirm whether
# an upper bound is wanted for the budgets too.
recent = budgets.loc[budgets['release_date'] >= '2012-01-01']
recent

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,net_earnings
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,1072413963
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,999721747
5,6,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220,1747311220
6,7,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,1748134200
...,...,...,...,...,...,...,...
5761,62,2014-12-31,Stories of Our Lives,15000,0,0,-15000
5771,72,2015-05-19,Family Motocross,10000,0,0,-10000
5772,73,2012-01-13,Newlyweds,9000,4584,4584,-4416
5777,78,2018-12-31,Red 11,7000,0,0,-7000


In [326]:
# Inner-join the movie table with the recent budget rows on exact title
# match, then discard the columns not needed afterwards (movie, start_year, id).
# NOTE(review): the join key is the title alone, so distinct films that share
# a title will all be paired with the same budget row - confirm this is acceptable.
Genre_Budget = (
    movie_clean
    .merge(recent, how='inner', left_on='Movie Title', right_on='movie')
    .drop(columns=['movie', 'start_year', 'id'])
)
Genre_Budget.tail()

Unnamed: 0,movie_id,Movie Title,runtime_minutes,genres,release_date,production_budget,domestic_gross,worldwide_gross,net_earnings
1888,tt8155288,Happy Death Day 2U,100.0,"Drama,Horror,Mystery",2019-02-13,9000000,28051045,64179495,55179495
1889,tt8266310,Blinded by the Light,117.0,"Biography,Comedy,Drama",2019-08-14,15000000,0,0,-15000000
1890,tt9024106,Unplanned,106.0,"Biography,Drama",2019-03-29,6000000,18107621,18107621,12107621
1891,tt10329540,Rogue City,,,2019-12-31,13000000,0,0,-13000000
1892,tt7504726,Call of the Wild,,"Adventure,Animation,Family",2020-02-21,82000000,0,0,-82000000


In [327]:
# Re-create the genre text as a trailing 'Genres' column and remove the
# original 'genres' column, which effectively moves it to the end of the frame.
Genre_Budget = (
    Genre_Budget
    .assign(Genres=Genre_Budget['genres'])
    .drop(columns=['genres'])
)

In [328]:
# Build one 0/1 indicator column per genre from the comma-separated 'Genres'
# text. The text is split on ',' and whole labels are compared; a substring
# test such as str.contains('Music') would also flag every 'Musical' row.
genre_text = Genre_Budget['Genres']
genre_flags = (
    genre_text
    .str.get_dummies(sep=',')
    # Fix the column set and order; genres that never occur become all-zero.
    .reindex(columns=list_of_genres, fill_value=0)
    .astype(float)
)
# Rows without any genre text stay missing instead of being counted as 0.
genre_flags.loc[genre_text.isna()] = float('nan')

# Replace (rather than append) any genre columns left over from a previous
# run, so re-executing this cell does not create duplicate columns.
Genre_Budget = (
    Genre_Budget
    .drop(columns=list_of_genres, errors='ignore')
    .join(genre_flags)
)

# Correlate values. Only numeric columns are passed on, because
# DataFrame.corr() stopped silently skipping non-numeric columns
# (ids, titles, genre text) in pandas 2.0.
Genre_Budget.select_dtypes(include='number').corr()

Unnamed: 0,runtime_minutes,production_budget,domestic_gross,worldwide_gross,net_earnings,Action,Adventure,Animation,Biography,Comedy,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
runtime_minutes,1.0,0.253128,0.195617,0.224507,0.199602,0.219673,0.148866,-0.099437,0.088989,-0.010004,...,0.009221,-0.034907,-0.021454,0.047374,0.119225,0.047719,,0.019644,0.074805,0.040261
production_budget,0.253128,1.0,0.72322,0.789442,0.669944,0.367541,0.53353,0.216757,-0.059048,-0.020999,...,-0.09707,-0.014463,-0.016371,-0.115381,0.254746,-0.040481,,-0.088197,-0.021786,0.019664
domestic_gross,0.195617,0.72322,1.0,0.94805,0.940625,0.211707,0.367243,0.217706,-0.027626,0.04225,...,-0.055263,-0.017441,-0.013364,-0.072516,0.224331,-0.015493,,-0.065132,-0.033332,-0.019175
worldwide_gross,0.224507,0.789442,0.94805,1.0,0.984593,0.257208,0.42732,0.240104,-0.046595,0.014312,...,-0.058019,-0.014451,-0.012302,-0.080444,0.250632,-0.033137,,-0.051516,-0.029388,-0.017352
net_earnings,0.199602,0.669944,0.940625,0.984593,1.0,0.206355,0.364797,0.228626,-0.039529,0.02329,...,-0.042513,-0.013357,-0.010214,-0.064418,0.230535,-0.028542,,-0.037176,-0.029334,-0.026588
Action,0.219673,0.367541,0.211707,0.257208,0.206355,1.0,0.309203,-0.035467,-0.100081,-0.11513,...,-0.091943,-0.022095,-0.01275,-0.153627,0.253174,-0.017132,,0.048109,-0.010786,0.014702
Adventure,0.148866,0.53353,0.367243,0.42732,0.364797,0.309203,1.0,0.370569,-0.071146,0.083701,...,-0.092806,-0.018591,-0.010728,-0.126075,0.156608,-0.061815,,-0.168041,-0.049426,-0.005125
Animation,-0.099437,0.216757,0.217706,0.240104,0.228626,-0.035467,0.370569,1.0,-0.070403,0.230395,...,-0.064935,-0.009258,-0.005342,-0.076753,-0.056163,-0.035433,,-0.104135,-0.024614,-0.019325
Biography,0.088989,-0.059048,-0.027626,-0.046595,-0.039529,-0.100081,-0.071146,-0.070403,1.0,-0.079146,...,-0.078205,-0.012199,-0.007039,-0.056298,-0.086233,0.106794,,-0.106522,-0.01424,-0.025463
Comedy,-0.010004,-0.020999,0.04225,0.014312,0.02329,-0.11513,0.083701,0.230395,-0.079146,1.0,...,-0.133039,-0.023643,-0.013643,0.158782,-0.125253,-0.049769,,-0.243135,-0.051275,-0.019969


In [329]:
# Top genres for worldwide_gross: rank every numeric column by its correlation
# with worldwide_gross and show the 12 strongest (the list also contains the
# money columns and runtime, not only genres).
# Non-numeric columns are filtered out first because DataFrame.corr() no
# longer drops them by default in pandas 2.0 and would raise on the text columns.
correlation = Genre_Budget.select_dtypes(include='number').corr()
correlation['worldwide_gross'].sort_values(ascending=False).head(12)

worldwide_gross      1.000000
net_earnings         0.984593
domestic_gross       0.948050
production_budget    0.789442
Adventure            0.427320
Action               0.257208
Sci-Fi               0.250632
Animation            0.240104
runtime_minutes      0.224507
Fantasy              0.154777
Musical              0.085785
Family               0.071141
Name: worldwide_gross, dtype: float64

In [330]:
# Top genres for net_earnings: rank every numeric column by its correlation
# with net_earnings and show the 12 strongest (the list also contains the
# money columns and runtime, not only genres).
# Non-numeric columns are filtered out first because DataFrame.corr() no
# longer drops them by default in pandas 2.0 and would raise on the text columns.
correlation = Genre_Budget.select_dtypes(include='number').corr()
correlation['net_earnings'].sort_values(ascending=False).head(12)

net_earnings         1.000000
worldwide_gross      0.984593
domestic_gross       0.940625
production_budget    0.669944
Adventure            0.364797
Sci-Fi               0.230535
Animation            0.228626
Action               0.206355
runtime_minutes      0.199602
Fantasy              0.125282
Musical              0.094591
Family               0.059374
Name: net_earnings, dtype: float64

In [331]:
# Summary statistics (count, mean, std, min, quartiles, max) for the merged
# frame, including the per-genre indicator columns.
Genre_Budget.describe()

Unnamed: 0,runtime_minutes,production_budget,domestic_gross,worldwide_gross,net_earnings,Action,Adventure,Animation,Biography,Comedy,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
count,1748.0,1893.0,1893.0,1893.0,1893.0,1873.0,1873.0,1873.0,1873.0,1873.0,...,1873.0,1873.0,1873.0,1873.0,1873.0,1873.0,1873.0,1873.0,1873.0,1873.0
mean,101.709382,37213640.0,46762170.0,115261800.0,78048160.0,0.233316,0.177256,0.050721,0.084891,0.258409,...,0.073145,0.001602,0.000534,0.099306,0.074212,0.022958,0.0,0.168713,0.011212,0.006941
std,24.380479,50956170.0,80822740.0,216343300.0,178872300.0,0.423054,0.381987,0.219486,0.278793,0.437877,...,0.260443,0.04,0.023106,0.299152,0.262186,0.149809,0.0,0.374599,0.105319,0.083044
min,3.0,1400.0,0.0,0.0,-200237600.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,90.0,4600000.0,102118.0,1200000.0,-2100000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,101.0,18000000.0,15024050.0,30601800.0,9575290.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,115.0,45000000.0,54724700.0,109501100.0,74008790.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,280.0,350000000.0,700059600.0,2048134000.0,1748134000.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
