## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
%matplotlib inline

In [2]:
# define all dataframes for later

# gross profits from Box Office Movies (an IMDB subsidiary)
# gross_df = pd.read_csv('data/zipped_files/bom.movie_gross.csv.gz')

# the movie database movie database
# movies_df = pd.read_csv('data/zipped_files/tmdb.movies.csv.gz')

# movie budgets and gross profits from The Numbers
budgets_df = pd.read_csv('data/zipped_files/tn.movie_budgets.csv.gz')

### vvv probably a 99% chance these ones don't get used vvv ###

# tsv files have to be read with \t (tab spcaes) as their delimiter
# info_df = pd.read_csv('data/zipped_files/rt.movie_info.tsv.gz',delimiter='\t')
# file has an issue with encoding, read as cp850 or it throws an error
# reviews_df = pd.read_csv('data/zipped_files/rt.reviews.tsv.gz', delimiter='\t', encoding='cp850')

### IM.db database structure
![database schema flow chart](./images/db_schema.jpeg)

In [5]:
#################################
# REQUIRES UNZIPPING data/zipped_files/im.db.zip
# INTO DIRECOTRY data/unzipped AS im.db
#################################
conn = sqlite3.connect('data/unzipped/im.db')
# call this later => imdb_df = pd.read_sql('''<QUERY>''',conn)

In [6]:
### GETTING NAMES OF ALL TABLE NAMES
tables = pd.read_sql(
"""
SELECT * FROM sqlite_master
""", conn)

In [7]:
# create imdb_directors with director names and movie IDs
imdb_directors = pd.read_sql("""
SELECT d.movie_id, p.primary_name FROM directors as d
LEFT JOIN persons AS p
    ON d.person_id = p.person_id
GROUP BY d.movie_id
""", conn)

In [10]:
# attach each possible movie title with their respective movie id,
# lets us look up the potentially mismatched titles from budgets_df
# doesn't need to be distinguished into original_ or primary_title
# beyond this
movie_akas = pd.read_sql("""
SELECT DISTINCT movie_id, title FROM movie_akas
""",conn)

In [11]:
# list-ifys the movie titles and movie IDs 
movie_akas_list = list(movie_akas['title'])
movie_ids_list = list(movie_akas['movie_id'])

In [12]:
budgets_df['movie_id'] = [movie_ids_list[movie_akas_list.index( title )] if title in movie_akas_list else None for title in budgets_df['movie']]

In [13]:
budgets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
 6   movie_id           2502 non-null   object
dtypes: int64(1), object(6)
memory usage: 316.3+ KB


In [14]:
len(budgets_df['movie_id'].unique())

2407

In [15]:
# budgets_df['fixed_titles'] = budgets_df['movie'].map(akas_dict)

In [16]:
budgets_df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,movie_id
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279",tt1775309
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",tt1298650
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",tt6565702
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",tt2395427
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747",
...,...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0,tt7837402
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495",
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338",
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0,tt2107644


In [17]:
recent_imdb_movies = pd.read_sql("""
SELECT * FROM movie_basics
WHERE CAST(start_year AS int) BETWEEN 2013 AND 2023
""", conn)

In [18]:
recent_imdb_movies.sort_values('start_year')

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
17314,tt2971400,Loving Memory,Loving Memory,2013,,Drama
17313,tt2971388,308,308,2013,121.0,"Horror,Mystery,Thriller"
17312,tt2971128,"River Deep, Mountain High: James Nesbitt in Ne...","River Deep, Mountain High: James Nesbitt in Ne...",2013,,Documentary
17311,tt2970792,Wechselspiel,Wechselspiel,2013,73.0,Drama
...,...,...,...,...,...,...
5999,tt1757678,Avatar 3,Avatar 3,2023,,"Action,Adventure,Drama"
74209,tt6495056,Untitled Illumination Entertainment Project,Untitled Illumination Entertainment Project,2023,,
2392,tt10298848,Untitled Disney Live-Action Project,Untitled Disney Live-Action Project,2023,,
70264,tt6258542,Wraith of the Umbra and Eidolon II,Wraith of the Umbra and Eidolon II,2023,,"Adventure,Drama,Fantasy"


In [23]:
budgets_df.dropna(subset=["movie_id"],axis=0,inplace=True)

In [24]:
# convert dollar amounts (str) into int
budgets_df['domestic_gross'] = budgets_df['domestic_gross'].str.replace('$', '').str.replace(',', '').astype(int)
budgets_df['worldwide_gross'] = budgets_df['worldwide_gross'].str.replace('$', '').str.replace(',', '').astype(int)
budgets_df['production_budget'] = budgets_df['production_budget'].str.replace('$', '').str.replace(',', '').astype(int)
# calculate foreign gross based off worldwide - domestic values
budgets_df['foreign_gross'] = budgets_df['worldwide_gross'] - budgets_df['domestic_gross']

In [25]:
budgets_df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,movie_id,foreign_gross
0,1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279,tt1775309,2015837654
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,tt1298650,804600000
2,3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350,tt6565702,107000000
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963,tt2395427,944008095
6,7,"Apr 27, 2018",Avengers: Infinity War,300000000,678815482,2048134200,tt4154756,1369318718
...,...,...,...,...,...,...,...,...
5767,68,"Jul 6, 2001",Cure,10000,94596,94596,tt2027234,0
5769,70,"Apr 1, 1996",Bang,10000,527,527,tt6616538,0
5772,73,"Jan 13, 2012",Newlyweds,9000,4584,4584,tt1880418,0
5777,78,"Dec 31, 2018",Red 11,7000,0,0,tt7837402,0


In [26]:
movies_with_gross_df = budgets_df.merge(recent_imdb_movies, on="movie_id", how='inner')
#recent_imdb_movies.merge(recent_gross_df,left_on='primary_title',right_on='title',how='inner')
movies_with_gross_df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,movie_id,foreign_gross,primary_title,original_title,start_year,runtime_minutes,genres
0,3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350,tt6565702,107000000,Dark Phoenix,Dark Phoenix,2019,113.0,"Action,Adventure,Sci-Fi"
1,4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963,tt2395427,944008095,Avengers: Age of Ultron,Avengers: Age of Ultron,2015,141.0,"Action,Adventure,Sci-Fi"
2,7,"Apr 27, 2018",Avengers: Infinity War,300000000,678815482,2048134200,tt4154756,1369318718,Avengers: Infinity War,Avengers: Infinity War,2018,149.0,"Action,Adventure,Sci-Fi"
3,9,"Nov 17, 2017",Justice League,300000000,229024295,655945209,tt0974015,426920914,Justice League,Justice League,2017,120.0,"Action,Adventure,Fantasy"
4,10,"Nov 6, 2015",Spectre,300000000,200074175,879620923,tt2379713,679546748,Spectre,Spectre,2015,148.0,"Action,Adventure,Thriller"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,62,"Dec 31, 2014",Stories of Our Lives,15000,0,0,tt3973612,0,Stories of Our Lives,Stories of Our Lives,2014,60.0,Drama
1521,67,"Apr 28, 2006",Clean,10000,138711,138711,tt6619196,0,Clean,Clean,2017,70.0,"Comedy,Drama,Horror"
1522,70,"Apr 1, 1996",Bang,10000,527,527,tt6616538,0,Bang,Bang,2015,,
1523,78,"Dec 31, 2018",Red 11,7000,0,0,tt7837402,0,Red 11,Red 11,2019,77.0,"Horror,Sci-Fi,Thriller"


In [385]:
fixed_titles.dropna(subset=['movie_id']).sort_values('domestic_gross',ascending=False)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,movie_id,title,primary_title,original_title
780,43,"Aug 3, 2018",Christopher Robin,"$75,000,000","$99,215,042","$197,504,758",tt4575576,Christopher Robin,Christopher Robin,Christopher Robin
868,8,"Jun 13, 1997",Hercules,"$70,000,000","$99,112,101","$250,700,000",tt3499424,Hercules,Hercules Reborn,Hercules Reborn
869,8,"Jun 13, 1997",Hercules,"$70,000,000","$99,112,101","$250,700,000",tt3985956,Hercules,Hercules,Hercules
867,8,"Jun 13, 1997",Hercules,"$70,000,000","$99,112,101","$250,700,000",tt1267297,Hercules,Hercules,Hercules
870,9,"Mar 22, 2013",Olympus Has Fallen,"$70,000,000","$98,927,592","$172,878,928",tt2302755,Olympus Has Fallen,Olympus Has Fallen,Olympus Has Fallen
...,...,...,...,...,...,...,...,...,...,...
6101,69,"Sep 18, 1967",Point Blank,"$3,000,000",$0,$0,tt7381042,Point Blank,Point Blank,Point Blank
6104,72,"Aug 14, 2015",Amnesiac,"$3,000,000",$0,$0,tt2837336,Amnesiac,Amnesiac,Amnesiac
6105,72,"Aug 14, 2015",Amnesiac,"$3,000,000",$0,$0,tt2693114,Amnesiac,Amnesiac,Amnesiac
6106,73,"Dec 31, 2015",Unnatural,"$3,000,000",$0,$0,tt4373974,Unnatural,Beyond the Bridge,Beyond the Bridge


In [324]:
# movie_akas.groupby(by='movie_id').agg(lambda x: list(x))