In [1]:
import pandas as pd
import sqlite3

In [2]:
# Open the IMDB SQLite database through sqlite3.connect(), the documented
# way to create a connection (rather than instantiating Connection directly).
conn = sqlite3.connect("./data/im.db")

# Creating dataframes from im.db.movie_basics and im.db.movie_ratings



In [3]:
# Query to pull relevant data from im.db: one row per movie released
# 2010-2022 with its basics, rating, and a single director id and writer id.
#
# The joins to `directors` and `writers` produce one row per
# (director, writer) pair for a movie; GROUP BY collapses them back to one row
# per movie. SQLite allows non-aggregated columns in a grouped query but takes
# their values from an arbitrarily selected row of the group, so the
# director/writer that came back was not guaranteed to be stable. MIN() makes
# the pick deterministic (lowest person_id) without changing the row count.
#
# Names are attached afterwards by merging against the persons lookup table
# rather than joining `persons` twice here.

q2 = """
SELECT
    movie_id,
    primary_title,
    start_year,
    runtime_minutes,
    genres,
    MIN(directors.person_id) AS director_id,
    MIN(writers.person_id) AS writer_id,
    movie_ratings.averagerating,
    movie_ratings.numvotes

FROM
    movie_basics
    JOIN
        movie_ratings
            USING(movie_id)
    JOIN
        directors
            USING(movie_id)
    JOIN
        writers
            USING(movie_id)

WHERE
    start_year >= 2010 AND
    start_year <= 2022


GROUP BY
    movie_basics.movie_id
;
"""
q2_result = pd.read_sql(q2, conn)
q2_result

Unnamed: 0,movie_id,primary_title,start_year,runtime_minutes,genres,director_id,writer_id,averagerating,numvotes
0,tt0063540,Sunghursh,2013,175.0,"Action,Crime,Drama",nm0712540,nm0023551,7.0,77
1,tt0069049,The Other Side of the Wind,2018,122.0,Drama,nm0000080,nm0000080,6.9,4517
2,tt0069204,Sabse Bada Sukh,2018,,"Comedy,Drama",nm0611531,nm0347899,6.1,13
3,tt0100275,The Wandering Soap Opera,2017,80.0,"Comedy,Drama,Fantasy",nm0749914,nm0749914,6.5,119
4,tt0137204,Joe Finds Grace,2017,83.0,"Adventure,Animation,Comedy",nm0365480,nm0365480,8.1,263
...,...,...,...,...,...,...,...,...,...
63083,tt9911774,Padmavyuhathile Abhimanyu,2019,130.0,Drama,nm10536451,nm10536451,8.4,365
63084,tt9913084,Diabolik sono io,2019,75.0,Documentary,nm0812850,nm0812850,6.2,6
63085,tt9914286,Sokagin Çocuklari,2019,98.0,"Drama,Family",nm4394529,nm1902682,8.7,136
63086,tt9914642,Albatross,2017,,Documentary,nm5300859,nm5300859,8.5,8


In [4]:
# query to create a mapping table (person_id -> primary_name) so that we can
# replace director/writer ID with real names.
#
# GROUP BY person_id yields one row per id, so the left merges that use this
# table cannot add rows. MIN(primary_name) replaces the bare column: if an id
# ever appears with more than one name, SQLite would otherwise return one of
# them arbitrarily; with MIN() the same name is returned on every run.

q3 = """
SELECT
    person_id,
    MIN(primary_name) AS primary_name

FROM
    persons


GROUP BY
    person_id
;
"""
q3_result = pd.read_sql(q3, conn)
q3_result

Unnamed: 0,person_id,primary_name
0,nm0000002,Lauren Bacall
1,nm0000003,Brigitte Bardot
2,nm0000005,Ingmar Bergman
3,nm0000006,Ingrid Bergman
4,nm0000007,Humphrey Bogart
...,...,...
606643,nm9993494,Amjad Ali
606644,nm9993573,Lakisha Louissaint
606645,nm9993616,Ryan Mac Lennan
606646,nm9993650,Marcin Balcerak


In [5]:
# merge primary_name from the persons table onto our dataframe where director_id matches person_id.
# how='left' keeps every movie row; validate='many_to_one' makes pandas raise
# MergeError if person_id is duplicated in the lookup, which would otherwise
# silently add rows to the result.
directors = q2_result.merge(
    q3_result,
    how='left',
    left_on='director_id',
    right_on='person_id',
    validate='many_to_one',
)

In [6]:
# the name column merged in on director_id becomes director_name
directors = directors.rename(columns={"primary_name": "director_name"})

In [7]:
# merge primary_name from the persons table onto our dataframe where writer_id matches person_id.
# how='left' keeps every movie row; validate='many_to_one' makes pandas raise
# MergeError if person_id is duplicated in the lookup, which would otherwise
# silently add rows to the result.
writers = directors.merge(
    q3_result,
    how='left',
    left_on='writer_id',
    right_on='person_id',
    validate='many_to_one',
)

In [8]:
# the name column merged in on writer_id becomes writer_name
df = writers.rename(columns={"primary_name": "writer_name"})

In [9]:
# drop extraneous columns (the ids and merge keys, now that names are attached).
# errors='ignore' keeps this cell safe to re-run: df is reassigned from itself,
# so on a second run the columns are already gone and a plain drop would raise
# KeyError.
df = df.drop(
    columns=['director_id', 'writer_id', 'person_id_x', 'person_id_y'],
    errors='ignore',
)

In [10]:
# shape should match the length of our original table: the left merges must
# neither add nor drop rows. Check that explicitly instead of relying on
# reading the printed shape.
assert len(df) == len(q2_result), "merging names changed the row count"
print(df.shape)
df.head()

(63088, 9)


Unnamed: 0,movie_id,primary_title,start_year,runtime_minutes,genres,averagerating,numvotes,director_name,writer_name
0,tt0063540,Sunghursh,2013,175.0,"Action,Crime,Drama",7.0,77,Harnam Singh Rawail,Abrar Alvi
1,tt0069049,The Other Side of the Wind,2018,122.0,Drama,6.9,4517,Orson Welles,Orson Welles
2,tt0069204,Sabse Bada Sukh,2018,,"Comedy,Drama",6.1,13,Hrishikesh Mukherjee,Gulzar
3,tt0100275,The Wandering Soap Opera,2017,80.0,"Comedy,Drama,Fantasy",6.5,119,Raoul Ruiz,Raoul Ruiz
4,tt0137204,Joe Finds Grace,2017,83.0,"Adventure,Animation,Comedy",8.1,263,Anthony Harrison,Anthony Harrison


In [11]:
# count of missing values per column; these gaps can either be handled here
# or filled in from the other sources.
df.isna().sum(axis=0)

movie_id              0
primary_title         0
start_year            0
runtime_minutes    5789
genres              471
averagerating         0
numvotes              0
director_name         1
writer_name           0
dtype: int64

In [12]:
# Both im.db queries have already been read into DataFrames above, so the
# SQLite connection is no longer needed.
conn.close()

# **EVERYTHING BELOW IS NOT USED -- just converted things to dataframes in case I wanted to use them.**

# Making Dataframe of tmdb.movies.csv

In [9]:
# load tmdb.movies.csv, taking the file's first column as the index,
# then preview the first five rows
df_movies = pd.read_csv(filepath_or_buffer='./data/tmdb.movies.csv', index_col=0)
df_movies.head(5)

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [10]:
# (rows, columns) of the tmdb.movies frame
df_movies.shape

(26517, 9)

# Making dataframe of rt.movie_info.tsv

In [11]:
# rt.movie_info.tsv is tab-separated, so tell read_csv to split on tabs
df_info = pd.read_csv(filepath_or_buffer='./data/rt.movie_info.tsv', sep='\t')

In [12]:
# preview the first rows of the rt.movie_info frame
df_info.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


# Making dataframe of tn.movie_budgets.csv

In [13]:
# load tn.movie_budgets.csv, taking the file's first column as the index
df_budget = pd.read_csv(filepath_or_buffer='./data/tn.movie_budgets.csv', index_col=0)

In [14]:
# preview the first rows of the tn.movie_budgets frame
df_budget.head()

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [15]:
# (rows, columns) of the tn.movie_budgets frame
df_budget.shape

(5782, 5)

# Making dataframe of bom.movie_gross.csv

In [16]:
# load bom.movie_gross.csv with the default integer index
df_gross = pd.read_csv(filepath_or_buffer='./data/bom.movie_gross.csv')


In [17]:
# preview the first rows of the bom.movie_gross frame
df_gross.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [18]:
# (rows, columns) of the bom.movie_gross frame
df_gross.shape

(3387, 5)