##### Importing libraries to be utilized

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import scipy as stats
import seaborn as sns
import zipfile
import sqlite3

##### Opening the movie_gross csv and assigning to a data fram
##### Replacing the empty values in foreign and domestic gross with 0 instead of NaN
##### Dropping the 5 rows that had null values included in the Studio

In [3]:
movie_gross_data = pd.read_csv("zippedData/bom.movie_gross.csv.gz")
movie_gross_data["foreign_gross"].fillna(0, inplace = True)
movie_gross_data["domestic_gross"].fillna(0, inplace = True)
movie_gross_data.dropna(inplace = True)
print(movie_gross_data)

                                            title      studio  domestic_gross  \
0                                     Toy Story 3          BV     415000000.0   
1                      Alice in Wonderland (2010)          BV     334200000.0   
2     Harry Potter and the Deathly Hallows Part 1          WB     296000000.0   
3                                       Inception          WB     292600000.0   
4                             Shrek Forever After        P/DW     238700000.0   
...                                           ...         ...             ...   
3382                                    The Quake       Magn.          6200.0   
3383                  Edward II (2018 re-release)          FM          4800.0   
3384                                     El Pacto        Sony          2500.0   
3385                                     The Swan  Synergetic          2400.0   
3386                            An Actor Prepares       Grav.          1700.0   

     foreign_gross  year  


In [4]:
movie_gross_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3382 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3382 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3382 non-null   float64
 3   foreign_gross   3382 non-null   object 
 4   year            3382 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 158.5+ KB


##### Removing any commas that are included in the foreign and domestic gross columns to standardize the data

In [None]:
movie_gross_data["domestic_gross"].replace(',','', regex = True, inplace = True)
movie_gross_data["foreign_gross"].replace(',','', regex = True, inplace = True)

##### Creating a new column total gross which combines any foreign and domestic sales by movie

In [6]:
movie_gross_data["total_gross"] = movie_gross_data["foreign_gross"].astype(float) + movie_gross_data["domestic_gross"].astype(float)
print(movie_gross_data)

                                            title      studio  domestic_gross  \
0                                     Toy Story 3          BV     415000000.0   
1                      Alice in Wonderland (2010)          BV     334200000.0   
2     Harry Potter and the Deathly Hallows Part 1          WB     296000000.0   
3                                       Inception          WB     292600000.0   
4                             Shrek Forever After        P/DW     238700000.0   
...                                           ...         ...             ...   
3382                                    The Quake       Magn.          6200.0   
3383                  Edward II (2018 re-release)          FM          4800.0   
3384                                     El Pacto        Sony          2500.0   
3385                                     The Swan  Synergetic          2400.0   
3386                            An Actor Prepares       Grav.          1700.0   

     foreign_gross  year   

##### Using zipfile to unzip the imdb database

In [7]:
with zipfile.ZipFile("zippedData/im.db.zip", "r") as zObject:
        zObject.extractall()

##### Creating a connection to the database

In [8]:
conn = sqlite3.connect("im.db")
cur = conn.cursor()

### My question - is there a benefit to releasing domestic vs foreign or a particular genre that does well by region

##### Creating a dataframe that only includes movie that have launched both domesting and foreign to compare by genre 

In [20]:
movie_domestic_foreign_comp = movie_gross_data[movie_gross_data["foreign_gross"] != 0]
movie_domestic_foreign_comp = movie_domestic_foreign_comp[movie_domestic_foreign_comp["domestic_gross"] != 0]
print(movie_domestic_foreign_comp.sort_values(by = ["domestic_gross"], ascending = False))

                             title  studio  domestic_gross foreign_gross  \
1872  Star Wars: The Force Awakens      BV     936700000.0        1131.6   
3080                 Black Panther      BV     700100000.0     646900000   
3079        Avengers: Infinity War      BV     678800000.0        1369.5   
1873                Jurassic World    Uni.     652300000.0        1019.4   
727          Marvel's The Avengers      BV     623400000.0     895500000   
...                            ...     ...             ...           ...   
292               Perrier's Bounty     IFC           800.0        167000   
1738                       Jackpot      DR           800.0       1100000   
2920     Amityville: The Awakening  W/Dim.           700.0       7700000   
642                 Illegal (2011)      FM           700.0        289000   
1018                 Apartment 143   Magn.           400.0        426000   

      year   total_gross  
1872  2015  9.367011e+08  
3080  2018  1.347000e+09  
3079  

##### Accessing the movie basics information from the imdb database and turning it to a df

In [26]:
Query_Ratings = """
SELECT *
FROM movie_basics
;
"""
imdb_movie_infor = pd.read_sql(Query_Ratings, conn)
print(imdb_movie_infor)

         movie_id                                primary_title  \
0       tt0063540                                    Sunghursh   
1       tt0066787              One Day Before the Rainy Season   
2       tt0069049                   The Other Side of the Wind   
3       tt0069204                              Sabse Bada Sukh   
4       tt0100275                     The Wandering Soap Opera   
...           ...                                          ...   
146139  tt9916538                          Kuambil Lagi Hatiku   
146140  tt9916622  Rodolpho Teóphilo - O Legado de um Pioneiro   
146141  tt9916706                              Dankyavar Danka   
146142  tt9916730                                       6 Gunn   
146143  tt9916754               Chico Albuquerque - Revelações   

                                     original_title  start_year  \
0                                         Sunghursh        2013   
1                                   Ashad Ka Ek Din        2019   
2     

##### Cleaning the df from the imdb data

In [47]:
imdb_movie_infor.drop_duplicates(subset = ["primary_title"], inplace = True)
#imdb_movie_infor.drop(["original_title"], axis = 1, inplace = True)
imdb_movie_infor = imdb_movie_infor[imdb_movie_infor["genres"].str.contains("None") == False]
print(imdb_movie_infor)

         movie_id                                primary_title  start_year  \
0       tt0063540                                    Sunghursh        2013   
1       tt0066787              One Day Before the Rainy Season        2019   
2       tt0069049                   The Other Side of the Wind        2018   
3       tt0069204                              Sabse Bada Sukh        2018   
4       tt0100275                     The Wandering Soap Opera        2017   
...           ...                                          ...         ...   
146138  tt9916428                          The Secret of China        2019   
146139  tt9916538                          Kuambil Lagi Hatiku        2019   
146140  tt9916622  Rodolpho Teóphilo - O Legado de um Pioneiro        2015   
146141  tt9916706                              Dankyavar Danka        2013   
146143  tt9916754               Chico Albuquerque - Revelações        2013   

        runtime_minutes                 genres  
0             

##### Completing a merge between the dataframs where the title matches

In [48]:
merged_imdb_domestic_and_foreign = pd.merge(imdb_movie_infor, movie_domestic_foreign_comp, how = "inner", left_on= "primary_title", right_on= "title")
print(merged_imdb_domestic_and_foreign)

       movie_id                    primary_title  start_year  runtime_minutes  \
0     tt0337692                      On the Road        2012            124.0   
1     tt0359950  The Secret Life of Walter Mitty        2013            114.0   
2     tt0365907      A Walk Among the Tombstones        2014            114.0   
3     tt0369610                   Jurassic World        2015            124.0   
4     tt0372538                              Spy        2011            110.0   
...         ...                              ...         ...              ...   
1535  tt7784604                       Hereditary        2018            127.0   
1536  tt8097306                    Nobody's Fool        2018            110.0   
1537  tt8404272           How Long Will I Love U        2018            101.0   
1538  tt8851262                     Spring Fever        2019              NaN   
1539  tt9151704        Burn the Stage: The Movie        2018             84.0   

                       genr

##### Cleaning up the new dataframe by removing the duplicated column

In [54]:
#merged_imdb_domestic_and_foreign.drop(["title"], axis = 1, inplace = True)
#merged_imdb_domestic_and_foreign.drop(["year"], axis = 1, inplace = True)
#merged_imdb_domestic_and_foreign.drop(["movie_id"], axis = 1, inplace = True)
print(merged_imdb_domestic_and_foreign)

                        primary_title  start_year  runtime_minutes  \
0                         On the Road        2012            124.0   
1     The Secret Life of Walter Mitty        2013            114.0   
2         A Walk Among the Tombstones        2014            114.0   
3                      Jurassic World        2015            124.0   
4                                 Spy        2011            110.0   
...                               ...         ...              ...   
1535                       Hereditary        2018            127.0   
1536                    Nobody's Fool        2018            110.0   
1537           How Long Will I Love U        2018            101.0   
1538                     Spring Fever        2019              NaN   
1539        Burn the Stage: The Movie        2018             84.0   

                       genres     studio  domestic_gross foreign_gross  \
0     Adventure,Drama,Romance        IFC        744000.0       8000000   
1      Adve

### Creating visualizations to learn about the data