# Pandas CSV file exploration and cleaning

In [1]:
# Importing all libraries
import warnings
import pandas as pd 
import sqlite3

#Ignoring ugly warnings
warnings.filterwarnings("ignore")

In [2]:
# Creating Multiple DataFrames to explore and clean each individual file
df_gross = pd.read_csv('../zippedData/bom.movie_gross.csv.gz')
df_budgets = pd.read_csv('../zippedData/tn.movie_budgets.csv.gz')

In [3]:
# Checking info
df_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [4]:
# Checking info
df_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


df_budgets and df_gross are the only CSV files I believe to be useful, the others either have irrelvant data to our EDA or are already in our SQL database.

Time to dive deeper into the data we have in our CSV files so we can understand it better so that We can prepare the data to make business recomendations

In [10]:
print(df_budgets.dtypes)

id                    int64
release_date         object
movie                object
production_budget     int32
domestic_gross        int32
worldwide_gross      object
dtype: object


In [12]:
df_budgets.columns

Index(['id', 'release_date', 'movie', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')

In [6]:
# Converting to interger and date time to be used for comparision and visualizations
df_budgets['production_budget'] = df_budgets['production_budget'].str.replace('[$,]', '', regex=True).astype(int)
df_budgets['domestic_gross'] = df_budgets['domestic_gross'].str.replace('[$,]', '', regex=True).astype(int)
df_budgets['worldwide_gross'] = df_budgets['worldwide_gross'].str.replace('[$,]', '', regex=True).astype(np.int64)
df_budgets['release_date'] = pd.to_datetime(df_budgets['release_date'], format='%b %d, %Y')

AttributeError: Can only use .str accessor with string values!

In [19]:
# Calculate ROI and add it as a new column in the original DataFrame
df_budgets['ROI'] = (df_budgets['worldwide_gross'].str.replace('[$,]', '', regex=True).astype(float) - df_budgets['production_budget']) / df_budgets['production_budget']

# Display the updated DataFrame with the calculated 'ROI' column
df_budgets


Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,ROI
0,1,"Dec 18, 2009",Avatar,425000000,760507625,"$2,776,345,279",5.532577
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,"$1,045,663,875",1.546673
2,3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,"$149,762,350",-0.572108
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,"$1,403,013,963",3.243841
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,"$1,316,721,747",3.153696
...,...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,7000,0,$0,-1.000000
5778,79,"Apr 2, 1999",Following,6000,48482,"$240,495",39.082500
5779,80,"Jul 13, 2005",Return to the Land of Wonders,5000,1338,"$1,338",-0.732400
5780,81,"Sep 29, 2015",A Plague So Pleasant,1400,0,$0,-1.000000


In [14]:
df_gross.columns


Index(['title', 'studio', 'domestic_gross', 'foreign_gross', 'year'], dtype='object')

In [None]:
# Converting gross
df_gross['foreign_gross'] = df_gross['foreign_gross'].str.replace('[$,]', '', regex=True).astype(float)
df_gross

In [None]:
df_gross['release_day_num'] = df_gross['release_date'].apply(lambda x:x.day)
df_gross['release_month_num'] = df_gross['release_date'].apply(lambda x:x.month)
df_gross['release_day'] = df_gross['release_date'].dt.day_name(
df_gross['release_month'] = df_gross['release_date'].dt.month_name()

In [None]:
df_gross.columns

# SQL Database exploration and cleaning

In [None]:
# Printing the Schema of the Database
conn = sqlite3.connect('../zippedData/im.db')
pd.read_sql("""
    SELECT name
    FROM sqlite_master
    WHERE type == 'table'
""", conn)

In [None]:
# We should get the information regarding each table so we can see what were working with
# Retrieve the list of tables in the database
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# Looping through the tables and get info for each table
for table in tables:
    table_name = table[0]
    
    # Query the table and create a DataFrame
    query = f"SELECT * FROM {table_name};"
    df = pd.read_sql_query(query, conn)
    
    # Display info for the DataFrame
    print(f"Table Name: {table_name}")
    print(df.info())
    print('\n')

Most of these tables are pretty straight foward however it would be good to look into anything that isnt as clear before we make any major changes to our database

In [None]:
pd.read_sql("""
    SELECT *
    FROM movie_akas
""", conn)

In [None]:
pd.read_sql("""
    SELECT *
    FROM principals
""", conn)

In [None]:
pd.read_sql("""
    SELECT *
    FROM known_for
""", conn)

Some of these tables and columns are irrelevant to us, We should drop these. Theres no reason to take up more space than necessary and prolong any potential load times.

In [None]:
#Dropping unnecessary columns and tables
cursor.execute("DROP TABLE movie_akas")
cursor.execute("ALTER TABLE persons DROP COLUMN death_year;")
cursor.execute("DROP TABLE principals")

In [None]:
# Close Connection and Save CSV file
conn.close()

In [21]:
df_gross.to_csv('../zippedData/gross_movie_cleaned.csv')
df_budgets.to_csv('../zippedData/movie_budget_cleaned.csv')