In [5]:
import pandas as pd
import glob

# Load the per-year csv.gz result files extracted earlier.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to keep the row order of the combined frame (and any .head() preview)
# reproducible across machines and kernel restarts.
file_paths = sorted(glob.glob('movies_*.csv.gz'))

# Fail early with an actionable message; otherwise pd.concat raises a bare
# "No objects to concatenate" error that never mentions the missing files.
if not file_paths:
    raise FileNotFoundError(
        "No files matching 'movies_*.csv.gz' were found in the current working directory."
    )

dfs = [pd.read_csv(file, compression='gzip') for file in file_paths]

# Concatenate the yearly frames into 1 dataframe for the remainder of the
# analysis; ignore_index=True gives the combined rows a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Exploratory Data Analysis (EDA)

# A movie has "valid financial information" when its budget OR its revenue is
# greater than 0. Rows where both are 0 fail this mask, so the same selection
# answers the count question and excludes the all-zero movies from the
# summaries that follow.
has_financials = combined_df['budget'].gt(0) | combined_df['revenue'].gt(0)
valid_financial_info = combined_df.loc[has_financials]
num_movies_with_valid_info = len(valid_financial_info)

# How many movies fall under each value of mpaa_rating (G/PG/PG-13/R)?
certification_counts = valid_financial_info['mpaa_rating'].value_counts()

# Average revenue and average budget per certification category, both taken
# from one grouping of the filtered frame.
by_certification = valid_financial_info.groupby('mpaa_rating')
avg_revenue_by_certification = by_certification['revenue'].mean()
avg_budget_by_certification = by_certification['budget'].mean()

# Displaying the results: the headline count first, then each labelled summary.
print(f"Number of movies with at least some valid financial information: {num_movies_with_valid_info}")

# Each heading is paired with the object printed directly beneath it.
report_sections = [
    ("\nExcluding movies with 0's for budget AND revenue:", valid_financial_info.head()),
    ("\n Number of movies in each certification category:", certification_counts),
    ("\nAverage revenue per certification category:", avg_revenue_by_certification),
    ("\n Average budget per certification category:", avg_budget_by_certification),
]
for heading, summary in report_sections:
    print(heading)
    print(summary)

Number of movies with at least some valid financial information: 314

Excluding movies with 0's for budget AND revenue:
                title release_date     budget    revenue mpaa_rating  \
0               Shrek   2001-05-18   60000000  487853320          PG   
1      Monsters, Inc.   2001-11-01  115000000  579707738           G   
2   The Mummy Returns   2001-05-04   98000000  443280904       PG-13   
3    A Beautiful Mind   2001-12-14   58000000  316800000       PG-13   
4  Planet of the Apes   2001-07-25  100000000  362211740       PG-13   

                                         certificate  
0  {'adult': False, 'backdrop_path': '/sRvXNDItGl...  
1  {'adult': False, 'backdrop_path': '/vUTVUdfbsY...  
2  {'adult': False, 'backdrop_path': '/fRnhnR2Jfk...  
3  {'adult': False, 'backdrop_path': '/zbpZOYrRCx...  
4  {'adult': False, 'backdrop_path': '/rF0Y0fYVsN...  

 Number of movies in each certification category:
mpaa_rating
R        113
PG-13     91
PG        24
G         10
NR