In [2]:
# Import dependencies
import pandas as pd
from wordcloud import WordCloud 
import matplotlib.pyplot as plt  
import plotly.express as px
import plotly.io as pio
import ast

# Render floats as fixed-point values with two decimal places in DataFrame output
pd.set_option("display.float_format", "{:.2f}".format)

In [3]:
# Load the processed movie data from its CSV file into a DataFrame
processed_csv = "Data/processed_data.csv"
df = pd.read_csv(processed_csv)

# Preview the first five rows to confirm the load worked
df.head(n=5)

Unnamed: 0,id,title,year,rating,votes,budget,gross_world_wide,gross_us_canada,opening_weekend_gross,genres,production_companies,wins,nominations,oscars
0,tt1502397,Bad Boys for Life,2020,6.5,187K,90000000.0,426505244.0,206305244.0,62504105.0,"['Buddy Cop', 'Cop Drama', 'Action', 'Comedy',...","['Columbia Pictures', '2.0 Entertainment', 'Do...",0,15,0
1,tt8332922,A Quiet Place Part II,2020,7.2,296K,61000000.0,297372261.0,160072261.0,47547231.0,"['Alien Invasion', 'Monster Horror', 'Supernat...","['Paramount Pictures', 'Platinum Dunes', 'Sund...",0,44,1
2,tt3794354,Sonic the Hedgehog,2020,6.5,171K,85000000.0,319715683.0,148974665.0,58018348.0,"['Animal Adventure', 'Buddy Comedy', 'Road Tri...","['Paramount Pictures', 'Sega Sammy Group', 'Or...",0,12,0
3,tt7713068,Birds of Prey and the Fantabulous Emancipation...,2020,6.1,271K,84500000.0,205537933.0,84172791.0,33010017.0,"['Dark Comedy', 'Superhero', 'Action', 'Comedy...","['Clubhouse Pictures (II)', 'DC Entertainment'...",0,83,0
4,tt6673612,Dolittle,2020,5.6,73K,175000000.0,251410631.0,77047065.0,21844045.0,"['Animal Adventure', 'Quest', 'Adventure', 'Co...","['Universal Pictures', 'Perfect World Pictures...",0,9,0


In [4]:
# Summarise the numeric columns (count, mean, std, min, quartiles, max) for easy reference
summary_columns = [
    'rating',
    'budget',
    'gross_world_wide',
    'gross_us_canada',
    'opening_weekend_gross',
    'wins',
    'nominations',
    'oscars',
]
numeric_df = df[summary_columns]

print(numeric_df.describe())


       rating          budget  gross_world_wide  gross_us_canada  \
count 4378.00         4378.00           4378.00          4378.00   
mean     6.33    179240881.95       96560581.72      38619857.35   
std      1.02   4676916620.86      207366675.95      77711990.76   
min      1.30           20.00             95.00            95.00   
25%      5.80      4639809.00        1835290.00        265761.25   
50%      6.40     16000000.00       20622447.50       7633663.00   
75%      7.00     45000000.00       90821281.25      42711708.75   
max      9.00 300000000000.00     2923706026.00     936662225.00   

       opening_weekend_gross    wins  nominations  oscars  
count                4378.00 4378.00      4378.00 4378.00  
mean             11694873.54    0.00        17.68    0.17  
std              24605966.57    0.00        39.26    0.67  
min                    11.00    0.00         0.00    0.00  
25%                 37301.50    0.00         0.00    0.00  
50%               1165404.5

In [5]:
# Rows whose nomination count is exactly 433.
# NOTE(review): 433 is hard-coded; presumably it is the column maximum - confirm against describe().
has_433_nominations = df['nominations'].eq(433)
most_nominations = df[has_433_nominations]
most_nominations

Unnamed: 0,id,title,year,rating,votes,budget,gross_world_wide,gross_us_canada,opening_weekend_gross,genres,production_companies,wins,nominations,oscars
2780,tt1517268,Barbie,2023,6.8,585K,100000000.0,1446938421.0,636238421.0,162022044.0,"['High-Concept Comedy', 'Quirky Comedy', 'Sati...","['Warner Bros.', 'Heyday Films', 'LuckyChap']",0,433,0


In [6]:
# Rows whose 'oscars' value is exactly 10.
# NOTE(review): 10 is hard-coded; presumably it is the column maximum - confirm against describe().
has_10_oscars = df['oscars'].eq(10)
most_oscars = df[has_10_oscars]
most_oscars

Unnamed: 0,id,title,year,rating,votes,budget,gross_world_wide,gross_us_canada,opening_weekend_gross,genres,production_companies,wins,nominations,oscars
1301,tt1403865,True Grit,2010,7.6,364K,38000000.0,252278285.0,171243005.0,24830443.0,"['Period Drama', 'Drama', 'Western']","['Paramount Pictures', 'Skydance Media', 'Scot...",0,169,10
2576,tt1800241,American Hustle,2013,7.2,507K,40000000.0,251171807.0,150117807.0,740455.0,"['Period Drama', 'True Crime', 'Crime', 'Drama']","['Columbia Pictures', 'Annapurna Pictures', 'A...",0,227,10
2812,tt5537002,Killers of the Flower Moon,2023,7.6,270K,200000000.0,158764012.0,68026901.0,23253655.0,"['Epic', 'Period Drama', 'Tragedy', 'True Crim...","['Apple Studios', 'Imperative Entertainment', ...",0,425,10


In [7]:
# Treemap over every movie in df: tiles are nested title -> genres,
# and both tile size and colour come from worldwide gross.
treemap_options = {
    "path": ['title', 'genres'],
    "values": 'gross_world_wide',
    "color": 'gross_world_wide',
    "color_continuous_scale": 'Greens',
    "title": 'Highest Earning Movies Treemap',
}
fig = px.treemap(df, **treemap_options)

# Tighten the margins, leaving extra room at the top for the title
fig.update_layout(margin={"t": 50, "l": 25, "r": 25, "b": 25})

fig.show()

In [8]:
# Keep only the 50 movies with the largest worldwide gross
top_50 = df.nlargest(n=50, columns='gross_world_wide')

# Treemap of those 50 movies: tiles are nested title -> genres,
# and both tile size and colour come from worldwide gross.
treemap_options = {
    "path": ['title', 'genres'],
    "values": 'gross_world_wide',
    "color": 'gross_world_wide',
    "color_continuous_scale": 'Greens',
    "title": 'Highest Earning Movies Treemap',
}
fig = px.treemap(top_50, **treemap_options)

# Tighten the margins, leaving extra room at the top for the title
fig.update_layout(margin={"t": 50, "l": 25, "r": 25, "b": 25})

fig.show()


In [9]:
# Build the list of distinct genres, in the order they are first seen.
#
# The df.head() preview shows each 'genres' cell as the text of a Python list
# (e.g. "['Action', 'Comedy']"), so the text is parsed with ast.literal_eval
# instead of being split on commas. That keeps entries containing commas or
# apostrophes intact and turns "[]" into an empty list rather than a bogus ''
# genre.
unique_genres_seen = {}  # dict keys: fast membership test, insertion order preserved

for raw_genres in df['genres']:
    if isinstance(raw_genres, (list, tuple)):
        # Already a real sequence, nothing to parse
        genres = list(raw_genres)
    elif isinstance(raw_genres, str):
        try:
            parsed = ast.literal_eval(raw_genres)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal: fall back to a best-effort comma split,
            # dropping the empty pieces that an empty "[]" would otherwise produce
            pieces = raw_genres.strip().strip('[]').split(',')
            parsed = [piece.strip().strip("'\"") for piece in pieces if piece.strip()]
        # A lone scalar literal is treated as a single genre
        genres = list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]
    else:
        # Missing value (e.g. NaN): this movie contributes no genres
        genres = []

    # Record each genre once; repeats leave the first-seen position unchanged
    for genre in genres:
        unique_genres_seen.setdefault(genre, None)

all_genres_unique = list(unique_genres_seen)

# Output the final list of unique genres
print(all_genres_unique)

['Buddy Cop', 'Cop Drama', 'Action', 'Comedy', 'Crime', 'Thriller', 'Alien Invasion', 'Monster Horror', 'Supernatural Horror', 'Drama', 'Horror', 'Sci-Fi', 'Animal Adventure', 'Buddy Comedy', 'Road Trip', 'Slapstick', 'Superhero', 'Supernatural Fantasy', 'Adventure', 'Family', 'Dark Comedy', 'Quest', 'Fantasy', 'Psychological Horror', 'Psychological Thriller', 'Suspense Mystery', 'Mystery', 'Adventure Epic', 'Mountain Adventure', 'Survival', 'Computer Animation', 'Epic', 'Fantasy Epic', 'Sword & Sorcery', 'Animation', 'Time Travel', 'Adult Animation', 'Anime', 'Dark Fantasy', 'Shōnen', 'Holiday', 'Teen Horror', 'Car Action', 'Romance', 'Disaster', 'Sea Adventure', 'Fairy Tale', 'Folk Horror', 'Witch Horror', 'Western', 'Coming-of-Age', 'Costume Drama', 'Feel-Good Romance', 'Period Drama', 'Romantic Comedy', 'Biography', 'Music', 'Body Swap Comedy', 'Slasher Horror', 'Teen Comedy', 'One-Person Army Action', 'Tragedy', 'Docudrama', 'Psychological Drama', 'Zombie Horror', 'Dystopian Sci-F

In [12]:
# Count how often each genre occurs across all movies and show the 20 most common.
#
# The df.head() preview shows each 'genres' cell as the text of a Python list
# (e.g. "['Action', 'Comedy']"), so the text is parsed with ast.literal_eval
# instead of being split on commas. That keeps entries containing commas or
# apostrophes intact and turns "[]" into an empty list rather than a bogus ''
# genre.
all_genres_whole = []

for raw_genres in df['genres']:
    if isinstance(raw_genres, (list, tuple)):
        # Already a real sequence, nothing to parse
        genres = list(raw_genres)
    elif isinstance(raw_genres, str):
        try:
            parsed = ast.literal_eval(raw_genres)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal: fall back to a best-effort comma split,
            # dropping the empty pieces that an empty "[]" would otherwise produce
            pieces = raw_genres.strip().strip('[]').split(',')
            parsed = [piece.strip().strip("'\"") for piece in pieces if piece.strip()]
        # A lone scalar literal is treated as a single genre
        genres = list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]
    else:
        # Missing value (e.g. NaN): this movie contributes no genres
        genres = []

    # Keep every occurrence (duplicates included) so they can be counted
    all_genres_whole.extend(genres)

# Occurrences of each genre, most frequent first
genre_counts = pd.Series(all_genres_whole).value_counts()

# Get the top 20 genres
top_20_genres = genre_counts.head(20)
# Historical name kept as an alias so existing references keep working,
# even though it has always held 20 entries rather than 10
top_10_genres = top_20_genres

# Output the top 20 genres
print(top_10_genres)

Drama           2518
Comedy          1533
Thriller        1216
Action          1051
Romance          910
Adventure        813
Crime            701
Mystery          576
Fantasy          559
Horror           526
Sci-Fi           486
Family           455
Biography        394
History          282
Dark Comedy      254
Animation        248
Period Drama     200
War              197
Documentary      185
Music            178
Name: count, dtype: int64


In [11]:
# Number of distinct genres collected in all_genres_unique
genres_count = len(all_genres_unique)
print(genres_count)

180


In [15]:
# Count how often each production company occurs across all movies and show
# the 20 most common.
#
# The df.head() preview shows each 'production_companies' cell as the text of a
# Python list (e.g. "['Warner Bros.', 'Heyday Films', 'LuckyChap']"), so the
# text is parsed with ast.literal_eval instead of being split on commas. That
# keeps company names containing commas or apostrophes intact and turns "[]"
# into an empty list rather than a bogus '' company.
all_companies_whole = []

for raw_companies in df['production_companies']:
    if isinstance(raw_companies, (list, tuple)):
        # Already a real sequence, nothing to parse
        companies = list(raw_companies)
    elif isinstance(raw_companies, str):
        try:
            parsed = ast.literal_eval(raw_companies)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal: fall back to a best-effort comma split,
            # dropping the empty pieces that an empty "[]" would otherwise produce
            pieces = raw_companies.strip().strip('[]').split(',')
            parsed = [piece.strip().strip("'\"") for piece in pieces if piece.strip()]
        # A lone scalar literal is treated as a single company
        companies = list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]
    else:
        # Missing value (e.g. NaN): this movie contributes no companies
        companies = []

    # Keep every occurrence (duplicates included) so they can be counted
    all_companies_whole.extend(companies)

# Occurrences of each production company, most frequent first
companies_counts = pd.Series(all_companies_whole).value_counts()

# Get the top 20 production companies
top_20_companies = companies_counts.head(20)
# Historical name kept as an alias so existing references keep working,
# even though it has always held 20 entries rather than 10
top_10_companies = top_20_companies

# Output the top 20 production companies
print(top_10_companies)

Universal Pictures           204
Warner Bros.                 197
Columbia Pictures            178
Paramount Pictures           146
Walt Disney Pictures         102
Relativity Media             102
New Line Cinema               99
Twentieth Century Fox         99
Lionsgate                     94
Metro-Goldwyn-Mayer (MGM)     77
Summit Entertainment          70
New Regency Productions       60
Screen Gems                   59
StudioCanal                   57
Dreamworks Pictures           56
Focus Features                55
Blumhouse Productions         52
Legendary Entertainment       52
Village Roadshow Pictures     51
Film4                         45
Name: count, dtype: int64
