### Genre Exploration

In [3]:
# dependencies
import pandas as pd
from pathlib import Path
import scipy.stats as stats
import matplotlib.pyplot as plt
import plotly.express as px

In [4]:
#import csv's
data_parq = Path("Resources/mo_movies_data.parquet")

data_df = pd.read_parquet(data_parq)

In [5]:
data_df

Unnamed: 0,Title,Year,IMDB Rating,IMDB Votes,Genre,Box Office,IMDB ID,Budget,Revenue
0,Camille Claudel 1915,2013,6.5,3889.0,"Biography, Drama","$35,296",tt2018086,3512454,115860.0
1,A Sound of Thunder,2005,4.2,20549.0,"Action, Adventure, Horror","$1,900,451",tt0318081,80000000,5989640.0
2,The Jacket,2005,7.1,119641.0,"Drama, Fantasy, Mystery","$6,303,762",tt0366627,29000000,21126225.0
3,The Interpreter,2005,6.4,111280.0,"Crime, Mystery, Thriller","$72,708,161",tt0373926,80000000,162944923.0
4,The Ring Two,2005,5.4,101457.0,"Horror, Mystery","$76,231,249",tt0377109,50000000,161451538.0
...,...,...,...,...,...,...,...,...,...
2311,FC Venus,2005,5.5,2325.0,"Comedy, Romance, Sport",,tt0453365,2196531,2411594.0
2312,Sivaji,2007,7.5,21484.0,"Action, Crime, Drama",,tt0479751,12000000,19000000.0
2313,Dikari,2006,6.5,693.0,Comedy,,tt0933361,800000,1328612.0
2314,Pro lyuboff,2010,5.8,297.0,Drama,,tt1718881,2000000,1268793.0


In [6]:
data_df["ROI$"] = (data_df['Revenue'] - data_df['Budget'])
data_df

Unnamed: 0,Title,Year,IMDB Rating,IMDB Votes,Genre,Box Office,IMDB ID,Budget,Revenue,ROI$
0,Camille Claudel 1915,2013,6.5,3889.0,"Biography, Drama","$35,296",tt2018086,3512454,115860.0,-3396594.0
1,A Sound of Thunder,2005,4.2,20549.0,"Action, Adventure, Horror","$1,900,451",tt0318081,80000000,5989640.0,-74010360.0
2,The Jacket,2005,7.1,119641.0,"Drama, Fantasy, Mystery","$6,303,762",tt0366627,29000000,21126225.0,-7873775.0
3,The Interpreter,2005,6.4,111280.0,"Crime, Mystery, Thriller","$72,708,161",tt0373926,80000000,162944923.0,82944923.0
4,The Ring Two,2005,5.4,101457.0,"Horror, Mystery","$76,231,249",tt0377109,50000000,161451538.0,111451538.0
...,...,...,...,...,...,...,...,...,...,...
2311,FC Venus,2005,5.5,2325.0,"Comedy, Romance, Sport",,tt0453365,2196531,2411594.0,215063.0
2312,Sivaji,2007,7.5,21484.0,"Action, Crime, Drama",,tt0479751,12000000,19000000.0,7000000.0
2313,Dikari,2006,6.5,693.0,Comedy,,tt0933361,800000,1328612.0,528612.0
2314,Pro lyuboff,2010,5.8,297.0,Drama,,tt1718881,2000000,1268793.0,-731207.0


In [7]:
# set display to 2 decimal instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

In [8]:
# here's the entire list of genres to pull from:
unique = list(data_df["Genre"].unique())

genre_set = set()
for genres in unique:
    split = genres.replace(" ", "").split(",")
    genre_set.update(split)

#len(genre_set)

genre_count = {k:[data_df[data_df["Genre"].str.contains(k)]["Title"].count(),\
                 data_df[data_df["Genre"].str.contains(k)]["Revenue"].sum(),\
                 data_df[data_df["Genre"].str.contains(k)]["Budget"].sum(),\
                 data_df[data_df["Genre"].str.contains(k)]["ROI$"].sum()] for k in genre_set}
genre_count_df = pd.DataFrame(genre_count).T.rename(columns ={0:"Genre Count", 1:"Total Revenue", 2:"Total Budget", 3:"Return on Investment"})
genre_count_df

Unnamed: 0,Genre Count,Total Revenue,Total Budget,Return on Investment
Fantasy,185.0,33605612095.0,11211596621.0,22394015474.0
Crime,427.0,26063419086.0,11844484650.0,14218934436.0
Horror,224.0,13196164590.0,3998989335.0,9197175255.0
Adventure,486.0,129820038026.0,41519950639.0,88300087387.0
Sci-Fi,191.0,43902239574.0,14032010288.0,29870229286.0
Mystery,249.0,17035741487.0,6239863837.0,10795877650.0
Romance,395.0,23817499004.0,9156074714.0,14661424290.0
Action,675.0,116701735028.0,42435936638.0,74265798390.0
Musical,20.0,1908860269.0,644240000.0,1264620269.0
Music,85.0,5129804117.0,1835934442.0,3293869675.0


In [9]:
genre_total_revenue = {k:data_df[data_df["Genre"].str.contains(k)]["Revenue"].sum() for k in genre_set}
genre_total_revenue

{'Fantasy': 33605612095.0,
 'Crime': 26063419086.0,
 'Horror': 13196164590.0,
 'Adventure': 129820038026.0,
 'Sci-Fi': 43902239574.0,
 'Mystery': 17035741487.0,
 'Romance': 23817499004.0,
 'Action': 116701735028.0,
 'Musical': 1908860269.0,
 'Music': 5129804117.0,
 'Comedy': 83943140339.0,
 'Biography': 10596635741.0,
 'Animation': 38341964609.0,
 'War': 1729194863.0,
 'Thriller': 35815338529.0,
 'Documentary': 785686966.0,
 'Drama': 73957193133.0,
 'Family': 22498973591.0,
 'Sport': 2279436935.0,
 'History': 4121842213.0,
 'Western': 710558240.0}

In [10]:
genre_budget = {k:data_df[data_df["Genre"].str.contains(k)]["Budget"].sum() for k in genre_set}
genre_budget

{'Fantasy': 11211596621,
 'Crime': 11844484650,
 'Horror': 3998989335,
 'Adventure': 41519950639,
 'Sci-Fi': 14032010288,
 'Mystery': 6239863837,
 'Romance': 9156074714,
 'Action': 42435936638,
 'Musical': 644240000,
 'Music': 1835934442,
 'Comedy': 28711334545,
 'Biography': 4228203293,
 'Animation': 11561227737,
 'War': 1085900000,
 'Thriller': 12263095242,
 'Documentary': 165168148,
 'Drama': 30514492261,
 'Family': 8080635400,
 'Sport': 1124427704,
 'History': 2144011888,
 'Western': 191730000}

In [19]:
genre_roi = {k:data_df[data_df["Genre"].str.contains(k)]["ROI$"].sum() for k in genre_set}
genre_roi

{'Fantasy': 22394015474.0,
 'Crime': 14218934436.0,
 'Horror': 9197175255.0,
 'Adventure': 88300087387.0,
 'Sci-Fi': 29870229286.0,
 'Mystery': 10795877650.0,
 'Romance': 14661424290.0,
 'Action': 74265798390.0,
 'Musical': 1264620269.0,
 'Music': 3293869675.0,
 'Comedy': 55231805794.0,
 'Biography': 6368432448.0,
 'Animation': 26780736872.0,
 'War': 643294863.0,
 'Thriller': 23552243287.0,
 'Documentary': 620518818.0,
 'Drama': 43442700872.0,
 'Family': 14418338191.0,
 'Sport': 1155009231.0,
 'History': 1977830325.0,
 'Western': 518828240.0}

Code and notes

In [None]:
# # use str.contains() function to get genres - create action genre df
# crime_movies = data_df[data_df["Genre"].str.contains("Crime")]
# action_movies = data_df[data_df["Genre"].str.contains("Action")]
# comedy_movies = data_df[data_df["Genre"].str.contains("Comedy")]
# adventure_movies = data_df[data_df["Genre"].str.contains("Adventure")]
# thriller_movies = data_df[data_df["Genre"].str.contains("Thriller")]
# romance_movies = data_df[data_df["Genre"].str.contains("Romance")]
# drama_movies = data_df[data_df["Genre"].str.contains("Drama")]
# scifi_movies = data_df[data_df["Genre"].str.contains("Sci-Fi")]
# animation_movies = data_df[data_df["Genre"].str.contains("Animation")]
# horror_movies = data_df[data_df["Genre"].str.contains("Horror")]
# fantasy_movies = data_df[data_df["Genre"].str.contains("Fantasy")]
# documentary_movies = data_df[data_df["Genre"].str.contains("Documentary")]
# western_movies = data_df[data_df["Genre"].str.contains("Western")]

In [None]:
plt.figure(figsize=(20, 10))
plt.scatter(data_df['Genre'], data_df['Revenue'])
plt.xlabel('Genre')
plt.ylabel('Revenue ($ million)')
plt.title('Revenue by Genre')
plt.tight_layout()
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
plt.scatter(horror_movies['Genre'], horror_movies['Revenue'])
plt.xlabel('Horror Genre')
plt.ylabel('Revenue ($ million)')
plt.title('Revenue by Horror Genre')
plt.tight_layout()
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
plt.scatter(comedy_movies['Genre'], comedy_movies['Revenue'])
plt.xlabel('Comedy Genre')
plt.ylabel('Revenue ($ million)')
plt.title('Revenue by Comedy Genre')
plt.tight_layout()
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
plt.scatter(scifi_movies['Genre'], scifi_movies['Revenue'])
plt.xlabel('Sci-Fi Genre')
plt.ylabel('Revenue ($ million)')
plt.title('Revenue by Sci-Fi Genre')
plt.tight_layout()
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
plt.scatter(drama_movies['Genre'], drama_movies['Revenue'])
plt.xlabel('Dram Genre')
plt.ylabel('Revenue ($ million)')
plt.title('Revenue by Drama Genre')
plt.tight_layout()
plt.xticks(rotation=45)
plt.show()

In [None]:
# Below, I tried to use a for loop for creating each genre df for us. It was annoying and complicated,
# then i found the str.contains() function with is very simple. So I'd recommend using that to create genre df's for 
# each genre you want to explore

In [None]:
# # use loop for creating genres dictionary?
# genre_df = {}

# # Create a DataFrame for each year
# for genre in data_df['Genre']:
#     genre_df = data_df[data_df['Genre'].isin([genre])]
#     genre_dataframes[genre] = genre_df

# # Access the DataFrame for a specific year
# genre_action_df = genre_dataframes.get(Action)

In [None]:
# https://stackoverflow.com/questions/19169649/how-to-use-str-contains-with-multiple-expressions-in-pandas-dataframes

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isin.html#pandas.DataFrame.isin

In [None]:
# https://www.geeksforgeeks.org/difference-between-list-vs-set-vs-tuple-in-python/

In [None]:
# tried using is but had errors. str.contains() worked better
# action_df = data_df[data_df["Action"].isin["Genre"]]
# action_df