In [7]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
#Importing main table
main_df = pd.read_csv('data/Master_Table.csv', index_col="Title_ID", usecols = ["Title_ID", "Lifetime_Gross", "Genres", "Year"])


In [3]:
#Splitting genres into separate columns
split_genre_df = main_df["Genres"].str.split(",", n=2, expand = True).add_prefix("genre_")
split_genre_df.head()

Unnamed: 0_level_0,genre_0,genre_1,genre_2
Title_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0369610,Action,Adventure,Sci-Fi
tt0401729,Action,Adventure,Sci-Fi
tt1179034,Action,Crime,Thriller
tt1194173,Action,Adventure,Thriller
tt1219289,Mystery,Sci-Fi,Thriller


In [4]:
#Merging genres back into the main table
merged_df = main_df.join(split_genre_df)
merged_df.reset_index()

Unnamed: 0,Title_ID,Year,Genres,Lifetime_Gross,genre_0,genre_1,genre_2
0,tt0369610,2015,"Action,Adventure,Sci-Fi",652270625.0,Action,Adventure,Sci-Fi
1,tt0401729,2012,"Action,Adventure,Sci-Fi",73078100.0,Action,Adventure,Sci-Fi
2,tt1179034,2010,"Action,Crime,Thriller",24077427.0,Action,Crime,Thriller
3,tt1194173,2012,"Action,Adventure,Thriller",113203870.0,Action,Adventure,Thriller
4,tt1219289,2011,"Mystery,Sci-Fi,Thriller",79249455.0,Mystery,Sci-Fi,Thriller
...,...,...,...,...,...,...,...
3183,tt7027278,2018,"Drama,Romance",901131.0,Drama,Romance,
3184,tt7089878,2018,"Animation,Drama,Fantasy",63204.0,Animation,Drama,Fantasy
3185,tt7208564,2018,"Comedy,Horror",82774.0,Comedy,Horror,
3186,tt7342204,2017,"Comedy,Drama",63239.0,Comedy,Drama,


In [5]:
#Creating dataframes for each set of genre columns
gen0_df = merged_df[["Year", "Lifetime_Gross", "genre_0"]].rename(columns = {"genre_0":"genre"})
gen1_df = merged_df[["Year", "Lifetime_Gross", "genre_1"]].rename(columns = {"genre_1":"genre"}).dropna()
gen2_df = merged_df[["Year", "Lifetime_Gross", "genre_2"]].rename(columns = {"genre_2":"genre"}).dropna()

In [6]:
#Joining the dataframes back together
genre = gen0_df.append([gen1_df, gen2_df], ignore_index = True )
gen_df = gen_df.reindex(columns=["genre", "Year", "Lifetime_Gross"])
gen_df.head()

NameError: name 'gen_df' is not defined

In [None]:
#Calculating the median for everything
median_gen_df = gen_df.groupby(["genre", "Year"]).median()

In [None]:
#Setting seaborn chart styles

sns.set_style('whitegrid')
sns.set_context("talk") 

In [None]:
#Looking at the medians for all genres
all_median_gen_df = median_gen_df.unstack(level=0)
all_median_gen_df.columns = all_median_gen_df.columns.droplevel()
all_median_gen_df.plot(figsize=(25,8))

In [None]:
#Creating line chart with top 5 genres

#Set style, font size, figure size
fig, ax = plt.subplots(figsize=(18, 7))
    
    #Format y axis for millions of dollars
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '$' + '{:,.0f}'.format(x/1000000) + 'M'))
    
    #Create graph and format labels
g = sns.lineplot(data = mean_5_gen_df,
                dashes = False,).set(xlabel = 'Year',
                ylabel = 'Gross Revenue', 
                title = 'Top 5 Genres by Median Gross Revenue')
plt.xticks(rotation=52)


In [None]:
#Creating dataframe with top 2 genres
limited_gens = gen_df[gen_df.genre.isin(["Sci-Fi", "Adventure"])]

In [None]:
#Looking at top 2 genres including confidence intervals
fig, ax = plt.subplots(figsize=(18, 7))

ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '$' + '{:,.0f}'.format(x/1000000) + 'M'))

g = sns.lineplot(data = limited_gens, hue="genre", x="Year", y = "Lifetime_Gross", palette = "Set1").set(xlabel = 'Year',
                ylabel = 'Gross Revenue', 
                title = 'Median Gross for Action and Sci-Fi Movies')

plt.xticks(rotation=52)
