In [1]:
# Imports shared by the whole notebook (the movie data itself is loaded later from Data/movie.metadata.tsv)
import json
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
DATA_PATH = 'Data/'


def load_movie_df(data_path=None):
    '''Load the movie metadata table from ``movie.metadata.tsv``.

    The file is read as a header-less, tab-separated table. The
    ``Languages``, ``Countries`` and ``Genres`` columns are assumed to hold
    JSON objects mapping an identifier to a display name (TODO confirm
    against the data set documentation); each is converted to a plain list
    of names.

    Parameters
    ----------
    data_path : str or path-like, optional
        Directory containing ``movie.metadata.tsv``. Defaults to the
        module-level ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        One row per movie with the columns listed in ``movie_cols``. In the
        three decoded columns a cell is a list of names, or NaN when the
        cell is missing or an empty ``{}``.
    '''
    def strip_encoding(x):
        # Missing cells (NaN floats) and empty mappings both mean "no data".
        if not isinstance(x, str) or x.strip() == '{}':
            return np.nan
        try:
            # Proper JSON decoding also resolves escapes such as \u00e7,
            # which a plain regex would leave as literal backslash text.
            names = list(json.loads(x).values())
        except (ValueError, AttributeError):
            # Best-effort fallback for cells that are not a valid JSON
            # object: take every second quoted string (the values).
            names = re.findall(r'"(.*?)"', x)[1::2]
        # The same ' Language' / ' language' stripping is applied to all
        # three columns, so 'English Language' becomes 'English'.
        return [str(w).replace(' Language', '').replace(' language', '')
                for w in names]

    base_dir = DATA_PATH if data_path is None else data_path
    movie_path = Path(base_dir) / 'movie.metadata.tsv'
    movie_cols = ['Wikipedia ID', 'Freebase ID', 'Name', 'Release date',
                  'Box office revenue', 'Runtime', 'Languages', 'Countries', 'Genres']
    movie_df = pd.read_csv(movie_path, sep='\t', header=None,
                           names=movie_cols, index_col=False, dtype={'Freebase ID': str})
    for encoded_col in ('Languages', 'Countries', 'Genres'):
        movie_df[encoded_col] = movie_df[encoded_col].apply(strip_encoding)
    return movie_df

In [3]:
# Load the movie metadata table once; the following cells filter it and add
# release-year / release-decade columns to it.
movie_df = load_movie_df()

In [4]:
# Get release year and release decade for every movie with a known release date.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
movie_df = movie_df[movie_df['Release date'].notna()].copy()
# The year is taken from the first four characters of the date string
# (assumes every date starts with a 4-digit year).
movie_df['Release date year'] = (
    movie_df['Release date'].astype(str).str[:4].astype(int))
# Floor to the decade, e.g. 1987 -> 1980.
movie_df['Release date decade'] = movie_df['Release date year'] // 10 * 10
# NOTE(review): the counts include a single movie in decade 1010, which looks
# like a data-entry error in the release date — confirm and clean upstream.
movie_df['Release date decade'].value_counts()


2000    19268
1990     9469
1980     7397
1970     6716
1960     5875
1950     5723
2010     5228
1930     4800
1940     4466
1920     2883
1910     2669
1900      193
1890      149
1880        2
1010        1
Name: Release date decade, dtype: int64

In [5]:
# Take the top 20 genres and look at their evolution through time.
# NOTE(review): the `romance_genres` / `is_romantic` names appear to be left
# over from an earlier romance-only analysis; the list now holds general
# genres. The names are kept so that other cells using them keep working.
romance_genres = ['Drama',
                  'Comedy',
                  'Romance Film',
                  'Black-and-white',
                  'Action',
                  'Thriller',
                  'Short Film',
                  'World cinema',
                  'Crime Fiction',
                  'Indie',
                  'Documentary',
                  'Horror',
                  'Silent film',
                  'Adventure',
                  'Family Film',
                  'Action/Adventure',
                  'Comedy film',
                  'Musical',
                  'Animation',
                  'Romantic drama']


def is_romantic(i):
    """Build a predicate testing whether a movie has one of the selected genres.

    Parameters
    ----------
    i : int or slice
        Index into ``romance_genres`` selecting the genre(s) of interest,
        e.g. ``slice(0, 5)`` for the first five genres or ``0`` for the
        first genre only.

    Returns
    -------
    callable
        ``f(x) -> bool``. ``f`` returns True when ``x`` is a list containing
        at least one selected genre, and False otherwise (including when
        ``x`` is not a list, e.g. a NaN for a movie without genres).
    """
    selected = romance_genres[i]
    if isinstance(selected, str):
        # An integer index yields a bare string; wrap it so the `in` test
        # below is exact list membership and not a substring match.
        selected = [selected]

    def has_selected_genre(x):
        return isinstance(x, list) and any(y in selected for y in x)

    return has_selected_genre


# Keep the movies that carry at least one of the first five genres of
# `romance_genres` and that have a release date.
# NOTE(review): because of this pre-filter, the counts for the other 15
# genres only include movies that ALSO have one of the first five genres,
# while the proportions below are relative to all movies of the decade.
# Confirm whether that is intended.
has_selected_genre = movie_df['Genres'].apply(is_romantic(slice(0, 5)))
romance_movies = movie_df[has_selected_genre & movie_df['Release date'].notna()]

# Decades shown in the plot: 1880, 1890, ..., 2010. One range is used for
# both the counting and the proportion steps so they cannot drift apart.
decades = range(1880, 2020, 10)

# One boolean mask per genre and one per decade, computed once each, instead
# of re-parsing every release date for every (decade, genre) pair.
genre_masks = {
    genre: romance_movies['Genres'].apply(
        lambda genres, genre=genre: genre in genres).astype(bool)
    for genre in romance_genres
}
release_decade = romance_movies['Release date decade']

# genre_counts[decade][genre] -> DataFrame of the matching movies
genre_counts = {}
for decade in decades:
    in_decade = release_decade == decade
    genre_counts[decade] = {
        genre: romance_movies[in_decade & genre_masks[genre]]
        for genre in romance_genres
    }

# genre_counts_prop[decade][genre] -> number of matching movies
genre_counts_prop = {
    decade: {genre: len(movies) for genre, movies in per_genre.items()}
    for decade, per_genre in genre_counts.items()
}

# Long-format dataframe: one row per (genre, decade) with its movie count
genre_counts_prop_df = (
    pd.DataFrame(genre_counts_prop)
    .rename_axis('Genre')
    .reset_index()
    .melt(id_vars=['Genre'], var_name='Decade', value_name='Number of movies')
)

# Total number of movies per decade (all genres), computed once. A decade
# with no movie at all maps to NaN instead of raising a KeyError.
movies_per_decade = movie_df['Release date decade'].value_counts()
genre_counts_prop_df['Proportion of movies'] = (
    genre_counts_prop_df['Number of movies']
    / genre_counts_prop_df['Decade'].map(movies_per_decade)
)
genre_counts_prop_df['Proportion of movies in percentage'] = (
    genre_counts_prop_df['Proportion of movies'].mul(100).round(2)
)
# The count is turned into a label because the plot uses this column as the
# hover title; this must happen after the proportions are computed.
genre_counts_prop_df['Number of movies'] = (
    "Nbr movies: " + genre_counts_prop_df['Number of movies'].astype(str)
)


In [34]:
import plotly.express as px

OUTPUT_HTML = "Plots/genre_distrib.html"

# Animated bar chart: one bar per genre, one animation frame per decade.
fig = px.bar(genre_counts_prop_df, x="Genre", y="Proportion of movies in percentage", animation_frame="Decade",
             animation_group="Genre", color="Genre", hover_name="Number of movies", range_y=[0, 80])

# Replace the default Plotly Express animation buttons with a custom
# Play/Pause menu placed above the top-left corner of the plot; the
# durations below control the speed of the animation.
fig.layout.updatemenus = [
    dict(
        type="buttons",
        x=-0.1,
        y=1,
        xanchor="left",
        yanchor="bottom",
        font=dict(color="black"),
        bgcolor="grey",
        buttons=[
            dict(
                args=[None, {"frame": {"duration": 500, "redraw": False},
                             "fromcurrent": True, "transition": {"duration": 400, "easing": "quadratic-in-out"}}],
                label="Play",
                method="animate"
            ),
            dict(
                args=[[None], {"frame": {"duration": 0, "redraw": False},
                               "mode": "immediate",
                               "transition": {"duration": 0}}],
                label="Pause",
                method="animate"
            )
        ]
    )
]

fig.update_layout(
    showlegend=False,
    # Centered title
    title={
        'text': "Distribution of movies genres across time",
        'y': 0.98,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    # The plotted values are percentages (0-100), so the label says so
    yaxis_title="Proportion of movies (%)",
    # No x axis title: the genre names on the ticks are self-explanatory
    xaxis_title="",
    # Make it suitable for a site with a black background
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white'
)

# Create the output folder if needed so that the export cannot fail on a
# fresh checkout.
Path(OUTPUT_HTML).parent.mkdir(parents=True, exist_ok=True)
fig.write_html(OUTPUT_HTML)
fig.show()
