In [None]:
# Core imports: data handling, regex parsing and plotting
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
DATA_PATH = 'Data/'


def load_movie_df(data_path=None):
    '''Load the movie metadata table (movie.metadata.tsv) into a DataFrame.

    Parameters
    ----------
    data_path : str, optional
        Directory that contains ``movie.metadata.tsv``. Defaults to the
        module-level ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file with the columns 'Wikipedia ID',
        'Freebase ID', 'Name', 'Release date', 'Box office revenue', 'Runtime',
        'Languages', 'Countries' and 'Genres'. The last three columns hold a
        list of names, or NaN when the cell is empty.
    '''
    def strip_encoding(x):
        '''Turn one ``{"id": "name", ...}`` cell into the list of its names.'''
        # Missing cells and empty mappings both mean "no information".
        if not isinstance(x, str) or x == '{}':
            return np.nan
        try:
            # Assumes the cell is a JSON object; parsing it as such decodes
            # \uXXXX escapes and copes with escaped quotes inside a name.
            names = json.loads(x).values()
        except ValueError:
            # Not valid JSON: fall back to taking every second quoted string.
            names = re.findall(r'"(.*?)"', x)[1::2]
        return [name.replace(' Language', '').replace(' language', '')
                for name in names]

    if data_path is None:
        data_path = DATA_PATH
    movie_path = os.path.join(data_path, 'movie.metadata.tsv')
    movie_cols = ['Wikipedia ID', 'Freebase ID', 'Name', 'Release date',
                  'Box office revenue', 'Runtime', 'Languages', 'Countries', 'Genres']
    # The file has no header row, so the column names are supplied here.
    movie_df = pd.read_csv(movie_path, sep='\t', header=None,
                           names=movie_cols, index_col=False, dtype={'Freebase ID': str})
    for column in ('Languages', 'Countries', 'Genres'):
        movie_df[column] = movie_df[column].apply(strip_encoding)
    return movie_df

In [None]:
# Load the movie metadata table once; the cells below derive new columns from it.
movie_df = load_movie_df()

In [None]:
# Derive the release year and the release decade of every dated movie.
# The explicit copy makes the filtered frame independent of the original, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
movie_df = movie_df[movie_df['Release date'].notna()].copy()
# Only the first four characters of a release date (the year) are used.
movie_df['Release date year'] = movie_df['Release date'].astype(str).str[:4].astype(int)
# Floor the year to the start of its decade using integer arithmetic.
movie_df['Release date decade'] = movie_df['Release date year'] // 10 * 10
movie_df['Release date decade'].value_counts()


In [None]:
# The top 20 genres whose evolution through time is studied below. Earlier,
# narrower selections (romance-only genres, then a top-10 list) were replaced by
# this one; the variable name is kept because later cells refer to it.
romance_genres = ['Drama',
                  'Comedy',
                  'Romance Film',
                  'Black-and-white',
                  'Action',
                  'Thriller',
                  'Short Film',
                  'World cinema',
                  'Crime Fiction',
                  'Indie',
                  'Documentary',
                  'Horror',
                  'Silent film',
                  'Adventure',
                  'Family Film',
                  'Action/Adventure',
                  'Comedy film',
                  'Musical',
                  'Animation',
                  'Romantic drama']


def is_romantic(i):
    '''Build a predicate that tells whether a movie has one of the selected genres.

    Parameters
    ----------
    i : slice or int
        Selects the genres of interest from ``romance_genres``.

    Returns
    -------
    callable
        Function taking the 'Genres' value of one movie. It returns True when
        that value is a list containing at least one selected genre, and False
        otherwise (including for non-list values such as NaN).
    '''
    selected = romance_genres[i]
    # An integer index yields a single genre name (a string). Wrap it so the
    # membership test compares whole genre names instead of matching substrings.
    if isinstance(selected, str):
        selected = [selected]

    def has_selected_genre(x):
        return isinstance(x, list) and any(genre in selected for genre in x)

    return has_selected_genre


# Keep the dated movies tagged with at least one of the first five tracked genres.
# NOTE(review): the per-genre counts below are taken from this subset only, while
# the denominator further down counts every movie of the decade in movie_df --
# confirm that restricting the numerator like this is intended.
romance_movies = movie_df[movie_df['Genres'].apply(is_romantic(slice(0, 5)))]
romance_movies = romance_movies[~romance_movies['Release date'].isna()]

# Everything that does not depend on the (decade, genre) pair is computed once
# here instead of once per combination inside the loop.
release_years = romance_movies['Release date'].apply(lambda x: int(str(x)[:4]))
has_genre = {genre: romance_movies['Genres'].apply(lambda x: genre in x)
             for genre in romance_genres}

# genre_counts[decade][genre] -> DataFrame of the matching movies.
genre_counts = {}
for year in range(1880, 2021, 10):
    in_decade = (release_years >= year) & (release_years < year + 10)
    genre_counts[year] = {genre: romance_movies[in_decade & has_genre[genre]]
                          for genre in romance_genres}

# genre_counts_prop[decade][genre] -> number of matching movies. The 2020 bucket
# built above is not carried over: only the decades 1880..2010 are used below.
genre_counts_prop = {
    year: {genre: len(movies) for genre, movies in genre_counts[year].items()}
    for year in range(1880, 2020, 10)
}

# Long-format table with one row per (genre, decade).
genre_counts_prop_df = (
    pd.DataFrame(genre_counts_prop)
    .reset_index()
    .rename(columns={'index': 'Genre'})
    .melt(id_vars=['Genre'], var_name='Decade', value_name='Number of movies')
)

# Total number of movies per decade, computed once. A decade without any movie
# in movie_df gives NaN for the proportion instead of raising a KeyError.
movies_per_decade = movie_df['Release date decade'].value_counts()
genre_counts_prop_df['Proportion of movies'] = (
    genre_counts_prop_df['Number of movies']
    / genre_counts_prop_df['Decade'].map(movies_per_decade)
)
genre_counts_prop_df['Proportion of movies in percentage'] = (
    genre_counts_prop_df['Proportion of movies'] * 100).round(2)
# The count is turned into a text label; it is used as hover text in the plot.
genre_counts_prop_df['Number of movies'] = (
    "Nbr movies: " + genre_counts_prop_df['Number of movies'].astype(str))


In [None]:
import plotly.express as px

# Animated bar chart: one frame per decade, one bar per genre.
fig = px.bar(genre_counts_prop_df, x="Genre", y="Proportion of movies in percentage",
             animation_frame="Decade", animation_group="Genre", color="Genre",
             hover_name="Number of movies", range_y=[0, 80])

# Drop the existing animation buttons so that the menu defined below replaces
# them entirely instead of being merged into them.
fig["layout"].pop("updatemenus")

fig.update_layout(
    showlegend=False,
    title={
        'text': "Distribution of movies genres across time",
        'y': 0.98,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    # No x-axis title: the genre names on the ticks are self-explanatory.
    xaxis_title="",
    # The plotted column holds percentages, so the label says so.
    yaxis_title="Proportion of movies (%)",
    updatemenus=[
        dict(
            type="buttons",
            # Button position relative to the plot area.
            x=-0.1, y=1, xanchor="left", yanchor="bottom",
            buttons=[
                # Play: frame and transition durations control the animation speed.
                dict(
                    args=[None, {"frame": {"duration": 500, "redraw": False},
                                 "fromcurrent": True,
                                 "transition": {"duration": 400, "easing": "quadratic-in-out"}}],
                    label="Play",
                    method="animate"
                ),
                # Pause: animating to [None] in immediate mode stops the playback.
                dict(
                    args=[[None], {"frame": {"duration": 0, "redraw": False},
                                   "mode": "immediate",
                                   "transition": {"duration": 0}}],
                    label="Pause",
                    method="animate"
                )
            ]
        )
    ]
)

# Make sure the output folder exists before writing the standalone HTML file.
os.makedirs("Plots", exist_ok=True)
fig.write_html("Plots/genre_distrib.html")
fig.show()


## Wordclouds

In [None]:
import wordcloud

# Table used for the word clouds below; the file is tab-separated.
final_df = pd.read_csv('Data/final_df.csv', sep='\t')

In [None]:
# Keep only the rows that have descriptions. The explicit copy makes the column
# assignment below act on an independent frame (no SettingWithCopyWarning).
final_df = final_df[final_df['filtered_descriptions'].notna()].copy()
# Each description cell is text: remove brackets and apostrophes, then split on
# ', ' to get a list. Values that are already lists (this cell was run before)
# are kept unchanged, so re-running the cell does not fail.
final_df['filtered_descriptions'] = final_df['filtered_descriptions'].apply(
    lambda x: x if isinstance(x, list)
    else x.replace('[', '').replace(']', '').replace("'", '').split(', '))
# Keep a single row per Freebase character ID.
final_df = final_df.drop_duplicates(subset=['Freebase character ID'])

# Map every label to one flat list with the description words of all the
# characters carrying that label.
label_dict = {
    label: [word
            for words in final_df.loc[final_df['labels'] == label, 'filtered_descriptions']
            for word in words]
    for label in final_df['labels'].unique()
}


In [None]:
# NOTE: this cell reads and updates label_dict_titles_attributes, which is only
# created in a later cell of this notebook; that cell has to be run first.

# A word that is among the top 10 words of more than this many labels is
# considered too common and is removed.
MAX_LABELS_SHARING_WORD = 3

# Top 10 most frequent description words of each label.
top_10_words = {}
for label in label_dict_titles_attributes.keys():
    top_10_words[label] = pd.Series(
        ' '.join(label_dict[label]).split()).value_counts()[:10]

# All the words that appear in the top 10 of any label (with repetitions).
top_10_words_list = []
for label in top_10_words.keys():
    top_10_words_list.extend(top_10_words[label].index)

# Collect the too-common words. dict.fromkeys drops the repetitions while
# keeping the order, so every word is checked and printed only once.
common_words = set()
for word in dict.fromkeys(top_10_words_list):
    if sum(word in top_10_words[label].index for label in top_10_words) > MAX_LABELS_SHARING_WORD:
        print(word)
        common_words.add(word)

# Remove the too-common words from every label in a single pass.
for label in top_10_words.keys():
    label_dict_titles_attributes[label] = [
        x for x in label_dict_titles_attributes[label] if x not in common_words]

In [None]:
# Build the same kind of dictionary as for the descriptions, now with titles.
# Keep only the rows that have a title; the explicit copy makes the column
# assignment below act on an independent frame (no SettingWithCopyWarning).
final_df = final_df[final_df['title'].notna()].copy()
# Each title cell is text: remove brackets and apostrophes, then split on ', '
# to get a list. Values that are already lists (this cell was run before) are
# kept unchanged, so re-running the cell does not fail.
final_df['title'] = final_df['title'].apply(
    lambda x: x if isinstance(x, list)
    else x.replace('[', '').replace(']', '').replace("'", '').split(', '))

# Map every label to one flat list with the titles of all the characters
# carrying that label.
label_dict_titles = {
    label: [item
            for titles in final_df.loc[final_df['labels'] == label, 'title']
            for item in titles]
    for label in final_df['labels'].unique()
}


In [None]:
# Build the same kind of dictionary as for the descriptions, now with attributes.
# Keep only the rows that have attributes; the explicit copy makes the column
# assignment below act on an independent frame (no SettingWithCopyWarning).
final_df = final_df[final_df['attributes'].notna()].copy()
# Each attributes cell is text: remove brackets and apostrophes, then split on
# ', ' to get a list. Values that are already lists (this cell was run before)
# are kept unchanged, so re-running the cell does not fail.
final_df['attributes'] = final_df['attributes'].apply(
    lambda x: x if isinstance(x, list)
    else x.replace('[', '').replace(']', '').replace("'", '').split(', '))

# Map every label to one flat list with the attributes of all the characters
# carrying that label.
label_dict_attributes = {
    label: [item
            for attributes in final_df.loc[final_df['labels'] == label, 'attributes']
            for item in attributes]
    for label in final_df['labels'].unique()
}


In [None]:
# For every label, concatenate its list of titles and its list of attributes.
# A label can be missing from label_dict_attributes (the attribute dictionary is
# built after a further filtering of final_df); it then contributes no attributes
# instead of raising a KeyError.
label_dict_titles_attributes = {
    key: titles + label_dict_attributes.get(key, [])
    for key, titles in label_dict_titles.items()
}

In [None]:
# One word cloud per label, generated from the space-joined description words
# of that label and stored under the label.
# Note: the following cell assigns `wordclouds` again.
wordclouds = {
    key: wordcloud.WordCloud(width=800, height=400, background_color="white").generate(
        ' '.join(words))
    for key, words in label_dict.items()
}
        

In [None]:
wordclouds = {}
# One word cloud per label, generated from the label's titles and attributes.
# The loop runs over the dictionary that is actually read, so a label that only
# exists in label_dict can no longer cause a KeyError.
for key, words in label_dict_titles_attributes.items():
    wordclouds[key] = wordcloud.WordCloud(width=800, height=400, background_color="white").generate(
        ' '.join(words))


In [None]:
# Plot every word cloud and save it as a JPEG in Plots/Wordcloud/.
os.makedirs('Plots/Wordcloud', exist_ok=True)
for label, cloud in wordclouds.items():
    cloud_fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
    ax.imshow(cloud)
    ax.axis("off")
    cloud_fig.tight_layout(pad=0)
    cloud_fig.savefig('Plots/Wordcloud/wordcloud_label_{}.jpg'.format(label))
    # Display the figure, then release it so the figures do not pile up in memory.
    plt.show()
    plt.close(cloud_fig)


In [None]:
# print 20 