In [1]:
import numpy as np
import pandas as pd
import json
import os

# Base data directory: the parsed script JSON files are read from its
# "parsed_scripts" subfolder and the feature CSV is written back into it.
DATA_DIR = "./data"

In [2]:
# Build the list of parsed movie records: every *.json file found in
# DATA_DIR/parsed_scripts/ is decoded and appended as one entry.
movies = []
script_path = "%s/parsed_scripts/" % DATA_DIR
for movie_json in os.listdir(script_path):
    # ignore anything in the directory that is not a JSON file
    if not movie_json.endswith(".json"):
        continue
    file_path = script_path + movie_json
    # the context manager closes each handle right after parsing instead of
    # leaving one open file per movie for the garbage collector to clean up
    with open(file_path, "r") as json_file:
        movies.append(json.load(json_file))
print("loaded %d movies" % len(movies))

loaded 757 movies


In [3]:
# display the second loaded record to inspect the structure of a parsed movie
movies[1]

{'actor_metadata': {'Aldo Maland': {'actor_id': 43, 'char_name': 'Miles'},
  'Cate Blanchett': {'actor_id': 8, 'char_name': 'Marissa Wiegler'},
  'Christian Malcolm': {'actor_id': 19, 'char_name': 'Head of Ops'},
  'Eric Bana': {'actor_id': 7, 'char_name': 'Erik Heller'},
  'Jamie Beamish': {'actor_id': 20, 'char_name': 'Burton'},
  'Jason Flemyng': {'actor_id': 24, 'char_name': 'Sebastian'},
  'Jessica Barden': {'actor_id': 21, 'char_name': 'Sophie'},
  'Joel Basman': {'actor_id': 27, 'char_name': 'Razor'},
  'John MacMillan': {'actor_id': 16, 'char_name': 'Lewis'},
  'Martin Wuttke': {'actor_id': 29, 'char_name': 'Knepfler'},
  'Michelle Dockery': {'actor_id': 25, 'char_name': 'False Marissa'},
  'Olivia Williams': {'actor_id': 23, 'char_name': 'Rachel'},
  'Paul Birchard': {'actor_id': 18, 'char_name': 'Bob'},
  'Saoirse Ronan': {'actor_id': 6, 'char_name': 'Hanna Heller'},
  'Sebastian Hülk': {'actor_id': 26, 'char_name': 'Titch'},
  'Tim Beckmann': {'actor_id': 17, 'char_name': 'W

In [4]:
# Collect every distinct genre across all movies, order them alphabetically,
# and remember each genre's position so it can be used as a column index.
genre_pool = {
    genre
    for movie in movies
    for genre in movie["movie_metadata"]["genres"]
}
categories = sorted(genre_pool)
categories_index = dict(zip(categories, range(len(categories))))
categories

['action',
 'adventure',
 'animation',
 'comedy',
 'crime',
 'drama',
 'family',
 'fantasy',
 'history',
 'horror',
 'music',
 'mystery',
 'romance',
 'science fiction',
 'thriller',
 'war',
 'western']

In [5]:
# Build the feature matrix: one row per movie, one 0/1 column per genre
# (ordered as in `categories`), followed by three trailing columns:
# nonmale, nonwhite and bechdel.
features = np.zeros((len(movies), len(categories) + 3))
for i, movie in enumerate(movies):
    # one-hot encode the movie's genres
    for cat in movie["movie_metadata"]["genres"]:
        features[i, categories_index[cat]] = 1

    # nonmale = 1 - the "male" entry of the by-line gender distribution;
    # -1 is the sentinel for "no gender data" (the distribution sums to zero).
    # .get() treats a missing "male" entry as 0 rather than raising KeyError,
    # mirroring how a missing "white" entry is handled below.
    gender_dist = movie["distribution_metadata"]["gender_dist"]["by_line"]
    if sum(gender_dist.values()) == 0:
        percent_nonmale = -1
    else:
        percent_nonmale = 1 - gender_dist.get("male", 0)

    # nonwhite = 1 - the "white" entry of the by-line race distribution;
    # -1 is the sentinel for an empty distribution, and a distribution with
    # no "white" entry yields 1.
    race_dist = movie["distribution_metadata"]["race_dist"]["by_line"]
    if len(race_dist) == 0:
        percent_nonwhite = -1
    else:
        percent_nonwhite = 1 - race_dist.get("white", 0)

    # Bechdel result stored as 1 (passes) or 0 (does not pass)
    bechdel = 1 if movie["bechdel_metadata"]["passes"] else 0
    features[i, -3:] = np.array([percent_nonmale, percent_nonwhite, bechdel])
features

array([[ 1.  ,  0.  ,  0.  , ..., -1.  , -1.  ,  0.  ],
       [ 1.  ,  1.  ,  0.  , ...,  0.69,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ...,  0.16,  0.22,  0.  ],
       ...,
       [ 1.  ,  1.  ,  0.  , ..., -1.  , -1.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.23,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.69,  0.  ,  0.  ]])

In [6]:
# Wrap the feature matrix in a DataFrame: genre columns first, then the three
# summary columns, with every row labelled by the movie's slug.
summary_cols = ["nonmale", "nonwhite", "bechdel"]
cols = categories + summary_cols
slug_index = pd.Index(
    [movie["movie_metadata"]["slug"] for movie in movies],
    name="slug",
)
df = pd.DataFrame(features, columns=cols, index=slug_index)
df

Unnamed: 0_level_0,action,adventure,animation,comedy,crime,drama,family,fantasy,history,horror,music,mystery,romance,science fiction,thriller,war,western,nonmale,nonwhite,bechdel
slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
enemy-of-the-state,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.00,-1.00,0.0
hanna,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.69,0.00,0.0
collateral-damage,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.16,0.22,0.0
shrek,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19,0.52,0.0
shakespeare-in-love,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.24,0.00,1.0
insomnia,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.23,0.00,0.0
the-matrix-reloaded,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.34,0.97,0.0
milk,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.00,0.0
se7en,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.09,0.01,0.0
four-rooms,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39,0.02,0.0


In [7]:
# Write the feature table to disk twice: once under DATA_DIR and once into
# the app's data folder.
local_csv = "%s/movie_features.csv" % DATA_DIR
app_csv = "../app/data/movie_features.csv"
for csv_path in (local_csv, app_csv):
    df.to_csv(csv_path)