In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import urllib.request
import json
import copy
from slugify import slugify

%matplotlib inline

In [35]:
credits = pd.read_csv('./data/movies.csv')
movie_titles_creds = np.array(credits["title"].tolist())
movies = pd.read_csv('./data/movies_data.csv')
movie_titles_movies = np.array(movies["title"].tolist())

gender_mapping = {0 : "Other", 1 : "Female", 2 : "Male"}

ethnicities = pd.read_csv('./data/ethnicelebs.csv', header = None)
actor_names = ethnicities[0].tolist()
actor_ethnicities = ethnicities[1].tolist()

races = pd.read_csv('./data/ethnicities_to_races.csv')
ethnicity_mapping = np.array(races["ethnicity"].tolist())
race_mapping = np.array(races["race"].tolist())

In [36]:
def parse_transcript(filename):

    spaces_regex = re.compile("^(\s*).*")
    location_regex = re.compile("^\s*(INT\.|EXT\.)")
    line_list = []
    transcript = []
    characters = []
    characters2 = []

    text_file = open(filename, "r")
    lines = text_file.readlines()
    text_file.close()

    num_lines = 0;
    for l in lines:
        li = l.strip(' \n\t\r')
        if li != "" and num_lines != 0:
            c = re.sub(r'\([^()]*\)', '', li).strip(' \n\t\r')
            if (c in characters) and (c not in characters2):
                characters2.append(c)
            if c.isupper() and (not li.endswith(" POV")) and ("INT." not in l) and ("EXT." not in l) and ("--" not in l) and ("_" not in l) and ("- DAY" not in l) and ("INTERIOR" not in l) and ("CLOSE ON" != li) and ("CUT TO" != li) and ("EXTERIOR" not in l) and ("NSERT " not in l) and ("BACK TO " not in l) and ("ACTION " not in l) and ("OMITTED" not in l) and ('LATER THAT NIGHT -' not in l) and ("ANOTHER ANGLE" not in l) and ("IN THE CAR" not in l) and ("IN THE LOT" not in l) and ("ACROSS THE " not in l) and ("THE END" not in l) and ("END CREDITS" not in l) and ("FADE OUT" not in l) and (":" not in l) and ("!" not in l) and ("?" not in l) and ('"' not in l) and ("NEW ANGLE" != li) and ("CLOSEUP" not in l) and ("ANGLE ON TV" != li) and (not c.endswith(".")) and (c not in characters):
                characters.append(c)
        elif li != "": 
            num_lines = 1

    speaker = "";
    utterance = "";
    still_speaking = True
    second_time = False
    previous_spaces = 0

    for l in lines:
        li = l.strip(' \n\t\r')
        if li != "": 
            spmatch = spaces_regex.search(l)
            spaces_number = len(spmatch.group(1))
            ch = re.sub(r'\([^()]*\)', '', re.sub(r'\[[^()]*\]', '', li)).strip(' \n\t\r')
            if ch == "" or ch.startswith("(") or ch.endswith(")"):
                ch = ""
            elif (("INT." in l or "EXT." in l) and utterance != "") or li == "THE END":
                transcript.append({'speaker': speaker, 'utterance': re.sub(r'\[[^()]*\]', '', utterance.strip())})
                speaker = ""
                utterance = ""
                second_time = False 
            elif ch in characters2:
                if utterance != "" and speaker != ch and speaker != "":
                    second_time = False
                    transcript.append({'speaker': speaker, 'utterance': re.sub(r'\[[^()]*\]', '', utterance.strip())})
                    utterance = ""
                elif speaker == ch:
                    second_time = True
                speaker = ch;
                still_speaking = True
                previous_spaces = 0
            elif still_speaking == True and speaker != "":
                if (spaces_number == previous_spaces or previous_spaces == 0):
                    utterance += " " + li
                    previous_spaces = spaces_number;
                second_time = False                
        elif utterance == "" and speaker != "":
            still_speaking = True
        elif utterance != "" and second_time == False:
            still_speaking = False
    return transcript

In [37]:
def get_lines(transcript):
    line_dict = dict()
    
    for i in np.arange(len(transcript)):
        speaker = transcript[i]['speaker']
        line = transcript[i]['utterance']
        if speaker in line_dict.keys():
            line_dict[speaker] += [line]
        else:
            line_dict[speaker] = [line]
    
    return line_dict

In [38]:
def get_movie_metadata(name, script):
    metadata = dict()
    movie_num = np.where(movie_titles_movies == name)
    metadata["name"] = np.array(movies["title"].tolist())[movie_num][0]
    genres = json.loads(np.array(movies["genres"].tolist())[movie_num][0])
    metadata["genres"] = [x["name"].lower() for x in genres]
    fname = script[:len(script)-4]
    metadata["script"] = fname[fname.rfind('/')+1:].lower()
    metadata["id"] = int(np.array(movies["id"].tolist())[movie_num][0])
    release_year = np.array(movies["release_date"].tolist())[movie_num][0].split('-')[0]
    metadata["release_yr"] = release_year
    metadata["rating"] = str(np.array(movies["vote_average"].tolist())[movie_num][0])
    metadata["budget"] = str(np.array(movies["budget"].tolist())[movie_num][0])
    metadata["box_office"] = str(np.array(movies["revenue"].tolist())[movie_num][0])
    metadata["synopsis"] = np.array(movies["overview"].tolist())[movie_num][0]
    metadata["num_awards"] = 0
    
    poster_title = metadata["name"].lower()
    poster_title = poster_title.replace(":", "%3c")
    poster_title = poster_title.replace("&", "%26")
    poster_title = poster_title.replace("/", "%2f")
    poster_title = poster_title.replace(",", "%2c")
    poster_title = poster_title.replace("+", "%2b")
    poster_title = '+'.join(poster_title.split(' '))
    omdb_request = "http://omdbapi.com/?apikey=" + MY_KEY + "&t=" + poster_title + "&y=" + release_year
    
    try:
        with urllib.request.urlopen(omdb_request) as url:
            omdb_results = json.loads(url.read().decode())
        metadata["review_score"] = omdb_results["imdbRating"]
        metadata["poster_image_url"] = omdb_results["Poster"]
        metadata["imdb_url"] = "https://www.imdb.com/title/" + omdb_results["imdbID"]
    except:
        metadata["review_score"] = "N/A"
        metadata["poster_image_url"] = "N/A"
        metadata["imdb_url"] = "N/A"
        
    metadata["slug"] = re.sub(r'[-\s]+', '-', (re.sub(r'[^\w\s-]', '',metadata["name"]).strip().lower()))

    return metadata

In [39]:
def get_cast_and_crew(name):
    movie_num = np.where(movie_titles_creds == name)
    cast = np.array(credits["cast"].tolist())[movie_num][0]
    crew = np.array(credits["crew"].tolist())[movie_num][0]
    return cast, crew

In [40]:
def get_actor_metadata(cast):
    metadata = dict()
    
    char_list = json.loads(cast)
    for person in char_list:
        metadata[person['name']] = {'actor_id' : person['cast_id'], 'char_name' : person['character']}
    return metadata

In [41]:
def get_crew_metadata(crew):
    metadata = dict()
    crew_list = json.loads(crew)
    for person in crew_list:
        metadata[person['name']] = {'crew_id' : person['id'], 'job_name' : person['job']}
    return metadata

In [42]:
def hashfeatures(baby, B, FIX):
    v = np.zeros(B)
    for m in range(FIX):
        featurestring = "prefix" + baby[:m]
        v[hash(featurestring) % B] = 1
        featurestring = "suffix" + baby[-1*m:]
        v[hash(featurestring) % B] = 1
    return v

def name2features(filename, B=104729, FIX=5, LoadFile=True):
    if LoadFile:
        with open(filename, 'r') as f:
            babynames = [x.rstrip() for x in f.readlines() if len(x) > 0]
    else:
        babynames = filename.split('\n')
    n = len(babynames)
    X = np.zeros((n, B))
    for i in range(n):
        X[i,:] = hashfeatures(babynames[i], B, FIX)
    return X

def genTrainFeatures(dimension=128, fix=3):
    Xgirls = name2features("./girls.train", B=dimension, FIX=fix)
    Xboys = name2features("./boys.train", B=dimension, FIX=fix)
    X = np.concatenate([Xgirls, Xboys])
    Y = np.concatenate([-np.ones(len(Xgirls)), np.ones(len(Xboys))])
    ii = np.random.permutation([i for i in range(len(Y))])
    return X[ii, :], Y[ii]
X,Y = genTrainFeatures(128)

def naivebayesPY(x,y):
    y = np.concatenate([y, [-1,1]])
    n = len(y)
    unique_elements, counts_elements = np.unique(y, return_counts=True)
    neg = counts_elements[0] / n
    pos = counts_elements[1] / n
    return pos, neg
pos,neg = naivebayesPY(X,Y)

def naivebayesPXY(x,y):
    n, d = x.shape
    x = np.concatenate([x, np.ones((2,d))])
    y = np.concatenate([y, [-1,1]])
    n, d = x.shape
    x_pos = x[np.where(y==1)]
    x_neg = x[np.where(y==-1)]
    x_pos_sum = np.sum(x_pos, axis=0)
    x_neg_sum = np.sum(x_neg, axis=0)
    x_pos_sum_denom = np.sum(x_pos_sum)
    x_neg_sum_denom = np.sum(x_neg_sum)
    posprob = np.array([x_pos_sum/x_pos_sum_denom])
    negprob = np.array([x_neg_sum/x_neg_sum_denom])
    return posprob, negprob
posprob,negprob = naivebayesPXY(X,Y)

def naivebayes(x,y,xtest):
    ppos, pneg = naivebayesPY(x,y)
    theta_pos, theta_neg = naivebayesPXY(x,y)
    log_theta_pos = np.log(theta_pos)
    log_theta_neg = np.log(theta_neg)
    return (np.sum(xtest * log_theta_pos) + np.log(ppos) - (np.sum(xtest * log_theta_neg) + np.log(pneg)))
p = naivebayes(X,Y,X[0,:])

def naivebayesCL(x,y):
    n, d = x.shape
    ppos, pneg = naivebayesPY(x,y)
    theta_pos, theta_neg = naivebayesPXY(x,y)
    log_theta_pos = np.log(theta_pos)
    log_theta_neg = np.log(theta_neg)
    w = log_theta_pos - log_theta_neg
    b = np.log(ppos) - np.log(pneg)
    return w,b
w,b = naivebayesCL(X,Y)

def classifyLinear(x,w,b=0):
    w = w.reshape(-1)
    class_raw = np.dot(w.T, x.T) + b
    result = np.array(np.sign(class_raw))
    np.place(result, result == 0, [-1])
    return result
DIMS = 128
X,Y = genTrainFeatures(DIMS)
w,b=naivebayesCL(X,Y)
error = np.mean(classifyLinear(X,w,b) != Y)

def classify_name(name):
    name = name.capitalize()
    xtest = name2features(name, B = DIMS, LoadFile = False)
    pred = classifyLinear(xtest,w,b)[0]
    if pred > 0:
        return "Male"
    else:
        return "Female"

In [43]:
def get_gender_dict(cast, lines):
    char_list = json.loads(cast)
    gender_dict = dict()
    classified_gender_dict = dict()
    
    some_counter = 0
    
    for speaker in lines.keys():
        some_counter += 1
        found_match = False
        for d in char_list:
            char = [x.lower() for x in d['character'].split(' ')]
            found_character = speaker.lower() == d['character'].lower() or speaker.lower() in char
            found_speaker = any(x == speaker.lower() for x in char) or any(x in speaker.lower() for x in char)
            found_match = found_character or found_speaker
            if found_match:
                overall_gender_dict = dict()
                char_gender = classify_name(d['character'])
                speaker_gender = classify_name(speaker)
                cast_gender = gender_mapping[d['gender']]
                
                if char_gender in overall_gender_dict.keys():
                    overall_gender_dict[char_gender] += 1
                else:
                    overall_gender_dict[char_gender] = 1
                if speaker_gender in overall_gender_dict.keys():
                    overall_gender_dict[speaker_gender] += 1
                else:
                    overall_gender_dict[speaker_gender] = 1
                if cast_gender in overall_gender_dict.keys():
                    overall_gender_dict[cast_gender] += 1
                else:
                    overall_gender_dict[cast_gender] = 1
                    
                overall_gender = max(overall_gender_dict.keys(), key=(lambda k: overall_gender_dict[k]))
                
                if cast_gender != "Other" and overall_gender != cast_gender:
                    overall_gender = cast_gender
                    
                if d['character'] not in gender_dict.keys():
                    gender_dict[d['character']] = speaker, d['name'], overall_gender, d['cast_id']
                else:
                    gender_dict[d['character'] + str(some_counter)] = speaker, d['name'], overall_gender, d['cast_id']
                char_list.remove(d)
                break
        if not found_match:
            if any(x.lower() in ['man', 'boy', 'men', 'boys'] for x in speaker.split(' ')):
                if speaker not in gender_dict.keys() and speaker not in classified_gender_dict.keys():
                    classified_gender_dict[speaker] = speaker, 'N/A', 'Male', 'N/A'
                else:
                    classified_gender_dict[speaker + str(some_counter)] = speaker, 'N/A', 'Male', 'N/A'
            elif any(x.lower() in ['woman', 'girl', 'women', 'girls'] for x in speaker.split(' ')):
                if speaker not in gender_dict.keys() and speaker not in classified_gender_dict.keys():
                    classified_gender_dict[speaker] = speaker, 'N/A', 'Female', 'N/A'
                else:
                    classified_gender_dict[speaker + str(some_counter)] = speaker, 'N/A', 'Female', 'N/A'
            else:
                final_gender = classify_name(speaker)
                if speaker not in gender_dict.keys() and speaker not in classified_gender_dict.keys():
                    classified_gender_dict[speaker] = speaker, 'N/A', final_gender, 'N/A'
                else:
                    classified_gender_dict[speaker + str(some_counter)] = speaker, 'N/A', final_gender, 'N/A'
            
    new_gender_dict = copy.deepcopy(gender_dict)
    new_gender_dict.update(classified_gender_dict)
    
    return gender_dict, new_gender_dict

In [44]:
def get_crew_gender_dict(crew):
    crew_list = json.loads(crew)
    gender_dict = dict()
    
    for member in crew_list:
        name = member['name']
        given_gender = gender_mapping[member['gender']]
        classified_gender = classify_name(name)
        gender = given_gender
        if gender == "Other":
            gender = classified_gender
        
        gender_dict[name] = gender
    
    return gender_dict

In [45]:
def get_ethnicity_dict(genders):
    ethnicity_dict = dict()
    for character in genders.keys():
        speaker, name, gender, actor_id = genders[character]
        actor = '-'.join(name.lower().split(' '))
        try:
            actor_ethnicity = actor_ethnicities[actor_names.index(actor)]
            if(type(actor_ethnicity) != str):
                actor_ethnicity = "N/A"
            ethnicity_dict[character] = speaker, name, actor_ethnicity
        except ValueError:
            pass
        
    return ethnicity_dict

In [46]:
def analyze_gender(lines, genders):
    by_line = dict()
    by_char = dict()
    
    total_lines = 0
    total_chars = len(genders.keys())
    
    for char in genders.keys():
        speaker, name, gender, actor_id = genders[char]
        num_lines = len(lines[speaker])
        total_lines += num_lines
        
        if gender in by_line.keys():
            by_line[gender] += num_lines
            by_char[gender] += 1
        else:
            by_line[gender] = num_lines
            by_char[gender] = 1
            
    for char in by_line.keys():
        by_line[char] = round(by_line[char] / total_lines, 2)
        by_char[char] = round(by_char[char] / total_chars, 2)
        
    try:
        by_line["Male"] = by_line["Male"]
    except KeyError:
        by_line["Male"] = 0
        
    try:
        by_line["Female"] = by_line["Female"]
    except KeyError:
        by_line["Female"] = 0
        
    try:
        by_line["Other"] = by_line["Other"]
    except KeyError:
        by_line["Other"] = 0
        
    try:
        by_char["Male"] = by_char["Male"]
    except KeyError:
        by_char["Male"] = 0
        
    try:
        by_char["Female"] = by_char["Female"]
    except KeyError:
        by_char["Female"] = 0
        
    try:
        by_char["Other"] = by_char["Other"]
    except KeyError:
        by_char["Other"] = 0
        
    by_line = {"Male" : by_line["Male"], "Female" : by_line["Female"], "Other" : by_line["Other"]}
    by_char = {"Male" : by_char["Male"], "Female" : by_char["Female"], "Other" : by_char["Other"]}
    
    return by_line, by_char

In [47]:
def analyze_crew_gender(crew_genders):
    total_crew = len(crew_genders.keys())
    by_member = dict()
    
    for member in crew_genders.keys():
        gender = crew_genders[member]
        
        if gender in by_member.keys():
            by_member[gender] += 1
        else:
            by_member[gender] = 1
            
    for member in by_member.keys():
        by_member[member] = round(by_member[member] / total_crew, 2)
        
    try:
        by_member["Male"] = by_member["Male"]
    except KeyError:
        by_member["Male"] = 0
        
    try:
        by_member["Female"] = by_member["Female"]
    except KeyError:
        by_member["Female"] = 0
        
    try:
        by_member["Other"] = by_member["Other"]
    except KeyError:
        by_member["Other"] = 0
        
    by_member = {"Male" : by_member["Male"], "Female" : by_member["Female"], "Other" : by_member["Other"]}
    
    return by_member

In [48]:
def analyze_ethnicity(lines, ethnicities):
    by_line = dict()
    by_char = dict()
    race_dict = dict()
    
    total_lines = 0
    total_chars = len(ethnicities.keys())
    
    for char in ethnicities.keys():
        speaker, name, ethnicity = ethnicities[char]
        num_lines = len(lines[speaker])
        total_lines += num_lines
        
        char_ethnicities = re.findall(r'[a-zA-Z]+', ethnicity)
        char_ethnicities = [x for x in char_ethnicities if x[0].isupper()]
        char_race = set()
         
        for e in char_ethnicities:
            try:
                race_num = np.where(ethnicity_mapping == e)
                races = race_mapping[race_num]
                if len(races) > 0:
                    char_race.add(races[0])
            except Error:
                pass
        
        race_dict[char] = ", ".join(char_race)     
        
        
        for race in char_race:
            if race in by_line.keys():
                by_line[race] += num_lines
                by_char[race] += 1
            else:
                by_line[race] = num_lines
                by_char[race] = 1
                
    for char in by_line.keys():
        by_line[char] = round(by_line[char] / total_lines, 2)
        by_char[char] = round(by_char[char] / total_chars, 2)
            
    return by_line, by_char, race_dict

In [49]:
def find_next_female_speaker(transcript, genders, start_loc):
    #genders = bechdel_genders in analyze_bechdel
    #start_loc = where in transcript to start looking
    #returns index of transcript with next female speaker if exists and -1 otherwise
    
    if start_loc >= len(transcript):
        return -1
    
    else:
        curr_transcript = transcript[start_loc:]
        offset = 0
        while offset < len(curr_transcript):
            curr_speaker = curr_transcript[offset]['speaker']
            curr_gender = genders[curr_speaker]
            if curr_gender != "Female":
                offset += 1
            else:
                break
        if offset == len(curr_transcript):
            return -1
        else:
            return start_loc + offset

In [77]:
def find_female_conversation(transcript, genders, start_loc):
    
    if start_loc >= len(transcript):
        return [], -1
    curr_transcript = transcript[start_loc:]
    
    index = 0
    conversation = []
    names = set()
    while index < len(curr_transcript):
        curr_speaker = curr_transcript[index]['speaker']
        curr_gender = genders[curr_speaker]
        if curr_gender == "Female":
            names.add(curr_speaker)
        if curr_gender != "Female":
            if len(conversation) <= 1 or len(names) < 2:
                return [], start_loc + index
            else:
                return conversation, start_loc + index
        else:
            conversation.append(curr_transcript[index])
            index += 1
    
    if len(conversation) <= 1:
        return [], start_loc + index
    else:
        return conversation, start_loc + index

In [78]:
def analyze_conversation(conversation, genders):
    
    genders_by_speaker = dict()
    genders_by_character = dict()
    
    for name in genders.keys():
        speaker, _, gender, _ = genders[name]
        genders_by_speaker[speaker] = gender
        genders_by_character[name] = gender
    
    male_words = ['he', 'him', 'his', 'himself', 'boy', 'boys', 'man', 'men', 'husband', 'son', 'father', 'brother', 'dad']
    all_words = set() 
    for line in conversation:
        words = set(re.findall(r'\w+', line['utterance']))
        all_words.update(words)
        
    word_list = list(all_words)
    uppercase_words = [x for x in word_list if len(x) > 0 and x[0].isupper()]
    lowercase_words = [x.lower() for x in word_list]
    
    uses_pronouns = any(x in male_words for x in lowercase_words)
    uses_names = False
    
    for name in uppercase_words:
#         found_match = False
#         overall_gender = ""
        
#         for d in genders_by_speaker.keys():
#             char = [x.lower() for x in d.split(' ')]
#             found_character = name.lower() == d.lower() or name.lower() in char
#             found_speaker = any(x == name.lower() for x in char) or any(x in name.lower() for x in char)
#             found_match = found_character or found_speaker
#             if found_match:
#                 overall_gender = genders_by_speaker[d]
#                 del genders_by_speaker[d]
#                 break

#         if overall_gender == 'Male':
#             uses_names = True
#             break
            
#         if not found_match:
#             for d in genders_by_character.keys():
#                 char = [x.lower() for x in d.split(' ')]
#                 found_character = name.lower() == d.lower() or name.lower() in char
#                 found_speaker = any(x == name.lower() for x in char) or any(x in name.lower() for x in char)
#                 found_match = found_character or found_speaker
#                 if found_match:
#                     overall_gender = genders_by_character[d]
#                     del genders_by_character[d]
#                     break
                    
#         if overall_gender == 'Male':
#             uses_names = True
#             break
            
#         if not found_match:
#             if any(x.lower() in ['man', 'boy', 'men', 'boys'] for x in name.split(' ')):
#                 overall_gender = 'Male'
#             elif any(x.lower() in ['woman', 'girl', 'women', 'girls'] for x in speaker.split(' ')):
#                 overall_gender = 'Female'
#             else:
#                 overall_gender = classify_name(name)
                
        if any(x.lower() in ['man', 'boy', 'men', 'boys'] for x in name.split(' ')):
            overall_gender = 'Male'
        elif any(x.lower() in ['woman', 'girl', 'women', 'girls'] for x in speaker.split(' ')):
            overall_gender = 'Female'
        else:
            overall_gender = classify_name(name)
            
        if overall_gender == 'Male':
            uses_names = True
            break
    
    return not (uses_pronouns or uses_names)

In [79]:
def analyze_bechdel(transcript, genders):
    
    bechdel_genders = dict()
    
    for g in genders.keys():
        speaker, _, gender, _ = genders[g]
        bechdel_genders[speaker] = gender
        
        
    if(len(transcript) == 0):
        return dict(), False
    
    else:
        index = find_next_female_speaker(transcript, bechdel_genders, 0)
        while(index != -1 and index < len(transcript)):
            conversation, end_index = find_female_conversation(transcript, bechdel_genders, index)
            end_index += 1
            if len(conversation) == 0:
                index = find_next_female_speaker(transcript, bechdel_genders, end_index)
            else:
                valid_conversation = analyze_conversation(conversation, genders)
                if valid_conversation:
                    return conversation, True
                else:
                    index = find_next_female_speaker(transcript, bechdel_genders, end_index)
        return dict(), False

In [80]:
def analyze_screentime(lines, genders):
    total_lines = 0
    screentime_dict = dict()
    
    for char in genders.keys():
        speaker, name, _, _ = genders[char]
        num_lines = len(lines[speaker])
        total_lines += num_lines
        screentime_dict[name] = num_lines
    
    for name in screentime_dict.keys():
        screentime_dict[name] = round(screentime_dict[name] / total_lines, 2)
        
    return screentime_dict

In [81]:
def get_char_metadata(lines, genders, races, screen_time):
    
    metadata = dict()
    
    for char in genders.keys():
        speaker, name, gender, actor_id = genders[char]
        race = "N/A"
        if char in races.keys():
            race = races[char]
        time = screen_time[name]
        metadata[name] = {"actor_id" : str(actor_id), "char_name" : speaker.capitalize(), "screen_time" : time, "race" : race, "gender" : gender}
    return metadata

In [82]:
def get_distr_metadata(g_line, g_char, g_crew, r_line, r_char):
    metadata = dict()
    
    metadata["gender_dist"] = {
        "by_movie": {k.lower(): v for k, v in g_char.items()},
        "by_line": {k.lower(): v for k, v in g_line.items()},
        "by_crew" : {k.lower(): v for k, v in g_crew.items()}}
    metadata["race_dist"] = {
        "by_movie": {k.lower(): v for k, v in r_char.items()},
        "by_line": {k.lower(): v for k, v in r_line.items()}
    }
    metadata["stereotype_dist"] = {"by_movie" : [["Stereotypical", 0], ["Not stereotypical", 1]],
                                   "by_line" : [["Stereotypical", 0], ["Not stereotypical", 1]]}
    
    return metadata

In [83]:
def get_bechdel_metadata(transcript, genders):
    metadata = dict()
    
    conversation, passes = analyze_bechdel(transcript, genders)
    metadata["passes"] = passes
    metadata["conversation"] = conversation
    
    return metadata

In [84]:
def get_metadata_json(movie, script_path):
    """
    writes movie json to ./data/parsed_scripts/[movie_slug].json
    """

    movie_metadata = get_movie_metadata(movie, script_path)    
    
    movie_cast, movie_crew = get_cast_and_crew(movie)
    actor_metadata = get_actor_metadata(movie_cast)
    crew_metadata = get_crew_metadata(movie_crew)
    transcript = parse_transcript(script_path)
    line_dict = get_lines(transcript)
    gender_dict, gender_dict_by_lines = get_gender_dict(movie_cast, line_dict)
    
    crew_gender_dict = get_crew_gender_dict(movie_crew)
    ethnicity_dict = get_ethnicity_dict(gender_dict)
    gender_by_line, gender_by_char = analyze_gender(line_dict, gender_dict_by_lines)
    gender_by_crew = analyze_crew_gender(crew_gender_dict)
    race_by_line, race_by_char, race_dict = analyze_ethnicity(line_dict, ethnicity_dict)
    screen_time = analyze_screentime(line_dict, gender_dict)
    
    
    char_metadata = get_char_metadata(line_dict, gender_dict, race_dict, screen_time)
    distribution_metadata = get_distr_metadata(gender_by_line, gender_by_char, gender_by_crew, race_by_line, race_by_char)
    bechdel_metadata = get_bechdel_metadata(transcript, gender_dict_by_lines)
    
    print(bechdel_metadata)
    
    metadata = {"movie_metadata" : movie_metadata, 
                "actor_metadata" : actor_metadata,
                "crew_metadata" : crew_metadata,
                "char_metadata" : char_metadata, 
                "distribution_metadata" : distribution_metadata,
                "bechdel_metadata" : bechdel_metadata}
    
    file = script_path[:-4]
    file = os.path.dirname(file).replace("/scripts", "/parsed_scripts") + "/%s.json" % movie_metadata["slug"]
    
    target_dir = os.path.dirname(file)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    with open(file, 'w+') as outfile:
        json.dump(metadata, outfile, indent=4, sort_keys=True)
    
    # also write to app data
    with open("../app/data/movies/%s.json" % movie_metadata["slug"], 'w+') as outfile:
        json.dump(metadata, outfile, indent=4, sort_keys=True)

In [85]:
MY_KEY = "cef453b6"

transcripts = os.listdir("./data/scripts")

def parse_title(title):
    title = title[:-4].replace("-", " ")
    if title[-5:] == ", The":
        title = "The " + title[:-5] 
    return title

tran_movies = [parse_title(title) for title in transcripts]

with open('./data/movies.txt') as f:
    all_movies = f.read().splitlines()
    
valid_movies = set([])
for index in np.arange(len(all_movies)):
    for movie in all_movies:
        clean_tran_movie = re.sub(r'\W+', '', all_movies[index]).lower()
        clean_cred_movie = re.sub(r'\W+', '', movie).lower()
        if clean_tran_movie == clean_cred_movie[:len(clean_tran_movie)]:
            valid_movies.add(movie)

for tran_movie_index in np.arange(len(tran_movies)):
    matching_movie = ""
    for movie in valid_movies:
        clean_tran_movie = re.sub(r'\W+', '', tran_movies[tran_movie_index]).lower()
        clean_cred_movie = re.sub(r'\W+', '', movie).lower()
        if clean_tran_movie == clean_cred_movie[:len(clean_tran_movie)]:
            matching_movie = movie
            break
    if matching_movie != "":
        SCRIPT_PATH = "./data/scripts/" + transcripts[tran_movie_index]
        MOVIE_NAME = matching_movie
        get_metadata_json(MOVIE_NAME, SCRIPT_PATH)

{'passes': True, 'conversation': [{'speaker': 'KAT', 'utterance': 'I guess I got in'}, {'speaker': 'SHARON', 'utterance': "What's a synonym for throbbing?"}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'KRISTI', 'utterance': '... you can be the highest person on the continent!'}, {'speaker': 'MEGAN', 'utterance': "Didn't you see it?"}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'WILL', 'utterance': "I can't tell, you're wearing a mask."}, {'speaker': 'JAY', 'utterance': 'Hey...on your forehead.'}]}
{'passes': True, 'conversation': [{'speaker': 'SMITH', 'utterance': "I'd be sick at a swing like that, too."}, {'speaker': 'OLDER WOMAN', 'utterance': 'Are you alright, honey?'}]}
{'passes': True, 'conversation': [{'speaker

{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'ELLIOTT AND SHAUNA', 'utterance': "C'mon, it's her first time!"}, {'speaker': 'ANNA', 'utterance': "A ca vas! C'est bon!"}]}
{'passes': True, 'conversation': [{'speaker': 'LOTTE', 'utterance': 'You know, maybe you should speak to someone about this.'}, {'speaker': 'WOMAN #1', 'utterance': 'Seven and a half, right?'}]}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'SETHE', 'utterance': 'What might your name be?'}, {'speaker': 'BELOVED', 'utterance': 'Beloved.'}]}
{'passes': True, 'conversation': [{'speaker': 'CAROL', 'utterance': 'I beg your pardon?'}, {'speaker': 'MADGE', 'utterance': 'Oh god.'}]}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker'

{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'N', 'utterance': '7.'}, {'speaker': 'S', 'utterance': 'hit!'}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'UGGA', 'utterance': 'Ooh, more fire babies.'}, {'speaker': 'EEP', 'utterance': 'Hi.'}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'NGUNDA OTI', 'utterance': '... My first wife and I are captured by neighbor tribe, cannibals... I escaped across the river...'}, {'speaker': 'AN OLD WOMAN', 'utterance': 'You escaped cannibals?'}, {'speaker': 'NGUNDA OTI', 'utterance': "My wife, she can't swim, so she eaten."}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': '

{'passes': True, 'conversation': [{'speaker': 'HARRY', 'utterance': 'I gotta go now.'}, {'speaker': 'FRANCES', 'utterance': 'Harry, please!'}]}
{'passes': True, 'conversation': [{'speaker': 'E', 'utterance': '.. May I have a cookie please?'}, {'speaker': 'MOM', 'utterance': 'Oh- yes. Of course.'}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'SANDY', 'utterance': "Drive careful and don't drownd your dumb self."}, {'speaker': 'BRENDA', 'utterance': 'You think they fell asleep?'}]}
{'passes': True, 'conversation': [{'speaker': 'TAMARA', 'utterance': 'What are you talking about?'}, {'speaker': 'MCCULLOCB', 'utterance': "You never turned in your final biology project, so I've had your diploma rescinded."}, {'speaker': 'TAMARA', 'utterance': "You can't do that..."}]}
{'passes': True, 'conversation': [{'speaker': 'AMY', 'utterance': 'Now?'}, {'speaker': 'JANE', 'utterance': 'I 

{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'ANOTHER EXECUTIVE', 'utterance': 'A proxy!'}, {'speaker': 'YET ANOTHER EXECUTIVE', 'utterance': 'A pawn!'}]}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'HOLOGRAM', 'utterance': 'Someone is always watching.'}, {'speaker': 'CALVIN', 'utterance': 'You asked for me?'}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'JULIE', 'utterance': 'looks out the window with glazed-over eyes.'}, {'speaker': 'MAC', 'utterance': "Jeez, that's a full carat --"}]}
{'passes': True, 'conversation': [{'speaker': 'JUDGE MCNEILY', 'utterance': "I'll allow it."}, {'speaker': 'ANNIE', 'utterance': 'I had twenty eight years in the world to observe all kinds of fathers.'}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversatio

{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'SHONDRA', 'utterance': '... aaaaaa...'}, {'speaker': 'BESS', 'utterance': '... aaaaaa...'}]}
{'passes': True, 'conversation': [{'speaker': 'PINTA', 'utterance': 'Come on.   Come on.   Bird!'}, {'speaker': "PINTA'S VOICE", 'utterance': 'CREASY!'}]}
{'passes': True, 'conversation': [{'speaker': 'MICHAEL', 'utterance': 'Andy...!'}, {'speaker': 'JANICE', 'utterance': 'Why is Andy doing this?  Why?... Why?'}]}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'EMILY', 'utterance': 'Are you fucking kidding me?'}, {'speaker': 'LISA', 'utterance': 'Why does this not shock me?'}]}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'SHOOTING DRAFT', 'utterance': 'MARLEY & ME 66'}, {'speaker': 'JENNY', 'utterance': 'KNOW'}]}
{'passes': True, 'conversation': [{'speaker': 'LUCY', 'utterance': 'Hello. Hello?'}, {

{'passes': True, 'conversation': [{'speaker': 'ELIZABETH', 'utterance': 'You know, Peggy Sue, your mother said you had a dream that I died.'}, {'speaker': 'PEGGY', 'utterance': "I wish she hadn't."}, {'speaker': 'ELIZABETH', 'utterance': "I'm not afraid. I know exactly when I'm going to die."}]}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'RACHEL', 'utterance': "I think I'll sit this one out, doc."}, {'speaker': 'ELLIE', 'utterance': 'I want to look around, daddy-- may I?'}]}
{'passes': True, 'conversation': [{'speaker': 'ANTHEA', 'utterance': "Didn't expect to see you... ?"}, {'speaker': 'SHELBY', 'utterance': "I'm looking through the files but..."}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'THE YOUNG WOMAN', 'utterance': 'Why did I do it? Why did I do it? Why did I do it? Why did I do it?'}, {'speaker': "A MAN'S VOICE", 'utterance': "I'm telling you, it's a disgra

{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'GRANT', 'utterance': 'You ready, sugarplum?'}, {'speaker': 'STARLA', 'utterance': 'Bye.'}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'GIRL', 'utterance': "I'm 20."}, {'speaker': 'ASHLEIGH', 'utterance': "I'm 19,"}]}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'IKE', 'utterance': 'Papa mama simi.'}, {'speaker': 'CARTMAN', 'utterance': 'Come on, just get to the message board!'}]}
{'passes': True, 'conversation': [{'speaker': 'FLOR', 'utterance': 'Cristina..'}, {'speaker': 'CRISTINA', 'utterance': 'My mother wishes me to repr

{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'KELLY', 'utterance': "I can't believe you called Sam.  What's the matter with you?"}, {'speaker': 'SUZIE', 'utterance': "I'm scared, that's what.  I'm scared there's no one to trust."}, {'speaker': 'KELLY', 'utterance': 'You can trust me.'}]}
{'passes': True, 'conversation': [{'speaker': 'RITA', 'utterance': 'You know you could put a gun on that thing.'}, {'speaker': 'MUNITIA', 'utterance': 'I have them square in my sights, sire.'}]}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'DOROTHY', 'utterance': 'Oh!'}, {'speaker': 'WITCH', 'utterance': '...hear of it. Why, my little par-....'}]}
{'passes': True, 'conversation': [{'speaker': 'CLIENT #2', 'utterance': 'Give me 300 shares.'}, {'speaker': 'CLIENT #3', 'utterance': '1200 shares.'}]}
{'passes': False, 'conversation': {}}
{'passes': True, 'conversation': [{'speaker': 'OPERATOR', 'utterance': 'Negative, comrade.'},