In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import parse
import json
import copy

In [2]:
# Movie credits table; the "title" column is also kept as an array so a movie's
# row can be located by comparing against a title string (see get_cast).
credits = pd.read_csv('./data/tmdb_5000_credits.csv')
movie_titles = np.array(credits["title"].tolist())

# Labels for the integer gender codes stored in each cast entry.
gender_mapping = {0 : "other", 1 : "female", 2 : "male"}

# Headerless CSV: column 0 is used as the actor-name lookup list and column 1
# as the ethnicity text at the same position.
ethnicities = pd.read_csv('./data/ethnicelebs.csv', header = None)
actor_names = ethnicities[0].tolist()
actor_ethnicities = ethnicities[1].tolist()

# Parallel arrays: race_mapping[i] is the race recorded for ethnicity_mapping[i].
races = pd.read_csv('./data/ethnicities_to_races.csv')
ethnicity_mapping = np.array(races["ethnicity"].tolist())
race_mapping = np.array(races["race"].tolist())

# Script file and movie title processed by the final cell.
SCRIPT_PATH = "./data/scripts/Titanic.txt"
MOVIE_NAME = "Titanic"

In [3]:
def parse_transcript(filename):
    """Parse a screenplay text file into an ordered list of spoken lines.

    The layout is inferred from indentation. Every indentation width below 50
    that occurs on at least 20 non-blank lines is treated as a structural
    level. In ascending order, the second level is taken to be dialogue text
    and the third level the speaker name; any other indentation ends the
    current speech.

    Args:
        filename: path of the script text file.

    Returns:
        A list of ``{'speaker': ..., 'utterance': ...}`` dicts in script
        order. Parenthesised text is removed from speaker names. An empty or
        blank file yields an empty list.

    Raises:
        IndexError: if the file has text but fewer than three indentation
            levels qualify, so speaker and dialogue cannot be told apart.
    """
    spaces_regex = re.compile(r"^(\s*).*")

    # Read everything up front; the context manager closes the file even if
    # reading fails.
    with open(filename, "r") as text_file:
        lines = text_file.readlines()

    # First pass: measure the indentation of every non-blank line and count
    # how often each width occurs. A dict is used so arbitrarily deep
    # indentation cannot overflow a fixed-size table.
    line_list = []
    indent_counts = {}
    for raw in lines:
        stripped = raw.strip(' \n\t\r')
        if stripped == "":
            continue
        indent = len(spaces_regex.search(raw).group(1))
        indent_counts[indent] = indent_counts.get(indent, 0) + 1
        line_list.append((stripped, indent))

    if not line_list:
        return []

    opt = sorted(width for width, count in indent_counts.items()
                 if width < 50 and count >= 20)
    if len(opt) < 3:
        raise IndexError(
            "cannot infer script layout from %r: expected at least 3 frequent "
            "indentation levels, found %d" % (filename, len(opt)))
    dialogue_indent = opt[1]
    speaker_indent = opt[2]

    transcript = []
    speaker = ""
    utterance = ""

    def flush():
        # Record the speech collected so far, if there is a complete one.
        if utterance != "" and speaker != "":
            transcript.append(
                {'speaker': speaker, 'utterance': utterance.strip()})
            return True
        return False

    # Second pass: walk the lines and group dialogue under its speaker.
    for text, indent in line_list:
        if indent == speaker_indent:
            if flush():
                utterance = ""
            speaker = re.sub(r'\([^()]*\)', '', text).strip(' \n\t\r')
        elif indent == dialogue_indent:
            utterance += " " + text
        else:
            # Scene headings, action text, etc. terminate the current speech.
            if flush():
                utterance = ""
                speaker = ""

    # A script that ends on a dialogue line still has a pending speech.
    flush()

    return transcript

In [4]:
def get_lines(script):
    """Group the utterances parsed from a script file by speaker.

    Args:
        script: path of the script file, passed to ``parse_transcript``.

    Returns:
        A dict mapping each speaker name to the list of that speaker's
        utterances, in the order they appear in the transcript.
    """
    lines_by_speaker = {}
    for entry in parse_transcript(script):
        # Start a new list the first time a speaker is seen, then append.
        lines_by_speaker.setdefault(entry['speaker'], []).append(entry['utterance'])
    return lines_by_speaker

In [5]:
def get_cast(name):
    """Return the raw cast string of the first credits row titled ``name``.

    Looks the title up in the module-level ``movie_titles`` array and reads
    the matching position of the ``"cast"`` column of ``credits``.

    Args:
        name: movie title to look up; compared for exact equality.

    Returns:
        The ``"cast"`` cell of the first matching row (decoded with
        ``json.loads`` by ``get_gender_dict``).

    Raises:
        IndexError: if no row has that title.
    """
    matches = np.flatnonzero(movie_titles == name)
    if matches.size == 0:
        raise IndexError("no movie titled %r in the credits table" % (name,))
    # movie_titles was built from credits["title"] in row order, so the array
    # position is also the positional row index; only that one cell is read.
    return credits["cast"].iloc[int(matches[0])]

In [6]:
def get_gender_dict(cast, lines):
    """Match script speakers to cast entries and record the actor's gender.

    A speaker matches a cast entry when, ignoring case, the speaker equals
    the full character name or any space-separated word of the character
    name occurs inside the speaker name. Each cast entry is used for at most
    one speaker, and each speaker takes the first entry that matches.

    Args:
        cast: JSON string holding a list of cast entries, each with the keys
            ``character``, ``name``, ``gender`` and ``cast_id``.
        lines: dict whose keys are the speaker names found in the script.

    Returns:
        A dict mapping character name to a tuple
        ``(speaker, actor name, gender label, cast_id)``.
    """
    cast_entries = json.loads(cast)
    gender_dict = dict()

    for speaker in lines.keys():
        speaker_lc = speaker.lower()
        for position, entry in enumerate(cast_entries):
            character_lc = entry['character'].lower()
            # Drop empty tokens (blank character name, doubled spaces): the
            # empty string is a substring of everything and would make such
            # an entry match every speaker.
            tokens = [tok for tok in character_lc.split(' ') if tok]
            if speaker_lc == character_lc or any(tok in speaker_lc for tok in tokens):
                gender_dict[entry['character']] = (
                    speaker, entry['name'], gender_mapping[entry['gender']], entry['cast_id'])
                # Consume the entry so it cannot be assigned to another
                # speaker; safe because the inner loop stops right after.
                del cast_entries[position]
                break

    return gender_dict

In [7]:
def get_ethnicity_dict(genders):
    """Attach an ethnicity string to every character whose actor is known.

    The actor's name is lower-cased and its spaces replaced by hyphens, then
    looked up in the module-level ``actor_names`` list; the ethnicity is the
    ``actor_ethnicities`` item at the same position. Characters whose actor
    is not in the list are left out.

    Args:
        genders: dict mapping character name to
            ``(speaker, actor name, gender, actor id)``.

    Returns:
        A dict mapping character name to
        ``(speaker, actor name, ethnicity string)``.
    """
    found = {}
    for character, (speaker, name, _gender, _actor_id) in genders.items():
        slug = name.lower().replace(' ', '-')
        try:
            position = actor_names.index(slug)
        except ValueError:
            # Actor is not in the ethnicity list; skip this character.
            continue
        found[character] = speaker, name, actor_ethnicities[position]

    return found

In [8]:
def analyze_gender(lines, genders):
    """Compute each gender's share of spoken lines and of characters.

    Args:
        lines: dict mapping speaker name to that speaker's list of lines.
        genders: dict mapping character name to
            ``(speaker, actor name, gender, actor id)``.

    Returns:
        ``(by_line, by_char)``: two dicts keyed by gender label. ``by_line``
        holds the fraction of all counted lines and ``by_char`` the fraction
        of all characters, each rounded to two decimals.
    """
    line_counts = {}
    char_counts = {}
    total_lines = 0
    total_chars = len(genders.keys())

    for speaker, _name, gender, _actor_id in genders.values():
        spoken = len(lines[speaker])
        total_lines += spoken
        line_counts[gender] = line_counts.get(gender, 0) + spoken
        char_counts[gender] = char_counts.get(gender, 0) + 1

    # Turn the raw tallies into rounded fractions of the totals.
    by_line = {}
    by_char = {}
    for gender in line_counts:
        by_line[gender] = round(line_counts[gender] / total_lines, 2)
        by_char[gender] = round(char_counts[gender] / total_chars, 2)

    return by_line, by_char

In [9]:
def analyze_ethnicity(lines, ethnicities):
    """Compute each race's share of spoken lines and of characters.

    Capitalised words in a character's ethnicity string are looked up in the
    module-level ``ethnicity_mapping`` array; each hit contributes the
    ``race_mapping`` value at the first matching position. A character with
    several races counts fully toward each of them, so the shares of
    different races can add up to more than 1.

    Args:
        lines: dict mapping speaker name to that speaker's list of lines.
        ethnicities: dict mapping character name to
            ``(speaker, actor name, ethnicity string)``.

    Returns:
        ``(by_line, by_char, race_dict)``. ``by_line`` and ``by_char`` map a
        race to its fraction of counted lines / characters, rounded to two
        decimals. ``race_dict`` maps each character to its races joined with
        ``", "`` in sorted order (empty string when none were recognised).
    """
    by_line = dict()
    by_char = dict()
    race_dict = dict()

    total_lines = 0
    total_chars = len(ethnicities.keys())

    for char, (speaker, name, ethnicity) in ethnicities.items():
        num_lines = len(lines[speaker])
        total_lines += num_lines

        # A missing CSV cell is loaded as a non-string value (e.g. NaN);
        # treat it as "no ethnicity information" instead of crashing.
        if not isinstance(ethnicity, str):
            ethnicity = ""

        # Only capitalised words are candidate ethnicity labels.
        labels = [word for word in re.findall(r'[a-zA-Z]+', ethnicity)
                  if word[0].isupper()]

        char_race = set()
        for label in labels:
            matched = race_mapping[ethnicity_mapping == label]
            if len(matched) > 0:
                char_race.add(matched[0])

        # Sort so the joined string and the key order of the result dicts do
        # not depend on set iteration order, which varies between runs.
        ordered_races = sorted(char_race)
        race_dict[char] = ", ".join(ordered_races)

        for race in ordered_races:
            by_line[race] = by_line.get(race, 0) + num_lines
            by_char[race] = by_char.get(race, 0) + 1

    for race in by_line.keys():
        by_line[race] = round(by_line[race] / total_lines, 2)
        by_char[race] = round(by_char[race] / total_chars, 2)

    return by_line, by_char, race_dict

In [10]:
def analyze_screentime(lines, genders):
    """Return each actor's share of the counted lines.

    Args:
        lines: dict mapping speaker name to that speaker's list of lines.
        genders: dict mapping character name to
            ``(speaker, actor name, gender, actor id)``.

    Returns:
        A dict mapping actor name to the fraction of all counted lines spoken
        by that actor's character, rounded to two decimals.
    """
    # (actor name, number of lines) for every matched character, in order.
    tallies = [(name, len(lines[speaker]))
               for speaker, name, _, _ in genders.values()]
    total_lines = sum(count for _, count in tallies)

    return {name: round(count / total_lines, 2) for name, count in tallies}

In [11]:
def get_actor_metadata(lines, genders, races, screen_time):
    """Assemble the per-actor record for every matched character.

    Args:
        lines: not used by this function.
        genders: dict mapping character name to
            ``(speaker, actor name, gender, actor id)``.
        races: dict mapping character name to a race string; characters
            missing from it get ``"N/A"``.
        screen_time: dict mapping actor name to a share of lines.

    Returns:
        A dict mapping actor name to a record with the keys ``actor_id``,
        ``char_name``, ``screen_time``, ``race`` and ``gender``.
    """
    records = {}

    for character, (speaker, name, gender, actor_id) in genders.items():
        race = races.get(character, "N/A")
        share = screen_time[name]
        records[name] = {
            "actor_id": str(actor_id),
            "char_name": speaker.capitalize(),
            "screen_time": share,
            "race": race,
            "gender": gender,
        }

    return records

In [12]:
def get_distr_metadata(g_line, g_char, r_line, r_char):
    """Bundle the gender and race distributions into one dict.

    Args:
        g_line: gender distribution by line.
        g_char: gender distribution by character (stored under ``by_movie``).
        r_line: race distribution by line.
        r_char: race distribution by character (stored under ``by_movie``).

    Returns:
        A dict with the keys ``gender_dist`` and ``race_dist``, each holding
        a ``by_movie`` and a ``by_line`` entry.
    """
    return {
        "gender_dist": {"by_movie": g_char, "by_line": g_line},
        "race_dist": {"by_movie": r_char, "by_line": r_line},
    }

In [25]:
def get_metadata_json(movie, script_path):
    """Run the whole analysis for one movie and write the result as JSON.

    The output path is derived from ``script_path``: the file extension is
    removed, ``/data/scripts/`` is replaced by ``/data_results/`` and
    ``.txt`` is appended, e.g. ``./data/scripts/Titanic.txt`` becomes
    ``./data_results/Titanic.txt``. The output directory is created if
    needed.

    Args:
        movie: movie title, as it appears in the credits table.
        script_path: path of the script text file.

    Raises:
        ValueError: if the derived output path is the script itself, which
            would overwrite the input.
    """
    # Strip the real extension; slicing off a fixed three characters leaves
    # the dot behind and produces names such as "Titanic..txt".
    stem, _extension = os.path.splitext(script_path)
    out_path = stem.replace("/data/scripts/", "/data_results/") + ".txt"
    if os.path.abspath(out_path) == os.path.abspath(script_path):
        raise ValueError(
            "output path %r would overwrite the input script; expected the "
            "script to live under '/data/scripts/'" % (out_path,))

    movie_cast = get_cast(movie)
    line_dict = get_lines(script_path)
    gender_dict = get_gender_dict(movie_cast, line_dict)
    ethnicity_dict = get_ethnicity_dict(gender_dict)
    gender_by_line, gender_by_char = analyze_gender(line_dict, gender_dict)
    race_by_line, race_by_char, race_dict = analyze_ethnicity(line_dict, ethnicity_dict)
    screen_time = analyze_screentime(line_dict, gender_dict)
    actor_metadata = get_actor_metadata(line_dict, gender_dict, race_dict, screen_time)
    distribution_metadata = get_distr_metadata(gender_by_line, gender_by_char, race_by_line, race_by_char)
    metadata = {"actor_metadata" : actor_metadata, "distribution_metadata" : distribution_metadata}

    # Make sure the results directory exists before writing into it.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'w') as outfile:
        json.dump(metadata, outfile)

In [26]:
# Run the full pipeline for the movie and script configured at the top of the notebook.
get_metadata_json(MOVIE_NAME, SCRIPT_PATH)