In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import parse
import json
import copy
from slugify import slugify

In [None]:
# Credits table. get_cast() reads its "cast" column for the row whose "title"
# equals the requested movie name.
# NOTE(review): the file is named movies.csv but is used as the credits table,
# while movies_data.csv below is the movie-level table -- confirm the naming.
credits = pd.read_csv('./data/movies.csv')
movie_titles_creds = np.array(credits["title"].tolist())
# Movie-level table. get_movie_metadata() reads title, genres, id, release_date,
# vote_average, budget, revenue and overview from it.
movies = pd.read_csv('./data/movies_data.csv')
movie_titles_movies = np.array(movies["title"].tolist())

# Translates the integer "gender" field of a cast entry into a label
# (used by get_gender_dict()).
gender_mapping = {0 : "Other", 1 : "Female", 2 : "Male"}

# Headerless CSV, so the columns are labelled 0 and 1. Column 0 is searched by
# get_ethnicity_dict() using a lowercase, hyphen-joined actor name; column 1
# supplies the ethnicity value stored at the same position.
ethnicities = pd.read_csv('./data/ethnicelebs.csv', header = None)
actor_names = ethnicities[0].tolist()
actor_ethnicities = ethnicities[1].tolist()

# Parallel arrays: analyze_ethnicity() finds a word in ethnicity_mapping and
# takes the entry of race_mapping at the same position.
races = pd.read_csv('./data/ethnicities_to_races.csv')
ethnicity_mapping = np.array(races["ethnicity"].tolist())
race_mapping = np.array(races["race"].tolist())

# Initial values only: the batch loop in the last cell reassigns both names
# for every script it processes.
SCRIPT_PATH = "./data/scripts/Titanic.txt"
MOVIE_NAME = "Titanic"

In [None]:
def parse_transcript(filename):
    """Extract the dialogue of a plain-text screenplay as an ordered list of turns.

    The parser works in two passes over the file:

    1. Speaker detection. After removing ``(parentheticals)``, an upper-case
       line is a speaker-cue candidate unless the raw line contains one of the
       scene-heading / direction markers (``INT.``, ``EXT.``, ``FADE OUT``,
       ``:``, ``!``, ``?``, ``"`` ...) or the cue ends with a period. Only
       candidates that occur at least twice are kept as speakers. The first
       non-empty line of the file is ignored in this pass.
    2. Dialogue collection. Text lines following a speaker cue are appended to
       that speaker's utterance until a blank line ends the speech. Lines that
       consist solely of a parenthetical are skipped. Consecutive speeches by
       the same speaker are merged into one turn; a turn is emitted when a
       different speaker's cue is reached, and once more at end of file.

    Args:
        filename: Path of the screenplay text file.

    Returns:
        A list of ``{'speaker': str, 'utterance': str}`` dicts in script order.
    """
    parenthetical = re.compile(r'\([^()]*\)')
    strip_chars = ' \n\t\r'
    # Substrings whose presence in the raw line disqualifies an upper-case
    # line from being a speaker cue.
    excluded_markers = ("INT.", "EXT.", "OMITTED", "ANOTHER ANGLE", "THE END",
                        "END CREDITS", "FADE OUT", ":", "!", "?", '"')

    # Context manager guarantees the handle is closed even if reading fails.
    with open(filename, "r") as text_file:
        lines = text_file.readlines()

    # ---- Pass 1: find cues that appear at least twice ----------------------
    seen_once = set()
    speakers = set()
    past_first_line = False
    for raw in lines:
        stripped = raw.strip(strip_chars)
        if stripped == "":
            continue
        if not past_first_line:
            # The first non-empty line is never considered as a cue.
            past_first_line = True
            continue
        cue = parenthetical.sub('', stripped).strip(strip_chars)
        if cue in seen_once:
            speakers.add(cue)
        elif (cue.isupper()
              and not cue.endswith(".")
              and not any(marker in raw for marker in excluded_markers)):
            seen_once.add(cue)

    # ---- Pass 2: attach dialogue lines to speakers -------------------------
    transcript = []
    speaker = ""
    utterance = ""
    still_speaking = True
    # True right after the current speaker's cue is repeated, so that the
    # blank line preceding the repeated cue's text does not end the speech.
    repeated_cue = False

    for raw in lines:
        stripped = raw.strip(strip_chars)
        if stripped == "":
            if utterance == "":
                still_speaking = True
            elif not repeated_cue:
                # A blank line after spoken text ends the speech; whatever
                # follows until the next cue is not dialogue.
                still_speaking = False
            continue

        cue = parenthetical.sub('', stripped).strip(strip_chars)
        if cue == "":
            # Line was nothing but a parenthetical.
            continue
        if cue in speakers:
            if utterance != "" and speaker != cue and speaker != "":
                repeated_cue = False
                transcript.append({'speaker': speaker, 'utterance': utterance.strip()})
                utterance = ""
            elif speaker == cue:
                repeated_cue = True
            speaker = cue
            still_speaking = True
        elif still_speaking and speaker != "":
            utterance += " " + stripped
            repeated_cue = False

    # Flush the final turn: inside the loop a turn is only emitted when the
    # next speaker's cue shows up, so the last speech would otherwise be lost.
    if speaker != "" and utterance != "":
        transcript.append({'speaker': speaker, 'utterance': utterance.strip()})

    return transcript

In [None]:
def get_lines(script):
    """Group the utterances of a parsed script by speaker.

    Args:
        script: Path of the screenplay file, passed to parse_transcript().

    Returns:
        A dict mapping each speaker to the list of that speaker's utterances,
        in the order they occur in the transcript.
    """
    utterances_by_speaker = {}
    for turn in parse_transcript(script):
        # Create the speaker's list on first sight, then extend it.
        utterances_by_speaker.setdefault(turn['speaker'], []).append(turn['utterance'])
    return utterances_by_speaker

In [None]:
def get_movie_metadata(name, script):
    """Assemble the movie-level metadata record for one title.

    Values come from the module-level ``movies`` table, using the first row
    whose title equals ``name``. An IndexError is raised by the lookups when
    no row matches.

    Args:
        name: Movie title, compared for equality against ``movie_titles_movies``.
        script: Path of the script file; its last four characters are dropped
            and the remaining base name is stored lower-cased under "script".

    Returns:
        A dict with the keys name, genres, script, id, release_yr, rating,
        review_score, budget, box_office, synopsis, num_awards,
        poster_image_url and slug.
    """
    row_selector = np.where(movie_titles_movies == name)

    def column_value(column):
        # First value of `column` among the rows selected for this title.
        return np.array(movies[column].tolist())[row_selector][0]

    title = column_value("title")
    genre_names = [entry["name"].lower() for entry in json.loads(column_value("genres"))]
    script_stem = script[:len(script) - 4]

    return {
        "name": title,
        "genres": genre_names,
        "script": script_stem.rsplit('/', 1)[-1].lower(),
        "id": int(column_value("id")),
        # Only the part of the release date before the first '-' is kept.
        "release_yr": column_value("release_date").split('-')[0],
        "rating": str(column_value("vote_average")),
        "review_score": "N/A",
        "budget": str(column_value("budget")),
        "box_office": str(column_value("revenue")),
        "synopsis": column_value("overview"),
        "num_awards": 0,
        "poster_image_url": "",
        # Slug: drop punctuation, lower-case, collapse whitespace/hyphen runs to '-'.
        "slug": re.sub(r'[-\s]+', '-', re.sub(r'[^\w\s-]', '', title).strip().lower()),
    }

In [None]:
def get_cast(name):
    """Return the raw "cast" cell of the credits table for a movie.

    Args:
        name: Movie title, compared for equality against ``movie_titles_creds``.

    Returns:
        The "cast" value of the first matching row (callers in this notebook
        decode it with ``json.loads``). Raises IndexError when no row matches.
    """
    matching_rows = np.where(movie_titles_creds == name)
    cast_column = np.array(credits["cast"].tolist())
    return cast_column[matching_rows][0]

In [None]:
def get_actor_metadata(cast):
    """Index the cast list of a movie by actor name.

    Args:
        cast: JSON string encoding a list of cast entries, each with the keys
            "name", "cast_id" and "character".

    Returns:
        A dict mapping actor name to ``{'actor_id': ..., 'char_name': ...}``.
        When a name occurs more than once, the last entry wins.
    """
    return {
        member['name']: {'actor_id': member['cast_id'], 'char_name': member['character']}
        for member in json.loads(cast)
    }

In [None]:
def get_gender_dict(cast, lines):
    """Match script speakers to cast entries and record the actor's gender.

    A speaker matches a cast entry when the speaker equals the entry's
    character name (case-insensitively) or when any whitespace-separated word
    of the character name occurs inside the speaker name. Each cast entry is
    used for at most one speaker; the first matching entry in cast order wins.

    Args:
        cast: JSON string encoding a list of cast entries with the keys
            "character", "name", "gender" and "cast_id".
        lines: Dict keyed by speaker name; only the keys are used here.

    Returns:
        A dict mapping character name to the tuple
        ``(speaker, actor_name, gender_label, cast_id)``.
    """
    remaining_cast = json.loads(cast)
    gender_dict = dict()

    for speaker in lines.keys():
        speaker_lc = speaker.lower()
        for member in remaining_cast:
            character_lc = member['character'].lower()
            # split() with no argument discards empty tokens. With split(' ')
            # a blank or double-spaced character name produced '' tokens, and
            # '' is a substring of every string, so such an entry matched
            # whichever speaker came first.
            tokens = character_lc.split()
            if speaker_lc == character_lc or any(token in speaker_lc for token in tokens):
                # Unknown gender codes are reported as "Other" rather than
                # aborting the whole movie with a KeyError.
                gender_label = gender_mapping.get(member['gender'], "Other")
                gender_dict[member['character']] = speaker, member['name'], gender_label, member['cast_id']
                # Safe while iterating only because we break immediately.
                remaining_cast.remove(member)
                break

    return gender_dict

In [None]:
def get_ethnicity_dict(genders):
    """Attach the listed ethnicity of each actor to the matched characters.

    Actors are looked up in the module-level ``actor_names`` list by their
    lower-cased name with spaces replaced by hyphens. Characters whose actor
    is not listed are left out of the result.

    Args:
        genders: Dict mapping character name to
            ``(speaker, actor_name, gender, actor_id)``.

    Returns:
        A dict mapping character name to ``(speaker, actor_name, ethnicity)``,
        where ethnicity is "N/A" when the stored value is not a string.
    """
    ethnicity_dict = {}
    for character, (speaker, name, gender, actor_id) in genders.items():
        actor_key = name.lower().replace(' ', '-')
        try:
            position = actor_names.index(actor_key)
        except ValueError:
            # Actor not present in the ethnicity table: skip the character.
            continue
        listed = actor_ethnicities[position]
        ethnicity_dict[character] = speaker, name, (listed if type(listed) == str else "N/A")

    return ethnicity_dict

In [None]:
def analyze_gender(lines, genders):
    """Compute the share of lines and of characters for each gender label.

    Args:
        lines: Dict mapping speaker to the list of that speaker's utterances.
        genders: Dict mapping character name to
            ``(speaker, actor_name, gender, actor_id)``.

    Returns:
        A pair ``(by_line, by_char)`` of dicts with exactly the keys "Male",
        "Female" and "Other". Each value is a fraction rounded to two
        decimals, or 0 when the label does not occur.
    """
    line_counts = {}
    char_counts = {}
    total_lines = 0
    total_chars = len(genders)

    for speaker, _actor, gender, _actor_id in genders.values():
        spoken = len(lines[speaker])
        total_lines += spoken
        line_counts[gender] = line_counts.get(gender, 0) + spoken
        char_counts[gender] = char_counts.get(gender, 0) + 1

    # Turn raw counts into rounded fractions of the respective totals.
    line_share = {label: round(count / total_lines, 2) for label, count in line_counts.items()}
    char_share = {label: round(count / total_chars, 2) for label, count in char_counts.items()}

    # Report a fixed set of labels, defaulting absent ones to 0.
    labels = ("Male", "Female", "Other")
    by_line = {label: line_share.get(label, 0) for label in labels}
    by_char = {label: char_share.get(label, 0) for label in labels}

    return by_line, by_char

In [None]:
def analyze_ethnicity(lines, ethnicities):
    """Compute the share of lines and of characters for each race.

    Every capitalised word of a character's ethnicity text is looked up in the
    module-level ``ethnicity_mapping`` array; the entry of ``race_mapping`` at
    the first matching position is the race for that word. A character mapped
    to several races contributes its lines and its character count to each of
    them, so the shares need not sum to 1.

    Args:
        lines: Dict mapping speaker to the list of that speaker's utterances.
        ethnicities: Dict mapping character name to
            ``(speaker, actor_name, ethnicity_text)``.

    Returns:
        A triple ``(by_line, by_char, race_dict)``. ``by_line`` and ``by_char``
        map race to a fraction rounded to two decimals; the denominators are
        the line total and the character count over all of ``ethnicities``,
        including characters without a mapped race. ``race_dict`` maps
        character name to its races joined by ", " in sorted order (empty
        string when none was found).
    """
    by_line = dict()
    by_char = dict()
    race_dict = dict()

    total_lines = 0
    total_chars = len(ethnicities)

    for char, (speaker, name, ethnicity) in ethnicities.items():
        num_lines = len(lines[speaker])
        total_lines += num_lines

        # Keep only words that start with an upper-case letter.
        words = [w for w in re.findall(r'[a-zA-Z]+', ethnicity) if w[0].isupper()]

        char_races = set()
        for word in words:
            # No try/except here: the previous handler named an undefined
            # exception class, so it could only turn a real error into a
            # NameError. Errors now propagate unchanged.
            matches = race_mapping[np.where(ethnicity_mapping == word)]
            if len(matches) > 0:
                char_races.add(matches[0])

        # Sort so that the joined string does not depend on set iteration
        # order, which varies between interpreter runs for strings.
        ordered_races = sorted(char_races)
        race_dict[char] = ", ".join(ordered_races)

        for race in ordered_races:
            by_line[race] = by_line.get(race, 0) + num_lines
            by_char[race] = by_char.get(race, 0) + 1

    for race in by_line:
        by_line[race] = round(by_line[race] / total_lines, 2)
        by_char[race] = round(by_char[race] / total_chars, 2)

    return by_line, by_char, race_dict

In [None]:
def analyze_screentime(lines, genders):
    """Estimate each actor's screen time as their share of spoken lines.

    Args:
        lines: Dict mapping speaker to the list of that speaker's utterances.
        genders: Dict mapping character name to
            ``(speaker, actor_name, gender, actor_id)``.

    Returns:
        A dict mapping actor name to the fraction of all counted lines spoken
        by that actor's character, rounded to two decimals.
    """
    line_counts = {}
    grand_total = 0
    for speaker, actor, _gender, _actor_id in genders.values():
        spoken = len(lines[speaker])
        grand_total += spoken
        line_counts[actor] = spoken

    return {actor: round(spoken / grand_total, 2) for actor, spoken in line_counts.items()}

In [None]:
def get_char_metadata(lines, genders, races, screen_time):
    """Build the per-actor character record of the output document.

    Args:
        lines: Not used by this function.
        genders: Dict mapping character name to
            ``(speaker, actor_name, gender, actor_id)``.
        races: Dict mapping character name to a race string; characters
            missing from it get "N/A".
        screen_time: Dict mapping actor name to a screen-time share.

    Returns:
        A dict mapping actor name to a dict with the keys actor_id (as a
        string), char_name (the capitalised speaker name), screen_time, race
        and gender.
    """
    return {
        name: {
            "actor_id": str(actor_id),
            "char_name": speaker.capitalize(),
            "screen_time": screen_time[name],
            "race": races.get(char, "N/A"),
            "gender": gender,
        }
        for char, (speaker, name, gender, actor_id) in genders.items()
    }

In [None]:
def get_distr_metadata(g_line, g_char, r_line, r_char):
    """Package the gender and race distributions for the output document.

    Args:
        g_line: Gender shares by line.
        g_char: Gender shares by character (stored under "by_movie").
        r_line: Race shares by line.
        r_char: Race shares by character (stored under "by_movie").

    Returns:
        A dict with the keys gender_dist, race_dist and stereotype_dist. The
        first two hold the inputs with lower-cased labels; stereotype_dist is
        a fixed placeholder.
    """
    def lower_keys(shares):
        # Same values, labels converted to lower case.
        return {label.lower(): share for label, share in shares.items()}

    return {
        "gender_dist": {"by_movie": lower_keys(g_char), "by_line": lower_keys(g_line)},
        "race_dist": {"by_movie": lower_keys(r_char), "by_line": lower_keys(r_line)},
        "stereotype_dist": {
            "by_movie": [["Stereotypical", 0], ["Not stereotypical", 1]],
            "by_line": [["Stereotypical", 0], ["Not stereotypical", 1]],
        },
    }

In [None]:
def get_metadata_json(movie, script_path):
    """
    Run the whole analysis for one movie and write the result as JSON.

    The output file is named [movie_slug].json and is placed in the directory
    of ``script_path`` with every "/scripts" replaced by "/parsed_scripts"
    (./data/parsed_scripts for scripts under ./data/scripts). The directory is
    created when it does not exist yet.

    movie: title used to look the movie up in the movie and credits tables.
    script_path: path of the screenplay text file.
    """
    movie_metadata = get_movie_metadata(movie, script_path)

    cast_json = get_cast(movie)
    actor_metadata = get_actor_metadata(cast_json)

    # Dialogue per speaker, then speaker -> cast matching and statistics.
    lines_by_speaker = get_lines(script_path)
    genders = get_gender_dict(cast_json, lines_by_speaker)
    ethnicity_info = get_ethnicity_dict(genders)
    gender_by_line, gender_by_char = analyze_gender(lines_by_speaker, genders)
    race_by_line, race_by_char, race_by_character = analyze_ethnicity(lines_by_speaker, ethnicity_info)
    screen_time = analyze_screentime(lines_by_speaker, genders)

    char_metadata = get_char_metadata(lines_by_speaker, genders, race_by_character, screen_time)
    distribution_metadata = get_distr_metadata(gender_by_line, gender_by_char, race_by_line, race_by_char)

    record = {
        "movie_metadata": movie_metadata,
        "actor_metadata": actor_metadata,
        "char_metadata": char_metadata,
        "distribution_metadata": distribution_metadata,
    }

    # Derive the output location from the script's directory.
    script_dir = os.path.dirname(script_path[:-4])
    out_path = script_dir.replace("/scripts", "/parsed_scripts") + "/%s.json" % movie_metadata["slug"]

    out_dir = os.path.dirname(out_path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(out_path, 'w+') as outfile:
        json.dump(record, outfile, indent=4, sort_keys=True)

In [None]:
# Names of all entries in the scripts directory; each is later turned into a
# movie title by parse_title() and joined back onto "./data/scripts/".
transcripts = os.listdir("./data/scripts")

def parse_title(title):
    """Turn a script file name into a movie title.

    The last four characters (the extension) are dropped and hyphens become
    spaces. A trailing ", The" is moved to the front, so "Abyss,-The.txt"
    becomes "The Abyss".
    """
    spaced = title[:-4].replace("-", " ")
    trailing_article = ", The"
    if spaced.endswith(trailing_article):
        return "The " + spaced[:-len(trailing_article)]
    return spaced

# Movie titles derived from the script file names (parallel to `transcripts`).
tran_movies = [parse_title(title) for title in transcripts]

with open('./data/movies.txt') as f:
    all_movies = f.read().splitlines()


def _normalize_title(title):
    """Lower-case a title and drop every non-word character for comparison."""
    return re.sub(r'\W+', '', title).lower()


# The previous pairwise comparison of all_movies with itself added every title
# (each title is a prefix of itself), so it was an O(n^2) way of building this
# set. The name is kept for any later cell that reads it.
valid_movies = set(all_movies)

# Normalise every known title once, in movies.txt order, so matching below is
# deterministic (iterating a set of strings is not stable between runs).
normalized_movies = [(_normalize_title(movie), movie) for movie in all_movies]

for tran_movie_index, tran_movie in enumerate(tran_movies):
    clean_tran_movie = _normalize_title(tran_movie)
    if clean_tran_movie == "":
        # An empty key is a prefix of every title; there is nothing to match on.
        continue

    # Prefer an exact match; otherwise fall back to the first title in
    # movies.txt that starts with the script's title. Without the exact-match
    # preference "Alien" could be paired with "Aliens".
    matching_movie = ""
    for clean_cred_movie, movie in normalized_movies:
        if clean_cred_movie == clean_tran_movie:
            matching_movie = movie
            break
        if matching_movie == "" and clean_cred_movie.startswith(clean_tran_movie):
            matching_movie = movie

    if matching_movie != "":
        SCRIPT_PATH = "./data/scripts/" + transcripts[tran_movie_index]
        MOVIE_NAME = matching_movie
        get_metadata_json(MOVIE_NAME, SCRIPT_PATH)