In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import urllib.request
import json
import copy
from slugify import slugify

%matplotlib inline

In [2]:
# Credits table; its "title", "cast" and "crew" columns are read by get_cast_and_crew.
credits = pd.read_csv('./data/movies.csv')
# Titles as a NumPy array so a row can be located with `np.where(array == name)`.
movie_titles_creds = np.array(credits["title"].tolist())
# Movie details table; get_movie_metadata reads title, genres, id, release_date,
# vote_average, budget, revenue and overview from it.
movies = pd.read_csv('./data/movies_data.csv')
movie_titles_movies = np.array(movies["title"].tolist())

# Label for each integer gender code carried by the cast/crew JSON records.
gender_mapping = {0 : "Other", 1 : "Female", 2 : "Male"}

# Header-less CSV: column 0 is searched with hyphenated lower-case actor names
# (see get_ethnicity_dict) and column 1 holds the ethnicity text for that row.
ethnicities = pd.read_csv('./data/ethnicelebs.csv', header = None)
actor_names = ethnicities[0].tolist()
actor_ethnicities = ethnicities[1].tolist()

# Parallel arrays: race_mapping[i] is the race assigned to ethnicity_mapping[i]
# (used by analyze_ethnicity).
races = pd.read_csv('./data/ethnicities_to_races.csv')
ethnicity_mapping = np.array(races["ethnicity"].tolist())
race_mapping = np.array(races["race"].tolist())

In [3]:
def parse_transcript(filename):
    """Parse a plain-text screenplay into an ordered list of speaker turns.

    The file is scanned twice.  The first pass collects character cues:
    upper-case lines (after removing parenthesised text) that do not look like
    scene headings, camera directions or transitions.  Only cues that occur at
    least twice are treated as speakers.  The second pass accumulates the text
    that follows each speaker cue into one utterance and emits it when the
    speaker changes, when a scene heading ("INT." / "EXT.") follows a
    non-empty utterance, when a "THE END" line is reached, or when the file
    ends with an utterance still pending.

    Parameters
    ----------
    filename : str
        Path of the screenplay text file.

    Returns
    -------
    list of dict
        One ``{'speaker': str, 'utterance': str}`` entry per turn, in script
        order.  Square-bracketed text is removed from utterances.  A
        "THE END" line always emits the current turn, even if it is empty.
    """
    # Raw strings throughout: "\s" inside a normal literal is an invalid escape.
    indent_regex = re.compile(r"^(\s*).*")
    paren_regex = re.compile(r'\([^()]*\)')
    bracket_regex = re.compile(r'\[[^()]*\]')

    # Substrings of the raw line that rule it out as a character cue.
    excluded_fragments = (
        "INT.", "EXT.", "--", "_", "- DAY", "INTERIOR", "EXTERIOR", "NSERT ",
        "BACK TO ", "ACTION ", "OMITTED", 'LATER THAT NIGHT -', "ANOTHER ANGLE",
        "IN THE CAR", "IN THE LOT", "ACROSS THE ", "THE END", "END CREDITS",
        "FADE OUT", ":", "!", "?", '"', "CLOSEUP",
    )
    # Whole (stripped) lines that are directions rather than cues.
    excluded_lines = ("CLOSE ON", "CUT TO", "NEW ANGLE", "ANGLE ON TV")

    def is_character_cue(raw_line, stripped_line, cue):
        """Return True when an upper-case line passes every exclusion rule."""
        if not cue.isupper() or cue.endswith("."):
            return False
        if stripped_line.endswith(" POV") or stripped_line in excluded_lines:
            return False
        return not any(fragment in raw_line for fragment in excluded_fragments)

    with open(filename, "r") as text_file:
        lines = text_file.readlines()

    # ---- Pass 1: find cues that appear at least twice -------------------
    candidate_cues = set()   # every accepted cue seen so far
    recurring_cues = set()   # cues seen again after being accepted
    seen_first_line = False
    for l in lines:
        li = l.strip(' \n\t\r')
        if li == "":
            continue
        if not seen_first_line:
            # The first non-blank line is never considered as a cue
            # (presumably the title line).
            seen_first_line = True
            continue
        c = paren_regex.sub('', li).strip(' \n\t\r')
        if c in candidate_cues:
            recurring_cues.add(c)
        elif is_character_cue(l, li, c):
            candidate_cues.add(c)

    # ---- Pass 2: attach dialogue to speakers ----------------------------
    transcript = []

    def emit(who, text):
        transcript.append({'speaker': who,
                           'utterance': bracket_regex.sub('', text.strip())})

    speaker = ""
    utterance = ""
    still_speaking = True
    second_time = False      # True right after the same speaker is cued again
    previous_spaces = 0      # indent of the first dialogue line after a cue

    for l in lines:
        li = l.strip(' \n\t\r')
        if li != "":
            spaces_number = len(indent_regex.search(l).group(1))
            ch = paren_regex.sub('', bracket_regex.sub('', li)).strip(' \n\t\r')
            if ch == "" or ch.startswith("(") or ch.endswith(")"):
                # Pure parenthetical / bracketed direction: ignore the line.
                pass
            elif (("INT." in l or "EXT." in l) and utterance != "") or li == "THE END":
                emit(speaker, utterance)
                speaker = ""
                utterance = ""
                second_time = False
            elif ch in recurring_cues:
                if utterance != "" and speaker != ch and speaker != "":
                    # A different speaker takes over: close the previous turn.
                    second_time = False
                    emit(speaker, utterance)
                    utterance = ""
                elif speaker == ch:
                    second_time = True
                speaker = ch
                still_speaking = True
                previous_spaces = 0
            elif still_speaking == True and speaker != "":
                # Only lines at the same indent as the first dialogue line
                # are appended to the utterance.
                if spaces_number == previous_spaces or previous_spaces == 0:
                    utterance += " " + li
                    previous_spaces = spaces_number
                second_time = False
        elif utterance == "" and speaker != "":
            still_speaking = True
        elif utterance != "" and second_time == False:
            # A blank line after dialogue ends the speech.
            still_speaking = False

    # Flush the turn still pending at end of file so the last speech of a
    # script that does not close with a scene heading or "THE END" is kept.
    if utterance != "" and speaker != "":
        emit(speaker, utterance)

    return transcript

In [4]:
def get_lines(script):
    """Group the utterances parsed from `script` by speaker.

    Returns a dict mapping each speaker name to the list of that speaker's
    utterances, in the order parse_transcript produced them.
    """
    lines_by_speaker = dict()
    for turn in parse_transcript(script):
        lines_by_speaker.setdefault(turn['speaker'], []).append(turn['utterance'])
    return lines_by_speaker

In [5]:
def get_movie_metadata(name, script):
    """Collect descriptive metadata for one movie.

    Values come from the module-level `movies` table (row located through
    `movie_titles_movies`) and, on a best-effort basis, from the OMDb web API
    using the module-level `MY_KEY`.

    Parameters
    ----------
    name : str
        Movie title exactly as it appears in the `movies` table.
    script : str
        Path of the script file; its base name without the last four
        characters, lower-cased, is stored under "script".

    Returns
    -------
    dict
        Keys: name, genres, script, id, release_yr, rating, budget,
        box_office, synopsis, num_awards, review_score, poster_image_url,
        imdb_url, slug.  The three OMDb-derived values are "N/A" when the
        lookup fails for any reason.

    Raises
    ------
    IndexError
        If `name` is not present in the `movies` table.
    """
    from urllib.parse import urlencode

    movie_num = np.where(movie_titles_movies == name)

    def field(column):
        # Same lookup for every column: first row whose title equals `name`.
        return np.array(movies[column].tolist())[movie_num][0]

    metadata = dict()
    metadata["name"] = field("title")
    metadata["genres"] = [x["name"].lower() for x in json.loads(field("genres"))]
    fname = script[:len(script)-4]
    metadata["script"] = fname[fname.rfind('/')+1:].lower()
    metadata["id"] = int(field("id"))
    release_year = field("release_date").split('-')[0]
    metadata["release_yr"] = release_year
    metadata["rating"] = str(field("vote_average"))
    metadata["budget"] = str(field("budget"))
    metadata["box_office"] = str(field("revenue"))
    metadata["synopsis"] = field("overview")
    metadata["num_awards"] = 0

    # Let the standard library percent-encode the query: it handles every
    # reserved character (":" is %3A, not %3C) and turns spaces into "+".
    query = urlencode({"apikey": MY_KEY,
                       "t": metadata["name"].lower(),
                       "y": release_year})
    omdb_request = "http://omdbapi.com/?" + query

    try:
        # The timeout keeps one stalled request from hanging a batch run.
        with urllib.request.urlopen(omdb_request, timeout=10) as url:
            omdb_results = json.loads(url.read().decode())
        review_score = omdb_results["imdbRating"]
        poster_image_url = omdb_results["Poster"]
        imdb_url = "https://www.imdb.com/title/" + omdb_results["imdbID"]
    except Exception:
        # Best effort: network errors, malformed JSON and "not found" replies
        # all fall back to "N/A".  KeyboardInterrupt/SystemExit still propagate.
        review_score = poster_image_url = imdb_url = "N/A"
    metadata["review_score"] = review_score
    metadata["poster_image_url"] = poster_image_url
    metadata["imdb_url"] = imdb_url

    metadata["slug"] = re.sub(r'[-\s]+', '-', (re.sub(r'[^\w\s-]', '',metadata["name"]).strip().lower()))

    return metadata

In [6]:
def get_cast_and_crew(name):
    """Return the raw "cast" and "crew" column values for the movie titled `name`.

    The row is the first one in the module-level `credits` table whose title
    equals `name`; an IndexError is raised when there is none.
    """
    row_selector = np.where(movie_titles_creds == name)

    def first_match(column):
        return np.array(credits[column].tolist())[row_selector][0]

    cast = first_match("cast")
    crew = first_match("crew")
    return cast, crew

In [7]:
def get_actor_metadata(cast):
    """Index a JSON-encoded cast list by actor name.

    `cast` is a JSON string holding a list of records with "name", "cast_id"
    and "character" fields.  The result maps each name to
    ``{'actor_id': cast_id, 'char_name': character}``; when a name occurs
    more than once the last record wins.
    """
    return {
        entry['name']: {'actor_id' : entry['cast_id'], 'char_name' : entry['character']}
        for entry in json.loads(cast)
    }

In [8]:
def get_crew_metadata(crew):
    """Index a JSON-encoded crew list by person name.

    `crew` is a JSON string holding a list of records with "name", "id" and
    "job" fields.  The result maps each name to
    ``{'crew_id': id, 'job_name': job}``; when a name occurs more than once
    the last record wins.
    """
    return {
        entry['name']: {'crew_id' : entry['id'], 'job_name' : entry['job']}
        for entry in json.loads(crew)
    }

In [9]:
def hashfeatures(baby, B, FIX):
    """Hash the prefixes and suffixes of a name into a binary feature vector.

    For every m in ``range(FIX)`` the strings ``"prefix" + baby[:m]`` and
    ``"suffix" + baby[-m:]`` are hashed into one of `B` buckets and that
    bucket is set to 1.  Note that for m == 0 the slice ``baby[-0:]`` is the
    whole name, so the full name is one of the suffix features.

    A CRC-32 checksum is used instead of the builtin ``hash``: string hashes
    are salted per interpreter process, which would make the feature vectors
    (and any model trained on them) differ between kernel restarts.

    Parameters
    ----------
    baby : str
        The name to featurise.
    B : int
        Number of hash buckets (length of the returned vector).
    FIX : int
        Number of prefix/suffix lengths to hash (0 .. FIX-1).

    Returns
    -------
    numpy.ndarray
        Vector of length `B` containing 0.0 / 1.0.
    """
    import zlib

    v = np.zeros(B)
    for m in range(FIX):
        for featurestring in ("prefix" + baby[:m], "suffix" + baby[-1*m:]):
            v[zlib.crc32(featurestring.encode("utf-8")) % B] = 1
    return v

def name2features(filename, B=104729, FIX=5, LoadFile=True):
    """Turn a list of names into a matrix of hashed prefix/suffix features.

    Parameters
    ----------
    filename : str
        With ``LoadFile=True``: path of a text file with one name per line;
        trailing whitespace is stripped and blank lines are skipped.
        With ``LoadFile=False``: the names themselves, separated by newlines
        (an empty string yields a single empty name).
    B : int
        Dimensionality of each feature vector.
    FIX : int
        Number of prefix/suffix lengths hashed per name.
    LoadFile : bool
        Selects how `filename` is interpreted (see above).

    Returns
    -------
    numpy.ndarray
        Array of shape (n, B), one row per name.
    """
    if LoadFile:
        with open(filename, 'r') as f:
            # Strip first, then filter: a raw line always contains at least
            # its newline, so testing its length never removes blank lines.
            babynames = [name for name in (line.rstrip() for line in f) if name]
    else:
        babynames = filename.split('\n')

    n = len(babynames)

    X = np.zeros((n, B))
    for i, name in enumerate(babynames):
        X[i,:] = hashfeatures(name, B, FIX)
    return X

def genTrainFeatures(dimension=128, fix=3):
    """Build a shuffled training set from "./girls.train" and "./boys.train".

    Both files are featurised with name2features using `dimension` hash
    buckets and `fix` prefix/suffix lengths.

    Output:
    x: n feature vectors of dimensionality d, shape (n, d)
    y: n labels (-1 = girl, +1 = boy), in the same shuffled order as x
    """
    girl_features = name2features("./girls.train", B=dimension, FIX=fix)
    boy_features = name2features("./boys.train", B=dimension, FIX=fix)

    features = np.concatenate([girl_features, boy_features])
    labels = np.concatenate([-np.ones(len(girl_features)), np.ones(len(boy_features))])

    # One random ordering applied to rows and labels alike.
    order = np.random.permutation(np.arange(len(labels)))
    return features[order, :], labels[order]

# Initial training set with 128 hashed dimensions; X and Y are reassigned
# further down by `X,Y = genTrainFeatures(DIMS)`.
X,Y = genTrainFeatures(128)

def naivebayesPY(x,y):
    """
    function [pos,neg] = naivebayesPY(x,y);

    Estimate the class priors P(Y) with +1 smoothing: one extra example of
    each class is added before counting.
    Input:
        x : n input vectors of d dimensions (nxd) -- not used here
        y : n labels (-1 or +1) (nx1)

    Output:
    pos: probability p(y=1)
    neg: probability p(y=-1)
    """
    smoothed_labels = np.concatenate([y, [-1,1]])
    total = len(smoothed_labels)
    # np.unique returns the labels sorted, so index 0 is -1 and index 1 is +1.
    _, label_counts = np.unique(smoothed_labels, return_counts=True)
    neg, pos = label_counts[0] / total, label_counts[1] / total
    return pos, neg

# Smoothed class priors on the current training set; `pos` / `neg` are not
# read again in the code shown here.
pos,neg = naivebayesPY(X,Y)

def naivebayesPXY(x,y):
    """
    function [posprob,negprob] = naivebayesPXY(x,y);

    Estimate P(X|Y) with +1 smoothing: an all-ones example is added to each
    class, then the per-feature sums of each class are divided by the sum
    over all features of that class.
    Input:
        x : n input vectors of d dimensions (nxd)
        y : n labels (-1 or +1) (nx1)

    Output:
    posprob: probability vector of p(x|y=1) (1xd)
    negprob: probability vector of p(x|y=-1) (1xd)
    """
    _, d = x.shape
    x_smoothed = np.concatenate([x, np.ones((2,d))])
    y_smoothed = np.concatenate([y, [-1,1]])

    def feature_distribution(label):
        feature_totals = np.sum(x_smoothed[y_smoothed == label], axis=0)
        return np.array([feature_totals / np.sum(feature_totals)])

    posprob = feature_distribution(1)
    negprob = feature_distribution(-1)
    return posprob, negprob
    

# Smoothed per-class feature distributions on the current training set;
# `posprob` / `negprob` are not read again in the code shown here.
posprob,negprob = naivebayesPXY(X,Y)

def naivebayes(x,y,xtest):
    """
    function logratio = naivebayes(x,y);

    Log-odds of the positive class for one test vector under the Naive Bayes
    model estimated from (x, y).
    Input:
    x : n input vectors of d dimensions (nxd)
    y : n labels (-1 or +1)
    xtest: input vector of d dimensions (1xd)

    Output:
    logratio: log (P(Y = 1|X=xtest)/P(Y=-1|X=xtest))
    """
    prior_pos, prior_neg = naivebayesPY(x,y)
    cond_pos, cond_neg = naivebayesPXY(x,y)

    pos_score = np.sum(xtest * np.log(cond_pos)) + np.log(prior_pos)
    neg_score = np.sum(xtest * np.log(cond_neg)) + np.log(prior_neg)
    return pos_score - neg_score

# Log-odds for the first training row; `p` is not read again in the code shown here.
p = naivebayes(X,Y,X[0,:])

def naivebayesCL(x,y):
    """
    function [w,b]=naivebayesCL(x,y);
    Express the Naive Bayes model estimated from (x, y) as a linear classifier.
    Input:
    x : n input vectors of d dimensions (nxd)
    y : n labels (-1 or +1)

    Output:
    w : weight vector, log p(x|y=1) - log p(x|y=-1), shape (1xd)
    b : bias (scalar), log p(y=1) - log p(y=-1)
    """
    # Unpacking the shape makes non-2-D input fail before any estimation.
    _rows, _cols = x.shape
    prior_pos, prior_neg = naivebayesPY(x,y)
    cond_pos, cond_neg = naivebayesPXY(x,y)

    w = np.log(cond_pos) - np.log(cond_neg)
    b = np.log(prior_pos) - np.log(prior_neg)
    return w,b
    
# Linear form of the classifier on the current training set; w and b are
# reassigned further down by `w,b=naivebayesCL(X,Y)`.
w,b = naivebayesCL(X,Y)

def classifyLinear(x,w,b=0):
    """
    function preds=classifyLinear(x,w,b)

    Predict labels with a linear classifier: sign(w . x_i + b), where a
    score of exactly zero is assigned to the negative class.
    Input:
    x : n input vectors of d dimensions (nxd)
    w : weight vector (any shape with d elements; it is flattened)
    b : bias (scalar)

    Output:
    preds: predictions (-1 or +1), one per row of x
    """
    flat_w = w.reshape(-1)
    scores = np.dot(flat_w, x.T) + b
    signs = np.sign(scores)
    return np.where(signs == 0, -1, signs)

# Number of hash buckets; classify_name featurises with the same value.
DIMS = 128
# Rebuild the shuffled training set and fit the model. classify_name reads
# the module-level w and b assigned here.
X,Y = genTrainFeatures(DIMS)
w,b=naivebayesCL(X,Y)
# Fraction of training names whose predicted label differs from the true label.
error = np.mean(classifyLinear(X,w,b) != Y)

def classify_name(name):
    """Guess "Male" or "Female" for `name` with the trained linear classifier.

    The name is capitalised (first letter upper-case, the rest lower-case),
    hashed into DIMS features and scored with the module-level w and b.  A
    positive prediction means "Male", anything else "Female".
    """
    features = name2features(name.capitalize(), B = DIMS, LoadFile = False)
    prediction = classifyLinear(features,w,b)[0]
    return "Male" if prediction > 0 else "Female"

In [10]:
def get_gender_dict(cast, lines):
    """Match script speakers to cast members and assign each a gender.

    A speaker matches a cast record when the speaker name equals the record's
    character name (case-insensitively) or when any non-empty word of the
    character name occurs inside the speaker name.  Each cast record is used
    for at most one speaker.

    The gender of a matched speaker is the cast record's gender; when that is
    "Other" the name classifier's guess for the character name is used
    instead.  (A majority vote between the character-name guess, the
    speaker-name guess and an "Other" cast label can only agree with, or tie
    in favour of, the character-name guess, so that guess is used directly.)
    Unmatched speakers are classified from the speaker name alone.

    Parameters
    ----------
    cast : str
        JSON list of cast records with "character", "name", "gender" and
        "cast_id" fields.
    lines : dict
        Mapping whose keys are the speaker names found in the script.

    Returns
    -------
    (dict, dict)
        First dict: character name -> (speaker, actor name, gender, cast_id)
        for matched speakers only.  Second dict: the same entries plus
        speaker -> (speaker, 'N/A', gender, 'N/A') for unmatched speakers.
    """
    char_list = json.loads(cast)
    gender_dict = dict()
    classified_gender_dict = dict()

    for speaker in lines.keys():
        speaker_lc = speaker.lower()
        matched = None
        for d in char_list:
            character_lc = d['character'].lower()
            # Drop empty words: an empty character field or a double space
            # would otherwise match every speaker ('' is in any string).
            words = [word for word in character_lc.split(' ') if word]
            if speaker_lc == character_lc or any(word in speaker_lc for word in words):
                matched = d
                break

        if matched is None:
            classified_gender_dict[speaker] = speaker, 'N/A', classify_name(speaker), 'N/A'
            continue

        cast_gender = gender_mapping[matched['gender']]
        if cast_gender != "Other":
            overall_gender = cast_gender
        else:
            overall_gender = classify_name(matched['character'])

        gender_dict[matched['character']] = speaker, matched['name'], overall_gender, matched['cast_id']
        # Each cast record may be claimed by one speaker only.
        char_list.remove(matched)

    # Values are immutable tuples, so a shallow copy is a full copy here.
    new_gender_dict = dict(gender_dict)
    new_gender_dict.update(classified_gender_dict)

    return gender_dict, new_gender_dict

In [11]:
def get_crew_gender_dict(crew):
    """Map each crew member's name to a gender label.

    `crew` is a JSON list of records with "name" and "gender" fields.  The
    recorded gender is used unless it maps to "Other", in which case the
    name classifier's guess is used.  Later records overwrite earlier ones
    with the same name.
    """
    gender_dict = dict()

    for member in json.loads(crew):
        name = member['name']
        recorded = gender_mapping[member['gender']]
        guessed = classify_name(name)
        gender_dict[name] = guessed if recorded == "Other" else recorded

    return gender_dict

In [12]:
def get_ethnicity_dict(genders):
    """Attach an ethnicity string to every character whose actor is known.

    `genders` maps character -> (speaker, actor name, gender, actor id).  The
    actor name is lower-cased, its spaces replaced by hyphens, and looked up
    in the module-level `actor_names`; characters whose actor is not listed
    are left out.  A non-string ethnicity value is replaced by "N/A".

    Returns a dict mapping character -> (speaker, actor name, ethnicity).
    """
    ethnicity_dict = dict()
    for character, (speaker, name, gender, actor_id) in genders.items():
        actor_key = name.lower().replace(' ', '-')
        try:
            row = actor_names.index(actor_key)
        except ValueError:
            # Actor not present in the ethnicity table: skip the character.
            continue
        actor_ethnicity = actor_ethnicities[row]
        if type(actor_ethnicity) is not str:
            actor_ethnicity = "N/A"
        ethnicity_dict[character] = speaker, name, actor_ethnicity

    return ethnicity_dict

In [13]:
def analyze_gender(lines, genders):
    """Compute the gender split of a script by spoken lines and by characters.

    Parameters
    ----------
    lines : dict
        speaker -> list of that speaker's utterances.
    genders : dict
        character -> (speaker, actor name, gender, actor id).

    Returns
    -------
    (dict, dict)
        ``by_line`` and ``by_char``, each with exactly the keys "Male",
        "Female" and "Other".  Values are fractions rounded to two decimals;
        a gender that does not occur, or a zero denominator, yields 0.
    """
    line_counts = dict()
    char_counts = dict()
    total_lines = 0
    total_chars = len(genders)

    for speaker, _name, gender, _actor_id in genders.values():
        num_lines = len(lines[speaker])
        total_lines += num_lines
        line_counts[gender] = line_counts.get(gender, 0) + num_lines
        char_counts[gender] = char_counts.get(gender, 0) + 1

    def share(counts, label, total):
        # Missing label or empty denominator both mean "no share".
        if label not in counts or total == 0:
            return 0
        return round(counts[label] / total, 2)

    labels = ("Male", "Female", "Other")
    by_line = {label: share(line_counts, label, total_lines) for label in labels}
    by_char = {label: share(char_counts, label, total_chars) for label in labels}

    return by_line, by_char

In [14]:
def analyze_crew_gender(crew_genders):
    """Compute the gender split of the crew.

    `crew_genders` maps crew member name -> gender label.  The result has
    exactly the keys "Male", "Female" and "Other"; each value is the fraction
    of crew members with that label rounded to two decimals, or 0 when the
    label does not occur.
    """
    total_crew = len(crew_genders)
    tallies = dict()
    for gender in crew_genders.values():
        tallies[gender] = tallies.get(gender, 0) + 1

    return {
        label: (round(tallies[label] / total_crew, 2) if label in tallies else 0)
        for label in ("Male", "Female", "Other")
    }

In [15]:
def analyze_ethnicity(lines, ethnicities):
    """Compute the race split of a script by spoken lines and by characters.

    Every capitalised word of a character's ethnicity string is looked up in
    the module-level `ethnicity_mapping` / `race_mapping` arrays; the first
    race found for each word is kept.  A character with several races counts
    towards each of them, so the shares may add up to more than 1.

    Parameters
    ----------
    lines : dict
        speaker -> list of that speaker's utterances.
    ethnicities : dict
        character -> (speaker, actor name, ethnicity string).

    Returns
    -------
    (dict, dict, dict)
        ``by_line`` (race -> share of lines), ``by_char`` (race -> share of
        characters), both rounded to two decimals, and ``race_dict``
        (character -> comma-separated races in order of first appearance,
        "" when none was found).
    """
    by_line = dict()
    by_char = dict()
    race_dict = dict()

    total_lines = 0
    total_chars = len(ethnicities)

    for char, (speaker, name, ethnicity) in ethnicities.items():
        num_lines = len(lines[speaker])
        total_lines += num_lines

        words = [word for word in re.findall(r'[a-zA-Z]+', ethnicity) if word[0].isupper()]

        # A list keeps first-seen order so the joined string is the same on
        # every run (set iteration order of strings varies per process).
        char_races = []
        for word in words:
            matches = race_mapping[np.where(ethnicity_mapping == word)]
            if len(matches) > 0 and matches[0] not in char_races:
                char_races.append(matches[0])

        race_dict[char] = ", ".join(char_races)

        for race in char_races:
            by_line[race] = by_line.get(race, 0) + num_lines
            by_char[race] = by_char.get(race, 0) + 1

    for race in by_line.keys():
        by_line[race] = round(by_line[race] / total_lines, 2) if total_lines else 0
        by_char[race] = round(by_char[race] / total_chars, 2) if total_chars else 0

    return by_line, by_char, race_dict

In [16]:
def analyze_screentime(lines, genders):
    """Compute each actor's share of the spoken lines.

    `lines` maps speaker -> list of utterances and `genders` maps
    character -> (speaker, actor name, gender, actor id).  The result maps
    actor name -> that character's line count divided by the line count of
    all characters in `genders`, rounded to two decimals.  When an actor name
    occurs for several characters the last character's count is kept.
    """
    line_counts = dict()
    total_lines = 0
    for speaker, name, _, _ in genders.values():
        spoken = len(lines[speaker])
        total_lines += spoken
        line_counts[name] = spoken

    return {name: round(spoken / total_lines, 2) for name, spoken in line_counts.items()}

In [17]:
def get_char_metadata(lines, genders, races, screen_time):
    """Assemble the per-actor character record used in the output JSON.

    `genders` maps character -> (speaker, actor name, gender, actor id),
    `races` maps character -> race string and `screen_time` maps actor
    name -> share of lines.  `lines` is accepted but not used.

    Returns a dict keyed by actor name with the fields "actor_id" (as a
    string), "char_name" (capitalised speaker), "screen_time", "race"
    ("N/A" when the character has no entry in `races`) and "gender".
    """
    return {
        name: {
            "actor_id" : str(actor_id),
            "char_name" : speaker.capitalize(),
            "screen_time" : screen_time[name],
            "race" : races.get(char, "N/A"),
            "gender" : gender,
        }
        for char, (speaker, name, gender, actor_id) in genders.items()
    }

In [18]:
def get_distr_metadata(g_line, g_char, g_crew, r_line, r_char):
    """Bundle the distribution dictionaries into the output JSON layout.

    Gender inputs (`g_line`, `g_char`, `g_crew`) and race inputs (`r_line`,
    `r_char`) are copied with lower-cased keys under "gender_dist" and
    "race_dist".  "stereotype_dist" is a fixed placeholder.
    """
    def lower_keys(mapping):
        return {key.lower(): value for key, value in mapping.items()}

    gender_dist = {
        "by_movie": lower_keys(g_char),
        "by_line": lower_keys(g_line),
        "by_crew" : lower_keys(g_crew),
    }
    race_dist = {
        "by_movie": lower_keys(r_char),
        "by_line": lower_keys(r_line),
    }
    stereotype_dist = {
        "by_movie" : [["Stereotypical", 0], ["Not stereotypical", 1]],
        "by_line" : [["Stereotypical", 0], ["Not stereotypical", 1]],
    }
    return {
        "gender_dist": gender_dist,
        "race_dist": race_dist,
        "stereotype_dist": stereotype_dist,
    }

In [19]:
def get_metadata_json(movie, script_path):
    """Analyse one movie script and write the resulting metadata as JSON.

    The same JSON document is written to two places:

    * ``<script dir with "/scripts" replaced by "/parsed_scripts">/<slug>.json``
    * ``../app/data/movies/<slug>.json``

    Both directories are created when missing.  The document is serialised
    once, before any file is opened, so a serialisation error leaves no
    partial output behind.

    Parameters
    ----------
    movie : str
        Movie title as it appears in the movie and credits tables.
    script_path : str
        Path of the script text file; the last four characters are treated
        as the extension.
    """
    movie_metadata = get_movie_metadata(movie, script_path)

    movie_cast, movie_crew = get_cast_and_crew(movie)
    actor_metadata = get_actor_metadata(movie_cast)
    crew_metadata = get_crew_metadata(movie_crew)

    line_dict = get_lines(script_path)
    # gender_dict holds cast-matched characters only; gender_dict_by_lines
    # additionally contains speakers that matched no cast member.
    gender_dict, gender_dict_by_lines = get_gender_dict(movie_cast, line_dict)
    crew_gender_dict = get_crew_gender_dict(movie_crew)
    ethnicity_dict = get_ethnicity_dict(gender_dict)
    gender_by_line, gender_by_char = analyze_gender(line_dict, gender_dict_by_lines)
    gender_by_crew = analyze_crew_gender(crew_gender_dict)
    race_by_line, race_by_char, race_dict = analyze_ethnicity(line_dict, ethnicity_dict)
    screen_time = analyze_screentime(line_dict, gender_dict)

    char_metadata = get_char_metadata(line_dict, gender_dict, race_dict, screen_time)
    distribution_metadata = get_distr_metadata(gender_by_line, gender_by_char, gender_by_crew, race_by_line, race_by_char)

    metadata = {"movie_metadata" : movie_metadata,
                "actor_metadata" : actor_metadata,
                "crew_metadata" : crew_metadata,
                "char_metadata" : char_metadata,
                "distribution_metadata" : distribution_metadata}

    payload = json.dumps(metadata, indent=4, sort_keys=True)

    slug = movie_metadata["slug"]
    parsed_dir = os.path.dirname(script_path[:-4]).replace("/scripts", "/parsed_scripts")
    output_paths = [
        parsed_dir + "/%s.json" % slug,
        # also write to app data
        "../app/data/movies/%s.json" % slug,
    ]

    for output_path in output_paths:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w') as outfile:
            outfile.write(payload)

In [20]:
# OMDb API key read by get_movie_metadata.  The OMDB_API_KEY environment
# variable takes precedence over the literal fallback.
# NOTE(review): the fallback is a credential committed to source -- rotate it
# and remove the literal once the environment variable is set everywhere.
MY_KEY = os.environ.get("OMDB_API_KEY", "cef453b6")

# Script file names, sorted so runs process them in a stable order
# (os.listdir returns entries in arbitrary order).
transcripts = sorted(os.listdir("./data/scripts"))

def parse_title(title):
    """Turn a script file name into a readable movie title.

    The last four characters (the extension) are dropped, hyphens become
    spaces, and a trailing ", The" is moved to the front as "The ".
    """
    readable = title[:-4].replace("-", " ")
    if readable.endswith(", The"):
        return "The " + readable[:-5]
    return readable

# Readable titles derived from the script file names, index-aligned with `transcripts`.
tran_movies = [parse_title(title) for title in transcripts]

with open('./data/movies.txt') as f:
    all_movies = f.read().splitlines()


def _normalize_title(title):
    """Lower-case `title` and drop every non-word character for fuzzy matching."""
    return re.sub(r'\W+', '', title).lower()


# Every listed movie is a candidate (each title trivially prefix-matches itself).
valid_movies = set(all_movies)

# Candidates in file order, without duplicates, paired with their normalised
# form so the normalisation is done once instead of once per comparison.
_candidates = [(movie, _normalize_title(movie)) for movie in dict.fromkeys(all_movies)]

for tran_movie_index, tran_title in enumerate(tran_movies):
    clean_tran_movie = _normalize_title(tran_title)
    if clean_tran_movie == "":
        # An empty title is a prefix of everything; never treat it as a match.
        continue

    # Prefer an exact title; otherwise take the first movie (in file order)
    # whose normalised title starts with the script's title.  Iterating a set
    # here would make the choice depend on per-process hash ordering.
    matching_movie = ""
    for movie, clean_cred_movie in _candidates:
        if clean_cred_movie == clean_tran_movie:
            matching_movie = movie
            break
        if matching_movie == "" and clean_cred_movie.startswith(clean_tran_movie):
            matching_movie = movie

    if matching_movie != "":
        SCRIPT_PATH = "./data/scripts/" + transcripts[tran_movie_index]
        MOVIE_NAME = matching_movie
        get_metadata_json(MOVIE_NAME, SCRIPT_PATH)

KeyboardInterrupt: 