In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import urllib.request
import json
import copy
from slugify import slugify
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import csv
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from empath import Empath

%matplotlib inline

In [2]:
lexicon = Empath()

lexicon.create_category("fear", ["fear"], model="fiction")
lexicon.create_category("anger", ["anger"], model="fiction")
lexicon.create_category("sadness", ["sadness"], model="fiction")
lexicon.create_category("joy", ["joy"], model="fiction")
lexicon.create_category("disgust", ["disgust"], model="fiction")
lexicon.create_category("surprise", ["surprise"], model="fiction")
lexicon.create_category("shame", ["shame"], model="fiction")
lexicon.create_category("envy", ["envy"], model="fiction")
lexicon.create_category("love", ["love"], model="fiction")

emotion_categories = ["fear", "anger", "sadness", "joy", "disgust", "surprise", "shame", "envy", "love"]

["fear", "terror", "Fear", "panic", "desperation", "trepidation", "anguish", "apprehension", "uncertainty", "dread", "Fear", "fury", "fright", "rage", "despair", "guilt", "anger", "revulsion", "anxiety", "sadness", "pure_fear", "anxiousness", "helplessness", "fear", "unease", "sorrow", "grief", "elation", "hatred", "resentment", "bloodlust", "horror", "insecurity", "vulnerability", "pure_terror", "emotion", "malice", "longing", "remorse", "hostility", "uneasiness", "Terror", "regret", "slight_fear", "wariness", "aggression", "hopelessness", "determination", "loathing", "devastation", "exhilaration", "agitation", "pain", "hysteria", "madness", "bravado", "nervousness", "disappointment", "sudden_fear", "curiousity", "betrayal", "intensity", "Anxiety", "desire", "turmoil", "Shock", "shock", "weariness", "agony", "curiosity", "contempt", "emotions", "defiance", "paranoia", "outrage", "own_fear", "calmness", "so_much_fear", "humiliation", "Guilt", "entire_being", "whole_being", "bitterness"

In [3]:
credits = pd.read_csv('./data/movies.csv')
movie_titles_creds = np.array(credits["title"].tolist())
movies = pd.read_csv('./data/movies_data.csv')
movie_titles_movies = np.array(movies["title"].tolist())
metadata_csv = pd.read_csv('./data/metadata.csv')
movie_titles_metadata = np.array(metadata_csv["title"].tolist())
abs_path = os.path.abspath("mydir/myfile.txt")
new_path = abs_path[:abs_path.find('tools')] + 'app/data/movies/'

gender_mapping = {0 : "Other", 1 : "Female", 2 : "Male"}

ethnicities = pd.read_csv('./data/ethnicelebs.csv', header = None)
actor_names = ethnicities[0].tolist()
actor_ethnicities = ethnicities[1].tolist()

races = pd.read_csv('./data/ethnicities_to_races.csv')
ethnicity_mapping = np.array(races["ethnicity"].tolist())
race_mapping = np.array(races["race"].tolist())

In [4]:
def parse_transcript(filename):

    spaces_regex = re.compile("^(\s*).*")
    location_regex = re.compile("^\s*(INT\.|EXT\.)")
    line_list = []
    transcript = []
    characters = []
    characters2 = []

    text_file = open(filename, "r")
    lines = text_file.readlines()
    text_file.close()

    num_lines = 0;
    for l in lines:
        li = l.strip(' \n\t\r')
        if li != "" and num_lines != 0:
            c = re.sub(r'\([^()]*\)', '', li).strip(' \n\t\r')
            if (c in characters) and (c not in characters2):
                characters2.append(c)
            if c.isupper() and (not li.endswith(" POV")) and ("INT." not in l) and ("EXT." not in l) and ("--" not in l) and ("_" not in l) and ("- DAY" not in l) and ("INTERIOR" not in l) and ("CLOSE ON" != li) and ("CUT TO" != li) and ("EXTERIOR" not in l) and ("NSERT " not in l) and ("BACK TO " not in l) and ("ACTION " not in l) and ("OMITTED" not in l) and ('LATER THAT NIGHT -' not in l) and ("ANOTHER ANGLE" not in l) and ("IN THE CAR" not in l) and ("IN THE LOT" not in l) and ("ACROSS THE " not in l) and ("THE END" not in l) and ("END CREDITS" not in l) and ("FADE OUT" not in l) and (":" not in l) and ("!" not in l) and ("?" not in l) and ('"' not in l) and ("NEW ANGLE" != li) and ("CLOSEUP" not in l) and ("ANGLE ON TV" != li) and (not c.endswith(".")) and (c not in characters):
                characters.append(c)
        elif li != "": 
            num_lines = 1

    speaker = "";
    utterance = "";
    still_speaking = True
    second_time = False
    previous_spaces = 0

    for l in lines:
        li = l.strip(' \n\t\r')
        if li != "": 
            spmatch = spaces_regex.search(l)
            spaces_number = len(spmatch.group(1))
            ch = re.sub(r'\([^()]*\)', '', re.sub(r'\[[^()]*\]', '', li)).strip(' \n\t\r')
            if ch == "" or ch.startswith("(") or ch.endswith(")"):
                ch = ""
            elif (("INT." in l or "EXT." in l) and utterance != "") or li == "THE END":
                transcript.append({'speaker': speaker, 'utterance': re.sub(r'\[[^()]*\]', '', utterance.strip())})
                speaker = ""
                utterance = ""
                second_time = False 
            elif ch in characters2:
                if utterance != "" and speaker != ch and speaker != "":
                    second_time = False
                    transcript.append({'speaker': speaker, 'utterance': re.sub(r'\[[^()]*\]', '', utterance.strip())})
                    utterance = ""
                elif speaker == ch:
                    second_time = True
                speaker = ch;
                still_speaking = True
                previous_spaces = 0
            elif still_speaking == True and speaker != "":
                if (spaces_number == previous_spaces or previous_spaces == 0):
                    utterance += " " + li
                    previous_spaces = spaces_number;
                second_time = False                
        elif utterance == "" and speaker != "":
            still_speaking = True
        elif utterance != "" and second_time == False:
            still_speaking = False
    return transcript

In [5]:
def get_lines(transcript):
    line_dict = dict()
    
    for i in np.arange(len(transcript)):
        speaker = transcript[i]['speaker']
        line = transcript[i]['utterance']
        if speaker in line_dict.keys():
            line_dict[speaker] += [line]
        else:
            line_dict[speaker] = [line]
    
    return line_dict

In [6]:
def get_reviews(imdb_id):
    REVIEWS_KEY = "b0d4c725e171d3cb40ded4d9ce2989b7"
    movie_id_request = "https://api.themoviedb.org/3/find/" + imdb_id + "?api_key=" + REVIEWS_KEY  + "&external_source=imdb_id"
    try:
        with urllib.request.urlopen(movie_id_request) as url:
            movie_id_results = json.loads(url.read().decode())
        movie_id = movie_id_results['movie_results'][0]['id']

    except:
        movie_id = "N/A"
    if movie_id != "N/A":
        review_request = "https://api.themoviedb.org/3/movie/" + str(movie_id) + "/reviews?api_key=" + REVIEWS_KEY
        try:
            with urllib.request.urlopen(review_request) as url:
                results = json.loads(url.read().decode())['results']
                reviews = [x['content'] for x in results]
                return reviews
        except:
            return []

In [7]:
def get_movie_metadata(name, script):
    metadata = dict()
    movie_num = np.where(movie_titles_movies == name)
    metadata_name = np.array(movies["title"].tolist())[movie_num][0]
    genres = json.loads(np.array(movies["genres"].tolist())[movie_num][0])
    metadata["genres"] = [x["name"].lower() for x in genres]
    fname = script[:len(script)-4]
    metadata["script"] = fname[fname.rfind('/')+1:].lower()
    metadata["id"] = int(np.array(movies["id"].tolist())[movie_num][0])
    release_year = np.array(movies["release_date"].tolist())[movie_num][0].split('-')[0]
    metadata["release_yr"] = release_year
    metadata["rating"] = str(np.array(movies["vote_average"].tolist())[movie_num][0])
    metadata["budget"] = str(np.array(movies["budget"].tolist())[movie_num][0])
    metadata["box_office"] = str(np.array(movies["revenue"].tolist())[movie_num][0])
    metadata["synopsis"] = np.array(movies["overview"].tolist())[movie_num][0]
    metadata["num_awards"] = 0
    
    try:
        num = np.where(movie_titles_metadata == metadata_name)
        metadata["name"] = np.array(metadata_csv["title"].tolist())[num][0]
        metadata["review_score"] = np.array(metadata_csv["review_score"].tolist())[num][0]
        metadata["poster_image_url"] = np.array(metadata_csv["poster"].tolist())[num][0]
        imdbID = np.array(metadata_csv["imdbID"].tolist())[num][0]
        metadata["imdb_url"] = "https://www.imdb.com/title/" + imdbID
        metadata["imdb_reviews"] = np.array(metadata_csv["reviews"].tolist())[num][0]
    
    except:
        metadata["name"] = metadata_name
        poster_title = metadata["name"].lower()
        poster_title = poster_title.replace(":", "%3c")
        poster_title = poster_title.replace("&", "%26")
        poster_title = poster_title.replace("/", "%2f")
        poster_title = poster_title.replace(",", "%2c")
        poster_title = poster_title.replace("+", "%2b")
        poster_title = '+'.join(poster_title.split(' '))
        omdb_request = "http://omdbapi.com/?apikey=" + MY_KEY + "&t=" + poster_title + "&y=" + release_year
        imdbID = ""
    
        try:
            with urllib.request.urlopen(omdb_request) as url:
                omdb_results = json.loads(url.read().decode())
            metadata["review_score"] = omdb_results["imdbRating"]
            metadata["poster_image_url"] = omdb_results["Poster"]
            imdbID = omdb_results["imdbID"]
            metadata["imdb_url"] = "https://www.imdb.com/title/" + imdbID
            metadata["imdb_reviews"] = get_reviews(imdbID)
        except:
            metadata["review_score"] = "N/A"
            metadata["poster_image_url"] = "N/A"
            metadata["imdb_url"] = "N/A"
            metadata["imdb_reviews"] = []
        
        with open("./data/metadata.csv", "a") as output:
            writer = csv.writer(output, lineterminator='\n')
            writer.writerow([metadata["name"], metadata["review_score"], metadata["poster_image_url"], imdbID, metadata["imdb_reviews"]])
        
    
    analyzer = SentimentIntensityAnalyzer()

    overall_compound = 0
    for review in metadata["imdb_reviews"]:
        vs = analyzer.polarity_scores(review)
        overall_compound += vs["compound"]
        
    if len(metadata["imdb_reviews"]) > 0:
        overall_compound = overall_compound / len(metadata["imdb_reviews"])
        if(overall_compound) >= 0.05:
            metadata["imdb_review_sentiment"] = "Positive"
        elif(overall_compound) < -0.05:
            metadata["imdb_review_sentiment"] = "Negative"
        else:
            metadata["imdb_review_sentiment"] = "Neutral"
    else:
        metadata["imdb_review_sentiment"] = "Neutral"

    metadata["slug"] = re.sub(r'[-\s]+', '-', (re.sub(r'[^\w\s-]', '',metadata["name"]).strip().lower()))

    return metadata

In [8]:
def get_cast_and_crew(name):
    movie_num = np.where(movie_titles_creds == name)
    cast = np.array(credits["cast"].tolist())[movie_num][0]
    crew = np.array(credits["crew"].tolist())[movie_num][0]
    return cast, crew

In [9]:
def get_actor_metadata(cast):
    metadata = dict()
    
    char_list = json.loads(cast)
    for person in char_list:
        metadata[person['name']] = {'actor_id' : person['cast_id'], 'char_name' : person['character']}
    return metadata

In [10]:
def get_crew_metadata(crew):
    metadata = dict()
    crew_list = json.loads(crew)
    for person in crew_list:
        metadata[person['name']] = {'crew_id' : person['id'], 'job_name' : person['job']}
    return metadata

In [11]:
def hashfeatures(baby, B, FIX):
    v = np.zeros(B)
    for m in range(FIX):
        featurestring = "prefix" + baby[:m]
        v[hash(featurestring) % B] = 1
        featurestring = "suffix" + baby[-1*m:]
        v[hash(featurestring) % B] = 1
    return v

def name2features(filename, B=104729, FIX=5, LoadFile=True):
    if LoadFile:
        with open(filename, 'r') as f:
            babynames = [x.rstrip() for x in f.readlines() if len(x) > 0]
    else:
        babynames = filename.split('\n')
    n = len(babynames)
    X = np.zeros((n, B))
    for i in range(n):
        X[i,:] = hashfeatures(babynames[i], B, FIX)
    return X

def genTrainFeatures(dimension=128, fix=3):
    Xgirls = name2features("./girls.train", B=dimension, FIX=fix)
    Xboys = name2features("./boys.train", B=dimension, FIX=fix)
    X = np.concatenate([Xgirls, Xboys])
    Y = np.concatenate([-np.ones(len(Xgirls)), np.ones(len(Xboys))])
    ii = np.random.permutation([i for i in range(len(Y))])
    return X[ii, :], Y[ii]
X,Y = genTrainFeatures(128)

def naivebayesPY(x,y):
    y = np.concatenate([y, [-1,1]])
    n = len(y)
    unique_elements, counts_elements = np.unique(y, return_counts=True)
    neg = counts_elements[0] / n
    pos = counts_elements[1] / n
    return pos, neg
pos,neg = naivebayesPY(X,Y)

def naivebayesPXY(x,y):
    n, d = x.shape
    x = np.concatenate([x, np.ones((2,d))])
    y = np.concatenate([y, [-1,1]])
    n, d = x.shape
    x_pos = x[np.where(y==1)]
    x_neg = x[np.where(y==-1)]
    x_pos_sum = np.sum(x_pos, axis=0)
    x_neg_sum = np.sum(x_neg, axis=0)
    x_pos_sum_denom = np.sum(x_pos_sum)
    x_neg_sum_denom = np.sum(x_neg_sum)
    posprob = np.array([x_pos_sum/x_pos_sum_denom])
    negprob = np.array([x_neg_sum/x_neg_sum_denom])
    return posprob, negprob
posprob,negprob = naivebayesPXY(X,Y)

def naivebayes(x,y,xtest):
    ppos, pneg = naivebayesPY(x,y)
    theta_pos, theta_neg = naivebayesPXY(x,y)
    log_theta_pos = np.log(theta_pos)
    log_theta_neg = np.log(theta_neg)
    return (np.sum(xtest * log_theta_pos) + np.log(ppos) - (np.sum(xtest * log_theta_neg) + np.log(pneg)))
p = naivebayes(X,Y,X[0,:])

def naivebayesCL(x,y):
    n, d = x.shape
    ppos, pneg = naivebayesPY(x,y)
    theta_pos, theta_neg = naivebayesPXY(x,y)
    log_theta_pos = np.log(theta_pos)
    log_theta_neg = np.log(theta_neg)
    w = log_theta_pos - log_theta_neg
    b = np.log(ppos) - np.log(pneg)
    return w,b
w,b = naivebayesCL(X,Y)

def classifyLinear(x,w,b=0):
    w = w.reshape(-1)
    class_raw = np.dot(w.T, x.T) + b
    result = np.array(np.sign(class_raw))
    np.place(result, result == 0, [-1])
    return result
DIMS = 128
X,Y = genTrainFeatures(DIMS)
w,b=naivebayesCL(X,Y)
error = np.mean(classifyLinear(X,w,b) != Y)

def classify_name(name):
    name = name.capitalize()
    xtest = name2features(name, B = DIMS, LoadFile = False)
    pred = classifyLinear(xtest,w,b)[0]
    if pred > 0:
        return "Male"
    else:
        return "Female"

In [12]:
def get_gender_dict(cast, lines):
    char_list = json.loads(cast)
    char_choices = [x['character'] for x in char_list]
    char_choices_dict = {idx: el for idx, el in enumerate(char_choices)}
    
    gender_dict = dict()
    gender_dict_by_lines = dict()
    
    some_counter = 0
    
    for speaker in lines.keys():
        some_counter += 1
        match = process.extractOne(speaker, char_choices_dict)
        if not (match is None) and match[1] != 0:
            match_index = match[2]
            final_match = char_list[match_index]
            char_choices_dict.pop(match_index, None)
            gender = gender_mapping[final_match['gender']]
            if gender == 'Other':
                overall_gender = dict()
                char_gender = classify_name(final_match['character'])
                speaker_gender = classify_name(speaker)
                if char_gender in overall_gender.keys():
                    overall_gender[char_gender] += 1
                else:
                    overall_gender[char_gender] = 1
                if speaker_gender in overall_gender.keys():
                    overall_gender[speaker_gender] += 1
                else:
                    overall_gender[speaker_gender] = 1
                gender = max(overall_gender.keys(), key=(lambda k : overall_gender[k]))
            if final_match['character'] not in gender_dict.keys():
                gender_dict[final_match['character']] = speaker, final_match['name'], gender, final_match['cast_id']
            else:
                gender_dict[final_match['character'] + str(some_counter)] = speaker, final_match['name'], gender, final_match['cast_id']
        else:
            if speaker not in gender_dict.keys() and speaker not in gender_dict_by_lines.keys():
                gender_dict_by_lines[speaker] = speaker, 'N/A', classify_name(speaker), "N/A"
            else:
                gender_dict_by_lines[speaker + str(some_counter)] = speaker, 'N/A', classify_name(speaker), "N/A"
                
    new_gender_dict = copy.deepcopy(gender_dict)
    new_gender_dict.update(gender_dict_by_lines)
    
    return gender_dict, new_gender_dict

In [13]:
def get_crew_gender_dict(crew):
    crew_list = json.loads(crew)
    gender_dict = dict()
    
    for member in crew_list:
        name = member['name']
        given_gender = gender_mapping[member['gender']]
        classified_gender = classify_name(name)
        gender = given_gender
        if gender == "Other":
            gender = classified_gender
        
        gender_dict[name] = gender
    
    return gender_dict

In [14]:
def get_ethnicity_dict(genders):
    ethnicity_dict = dict()
    for character in genders.keys():
        speaker, name, gender, actor_id = genders[character]
        actor = '-'.join(name.lower().split(' '))
        try:
            actor_ethnicity = actor_ethnicities[actor_names.index(actor)]
            if(type(actor_ethnicity) != str):
                actor_ethnicity = "N/A"
            ethnicity_dict[character] = speaker, name, actor_ethnicity
        except ValueError:
            pass
        
    return ethnicity_dict

In [15]:
def analyze_gender(lines, genders):
    by_line = dict()
    by_char = dict()
    
    total_lines = 0
    total_chars = len(genders.keys())
    
    for char in genders.keys():
        speaker, name, gender, actor_id = genders[char]
        num_lines = len(lines[speaker])
        total_lines += num_lines
        
        if gender in by_line.keys():
            by_line[gender] += num_lines
            by_char[gender] += 1
        else:
            by_line[gender] = num_lines
            by_char[gender] = 1
            
    for char in by_line.keys():
        by_line[char] = round(by_line[char] / total_lines, 2)
        by_char[char] = round(by_char[char] / total_chars, 2)
        
    try:
        by_line["Male"] = by_line["Male"]
    except KeyError:
        by_line["Male"] = 0
        
    try:
        by_line["Female"] = by_line["Female"]
    except KeyError:
        by_line["Female"] = 0
        
    try:
        by_line["Other"] = by_line["Other"]
    except KeyError:
        by_line["Other"] = 0
        
    try:
        by_char["Male"] = by_char["Male"]
    except KeyError:
        by_char["Male"] = 0
        
    try:
        by_char["Female"] = by_char["Female"]
    except KeyError:
        by_char["Female"] = 0
        
    try:
        by_char["Other"] = by_char["Other"]
    except KeyError:
        by_char["Other"] = 0
        
    by_line = {"Male" : by_line["Male"], "Female" : by_line["Female"], "Other" : by_line["Other"]}
    by_char = {"Male" : by_char["Male"], "Female" : by_char["Female"], "Other" : by_char["Other"]}
    
    return by_line, by_char

In [16]:
def analyze_empath(lines, genders_by_line, race_dict):
    
    speaker_to_gender = dict()
    races = set()
    speaker_to_race = dict()
    race_emotions = dict()
    race_categories = dict()
    gender_line_len = dict()
    race_line_len = dict()
    
    for name in genders_by_line.keys():
        speaker, _, gender, _ = genders_by_line[name]
        speaker_to_gender[speaker] = gender
    
    for char in race_dict.keys():
        speaker, race = race_dict[char]
        races.add(race)
        speaker_to_race[speaker] = race
    
    for race in races:
        race_emotions[race] = lexicon.analyze("", categories=emotion_categories)
        race_categories[race] = lexicon.analyze("")
        race_line_len[race] = []
    
    gender_emotions = {"Female" : {'anger': 0.0,
                                   'disgust': 0.0,
                                   'envy': 0.0,
                                   'fear': 0.0,
                                   'joy': 0.0,
                                   'love': 0.0,
                                   'sadness': 0.0,
                                   'shame': 0.0,
                                   'surprise': 0.0},
                       "Male" : {'anger': 0.0,
                                   'disgust': 0.0,
                                   'envy': 0.0,
                                   'fear': 0.0,
                                   'joy': 0.0,
                                   'love': 0.0,
                                   'sadness': 0.0,
                                   'shame': 0.0,
                                   'surprise': 0.0},
                       "Other" : {'anger': 0.0,
                                   'disgust': 0.0,
                                   'envy': 0.0,
                                   'fear': 0.0,
                                   'joy': 0.0,
                                   'love': 0.0,
                                   'sadness': 0.0,
                                   'shame': 0.0,
                                   'surprise': 0.0}}
    
    gender_line_len = {"Female" : [], "Male" : [], "Other" : []}
    
    for speaker in lines.keys():
        for line in lines[speaker]:
            line_emotions = lexicon.analyze(line, categories=emotion_categories)
            line_categories = lexicon.analyze(line)
            line_len = len(set(re.findall(r'\w+', line)))
            gender = speaker_to_gender[speaker]
            for emotion in emotion_categories:
                gender_emotions[gender][emotion] += line_emotions[emotion]
                gender_line_len[gender] += [line_len]
                if speaker in speaker_to_race.keys():
                    race_emotions[race][emotion] += line_emotions[emotion]
                    race_line_len[race] += [line_len]
            for category in line_categories.keys():
                if speaker in speaker_to_race.keys():
                    race_categories[race][category] += line_categories[category]
    
    for gender in gender_emotions.keys():
        curr_gender_line_len = gender_line_len[gender]
        if len(curr_gender_line_len) > 0:
            gender_line_len[gender] = float(sum(curr_gender_line_len))/len(curr_gender_line_len)
        else:
            gender_line_len[gender] = 0
        total_emotion = 0
        for emotion in emotion_categories:
            total_emotion += gender_emotions[gender][emotion]
        if total_emotion > 0:
            for emotion in emotion_categories:
                gender_emotions[gender][emotion] = gender_emotions[gender][emotion] / total_emotion

    

    
    for race in race_emotions.keys():
        curr_race_line_len = race_line_len[race]
        if len(curr_race_line_len) > 0:
            race_line_len[race] = float(sum(curr_race_line_len))/len(curr_race_line_len)
        else:
            race_line_len[race] = 0
        total_emotion = 0
        for emotion in emotion_categories:
            total_emotion += race_emotions[race][emotion]
        if total_emotion > 0:
            for emotion in emotion_categories:
                race_emotions[race][emotion] = race_emotions[race][emotion] / total_emotion
    
    for k in race_categories.keys():
        race_categories[k] = (sorted(race_categories[k], key = race_categories[k].get)[-10:])
        race_categories[k].reverse()
            
    return gender_emotions, race_emotions, race_categories, gender_line_len, race_line_len

In [17]:
def analyze_gender_categories(lines, genders_by_line):
    
    empty_dict_mm = lexicon.analyze("")
    empty_dict_ff = lexicon.analyze("")
    empty_dict_mf = lexicon.analyze("")

    
    speaker_to_gender = dict()
    
    for name in genders_by_line.keys():
        speaker, _, gender, _ = genders_by_line[name]
        speaker_to_gender[speaker] = gender
    
    conv_categories = {"Male/Male" : empty_dict_mm, "Female/Female" : empty_dict_ff, "Male/Female" : empty_dict_mf}
    
    for line_ind in np.arange(len(lines) - 1):
        line1 = lines[line_ind]
        line2 = lines[line_ind + 1]
        analysis1 = lexicon.analyze(line1['utterance'])
        analysis2 = lexicon.analyze(line2['utterance'])
        g1 = speaker_to_gender[line1['speaker']]
        g2 = speaker_to_gender[line2['speaker']]
        if g1 == "Male" and g2 == "Male":
            category = "Male/Male"
        elif g1 == "Female" and g2 == "Female":
            category = "Female/Female"
        else:
            category = "Male/Female"
        for k in empty_dict_mm.keys():
            conv_categories[category][k] += analysis1[k] + analysis2[k]
   
    for k in conv_categories.keys():
        conv_categories[k] = (sorted(conv_categories[k], key = conv_categories[k].get)[-10:])
        conv_categories[k].reverse()
    return conv_categories

In [18]:
def analyze_crew_gender(crew_genders):
    total_crew = len(crew_genders.keys())
    by_member = dict()
    
    for member in crew_genders.keys():
        gender = crew_genders[member]
        
        if gender in by_member.keys():
            by_member[gender] += 1
        else:
            by_member[gender] = 1
            
    for member in by_member.keys():
        by_member[member] = round(by_member[member] / total_crew, 2)
        
    try:
        by_member["Male"] = by_member["Male"]
    except KeyError:
        by_member["Male"] = 0
        
    try:
        by_member["Female"] = by_member["Female"]
    except KeyError:
        by_member["Female"] = 0
        
    try:
        by_member["Other"] = by_member["Other"]
    except KeyError:
        by_member["Other"] = 0
        
    by_member = {"Male" : by_member["Male"], "Female" : by_member["Female"], "Other" : by_member["Other"]}
    
    return by_member

In [19]:
def analyze_ethnicity(lines, ethnicities):
    by_line = dict()
    by_char = dict()
    race_dict = dict()
    
    total_lines = 0
    total_chars = len(ethnicities.keys())
    
    for char in ethnicities.keys():
        speaker, name, ethnicity = ethnicities[char]
        num_lines = len(lines[speaker])
        total_lines += num_lines
        
        char_ethnicities = re.findall(r'[a-zA-Z]+', ethnicity)
        char_ethnicities = [x for x in char_ethnicities if x[0].isupper()]
        char_race = set()
         
        for e in char_ethnicities:
            try:
                race_num = np.where(ethnicity_mapping == e)
                races = race_mapping[race_num]
                if len(races) > 0:
                    char_race.add(races[0])
            except Error:
                pass
        
        race_dict[char] = speaker, ", ".join(char_race)     
        
        
        for race in char_race:
            if race in by_line.keys():
                by_line[race] += num_lines
                by_char[race] += 1
            else:
                by_line[race] = num_lines
                by_char[race] = 1
                
    for char in by_line.keys():
        by_line[char] = round(by_line[char] / total_lines, 2)
        by_char[char] = round(by_char[char] / total_chars, 2)
            
    return by_line, by_char, race_dict

In [20]:
def find_next_female_speaker(transcript, genders, start_loc):
    #genders = bechdel_genders in analyze_bechdel
    #start_loc = where in transcript to start looking
    #returns index of transcript with next female speaker if exists and -1 otherwise
    
    if start_loc >= len(transcript):
        return -1
    
    else:
        curr_transcript = transcript[start_loc:]
        offset = 0
        while offset < len(curr_transcript):
            curr_speaker = curr_transcript[offset]['speaker']
            curr_gender = genders[curr_speaker]
            if curr_gender != "Female":
                offset += 1
            else:
                break
        if offset == len(curr_transcript):
            return -1
        else:
            return start_loc + offset

In [21]:
def find_female_conversation(transcript, genders, start_loc):
    
    if start_loc >= len(transcript):
        return [], -1
    curr_transcript = transcript[start_loc:]
    
    index = 0
    conversation = []
    names = set()
    while index < len(curr_transcript):
        curr_speaker = curr_transcript[index]['speaker']
        curr_gender = genders[curr_speaker]
        if curr_gender == "Female":
            names.add(curr_speaker)
        if curr_gender != "Female":
            if len(conversation) <= 1 or len(names) < 2:
                return [], start_loc + index
            else:
                return conversation, start_loc + index
        else:
            conversation.append(curr_transcript[index])
            index += 1
    
    if len(conversation) <= 1:
        return [], start_loc + index
    else:
        return conversation, start_loc + index

In [22]:
def analyze_conversation(conversation, genders):
    
    genders_by_speaker = dict()
    genders_by_character = dict()
    
    for name in genders.keys():
        speaker, _, gender, _ = genders[name]
        genders_by_speaker[speaker] = gender
        genders_by_character[name] = gender
        
    merged_dict = genders_by_speaker.copy()
    merged_dict.update(genders_by_character)
    
    name_choices = list(merged_dict.keys())
    
    male_words = ['he', 'him', 'his', 'himself', 'boy', 'boys', 'man', 'men', 'husband', 'son', 'father', 'brother', 'dad']
    all_words = set() 
    for line in conversation:
        words = set(re.findall(r'\w+', line['utterance']))
        all_words.update(words)
        
    word_list = list(all_words)
    uppercase_words = [x for x in word_list if len(x) > 0 and x[0].isupper()]
    lowercase_words = [x.lower() for x in word_list]
    
    uses_pronouns = any(x in male_words for x in lowercase_words)
    uses_names = False
    
    for word in uppercase_words:
        match = process.extractOne(word, name_choices)
        if not match is None:
            speaker, _ = match
            name_choices.remove(speaker)
            if speaker in genders_by_speaker.keys():
                gender = genders_by_speaker[speaker]
            else:
                gender = genders_by_character[speaker]
        else:
            gender = classify_name(word)
            has_male_markers = any(x.lower() in ['man', 'boy', 'men', 'boys', 'guy', 'guys'] for x in word.split(' '))
            has_female_markers = any(x.lower() in ['woman', 'girl', 'women', 'girls'] for x in speaker.split(' '))
            if gender == 'Male' or (has_male_markers and not has_female_markers):
                gender = 'Male'
        
        if gender == 'Male':
            uses_names = True
            break
        
    return not (uses_pronouns or uses_names)

In [23]:
def analyze_bechdel(transcript, genders):
    
    bechdel_genders = dict()
    
    for g in genders.keys():
        speaker, _, gender, _ = genders[g]
        bechdel_genders[speaker] = gender
        
        
    if(len(transcript) == 0):
        return dict(), False
    
    else:
        index = find_next_female_speaker(transcript, bechdel_genders, 0)
        while(index != -1 and index < len(transcript)):
            conversation, end_index = find_female_conversation(transcript, bechdel_genders, index)
            end_index += 1
            if len(conversation) == 0:
                index = find_next_female_speaker(transcript, bechdel_genders, end_index)
            else:
                valid_conversation = analyze_conversation(conversation, genders)
                if valid_conversation:
                    return conversation, True
                else:
                    index = find_next_female_speaker(transcript, bechdel_genders, end_index)
        return dict(), False

In [24]:
def analyze_screentime(lines, genders):
    total_lines = 0
    screentime_dict = dict()
    
    for char in genders.keys():
        speaker, name, _, _ = genders[char]
        num_lines = len(lines[speaker])
        total_lines += num_lines
        screentime_dict[name] = num_lines
    
    for name in screentime_dict.keys():
        screentime_dict[name] = round(screentime_dict[name] / total_lines, 2)
        
    return screentime_dict

In [25]:
def get_char_metadata(lines, genders, races, screen_time):
    
    metadata = dict()
    
    for char in genders.keys():
        speaker, name, gender, actor_id = genders[char]
        race = "N/A"
        if char in races.keys():
            speaker, race = races[char]
        time = screen_time[name]
        metadata[name] = {"actor_id" : str(actor_id), "char_name" : speaker.capitalize(), "screen_time" : time, "race" : race, "gender" : gender}
    return metadata

In [26]:
def get_distr_metadata(g_line, g_char, g_crew, r_line, r_char):
    metadata = dict()
    
    metadata["gender_dist"] = {
        "by_movie": {k.lower(): v for k, v in g_char.items()},
        "by_line": {k.lower(): v for k, v in g_line.items()},
        "by_crew" : {k.lower(): v for k, v in g_crew.items()}}
    metadata["race_dist"] = {
        "by_movie": {k.lower(): v for k, v in r_char.items()},
        "by_line": {k.lower(): v for k, v in r_line.items()}
    }
    metadata["stereotype_dist"] = {"by_movie" : [["Stereotypical", 0], ["Not stereotypical", 1]],
                                   "by_line" : [["Stereotypical", 0], ["Not stereotypical", 1]]}
    
    return metadata

In [27]:
def get_bechdel_metadata(transcript, genders):
    metadata = dict()
    
    conversation, passes = analyze_bechdel(transcript, genders)
    metadata["passes"] = passes
    metadata["conversation"] = conversation
    
    return metadata

In [28]:
def get_empath_metadata(gender_emotions, gender_categories, race_emotions, race_categories):
    metadata = dict()
    metadata["gender_emotion_metadata"] = {k.lower(): v for k, v in gender_emotions.items()}
    metadata["gender_category_metadata"] = {k.lower(): v for k, v in gender_categories.items()}
    metadata["race_emotion_metadata"] = {k.lower(): v for k, v in race_emotions.items()}
    metadata["race_category_metadata"] = {k.lower(): v for k, v in race_categories.items()}

    return metadata

In [29]:
def get_line_len_metadata(gender, race):
    metadata = dict()
    metadata["average_by_gender"] = {k.lower(): v for k, v in gender.items()}
    metadata["average_by_race"] = {k.lower(): v for k, v in race.items()}

    return metadata

In [30]:
def get_metadata_json(movie, script_path):
    """
    writes movie json to ./data/parsed_scripts/[movie_slug].json
    """

    movie_metadata = get_movie_metadata(movie, script_path)    
    
    movie_cast, movie_crew = get_cast_and_crew(movie)
    actor_metadata = get_actor_metadata(movie_cast)
    crew_metadata = get_crew_metadata(movie_crew)
    transcript = parse_transcript(script_path)
    line_dict = get_lines(transcript)
    gender_dict, gender_dict_by_lines = get_gender_dict(movie_cast, line_dict)
    
    crew_gender_dict = get_crew_gender_dict(movie_crew)
    ethnicity_dict = get_ethnicity_dict(gender_dict)
    gender_by_line, gender_by_char = analyze_gender(line_dict, gender_dict_by_lines)
    gender_by_crew = analyze_crew_gender(crew_gender_dict)
    race_by_line, race_by_char, race_dict = analyze_ethnicity(line_dict, ethnicity_dict)
    screen_time = analyze_screentime(line_dict, gender_dict)
    gender_emotions, race_emotions, race_categories, gender_line_len, race_line_len = analyze_empath(line_dict, gender_dict_by_lines, race_dict)
    gender_categories = analyze_gender_categories(transcript, gender_dict_by_lines)


    char_metadata = get_char_metadata(line_dict, gender_dict, race_dict, screen_time)
    distribution_metadata = get_distr_metadata(gender_by_line, gender_by_char, gender_by_crew, race_by_line, race_by_char)
    
    bechdel_metadata = get_bechdel_metadata(transcript, gender_dict_by_lines)
    empath_metadata = get_empath_metadata(gender_emotions, gender_categories, race_emotions, race_categories)
    line_len_metadata = get_line_len_metadata(gender_line_len, race_line_len)
    
    
    metadata = {"movie_metadata" : movie_metadata, 
                "actor_metadata" : actor_metadata,
                "crew_metadata" : crew_metadata,
                "char_metadata" : char_metadata, 
                "distribution_metadata" : distribution_metadata,
                "bechdel_metadata" : bechdel_metadata,
                "empath_metadata" : empath_metadata,
                "line_length_metadata" : line_len_metadata
               }
    
    file = script_path[:-4]
    file = os.path.dirname(file).replace("/scripts", "/parsed_scripts") + "/%s.json" % movie_metadata["slug"]
    
    target_dir = os.path.dirname(file)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    with open(file, 'w+') as outfile:
        json.dump(metadata, outfile, indent=4, sort_keys=True)
    
    # also write to app data
    with open("../app/data/movies/%s.json" % movie_metadata["slug"], 'w+') as outfile:
        json.dump(metadata, outfile, indent=4, sort_keys=True)

In [33]:
MY_KEY = "8b0aeb00"
REVIEWS_KEY = "b0d4c725e171d3cb40ded4d9ce2989b7"
transcripts = os.listdir("./data/scripts")

def parse_title(title):
    title = title[:-4].replace("-", " ")
    if title[-5:] == ", The":
        title = "The " + title[:-5] 
    return title

tran_movies = [parse_title(title) for title in transcripts]

with open('./data/movies.txt') as f:
    all_movies = f.read().splitlines()
    
valid_movies = set([])
for index in np.arange(len(all_movies)):
    for movie in all_movies:
        clean_tran_movie = re.sub(r'\W+', '', all_movies[index]).lower()
        clean_cred_movie = re.sub(r'\W+', '', movie).lower()
        if clean_tran_movie == clean_cred_movie[:len(clean_tran_movie)]:
            valid_movies.add(movie)

multiple_matches = []
removed_movies = set()

counter = 0

for tran_movie_index in np.arange(len(tran_movies)):
    
    matching_movie = set()

    for movie in valid_movies:
        
        clean_tran_movie = re.sub(r'\W+', '', tran_movies[tran_movie_index]).lower()
        clean_cred_movie = re.sub(r'\W+', '', movie).lower()
        
        if clean_tran_movie == clean_cred_movie:
            
            matching_movie = set()
            matching_movie.add(movie)
            break
            
        elif clean_tran_movie == clean_cred_movie[:len(clean_tran_movie)]:
            
            matching_movie.add(movie)

    if len(matching_movie) == 1:
        
        script_path = "./data/scripts/" + transcripts[tran_movie_index]
        movie_name = matching_movie.pop()
        removed_movies.add(movie_name)
        get_metadata_json(movie_name, script_path)

    if len(matching_movie) > 1:
        
        script_path = "./data/scripts/" + transcripts[tran_movie_index]
        multiple_matches = multiple_matches + [(script_path, matching_movie, clean_tran_movie)]

for i in np.arange(len(multiple_matches)):
    
    script_path, movie_name, script_name = multiple_matches[i]
    movie_name = list(movie_name - removed_movies)
    script_name_len = len((script_path[15:])[:-4])
    final_movie = [movie for movie in movie_name if len(movie) >= script_name_len]
    
    if len(final_movie) != 0:
        
        match = process.extractOne(script_name, final_movie)
    
        if not match is None:
        
            final_movie = match[0]
            removed_movies.add(final_movie)
            get_metadata_json(final_movie, script_path)
            
        else:
            
            final_movie = final_movie[0]
            get_metadata_json(final_movie, script_path)            

    else:
        
        match = process.extractOne(script_name, movie_name)
        
        if not match is None:
            
            final_movie = match[0]
            removed_movies.add(final_movie)
            get_metadata_json(final_movie, script_path)  
            
        else:
            
            if len(movie_name) > 0:
                
                final_movie = min(movie_name, key=len)
                removed_movies.add(final_movie)
                get_metadata_json(final_movie, script_path)            

{'Female': [22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 20, 20, 20, 20, 20, 20, 12, 12, 12, 12, 12, 12, 12, 12, 12, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 11, 11, 11, 11, 11, 11, 32, 32, 32, 32, 32, 32, 32, 32, 32, 9, 9, 9, 9, 9, 9, 9, 9, 9, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 12, 12, 12, 12, 12, 12, 12, 12, 12, 26, 26, 26, 26, 26, 26, 26, 26, 26, 13, 13, 13, 13, 13, 13, 13, 13, 13, 4, 4, 4, 4, 4, 4, 4, 4, 4, 23, 23, 23, 23, 23, 23, 23, 23, 23, 9, 9, 9, 9, 9, 9, 9, 9, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 36, 36, 36, 36, 36, 36, 36, 36, 36, 26, 26, 26, 26, 26, 26, 26, 26, 26, 8, 8, 8, 8, 8, 8, 8, 8, 8, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 8, 8, 8, 8, 8, 27, 27, 27, 27, 27, 27, 27, 27, 27, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12, 12

{'Female': [31, 31, 31, 31, 31, 31, 31, 31, 31, 39, 39, 39, 39, 39, 39, 39, 39, 39, 13, 13, 13, 13, 13, 13, 13, 13, 13, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 13, 13, 13, 13, 13, 13, 13, 13, 13, 3, 3, 3, 3, 3, 3, 3, 3, 3, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 7, 7, 7, 7, 7, 7, 7, 7, 7, 22, 22, 22, 22, 22, 22, 22, 22, 22, 4, 4, 4, 4, 4, 4, 4, 4, 4, 56, 56, 56, 56, 56, 56, 56, 56, 56, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 59, 59, 59, 59, 59, 59, 59, 59, 59, 6, 6, 6, 6, 6, 6, 6, 6, 6, 19, 19, 19, 19, 19, 19, 19, 19, 19, 6, 6, 6, 6, 6, 6, 6, 6, 6, 43, 43, 43, 43, 43, 43, 43, 43, 43, 47, 47, 47, 47, 47, 47, 47, 47, 47, 4, 4, 4, 4, 4, 4, 4, 4, 4, 118, 118, 118, 118, 118, 118, 118, 118, 118, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 15, 15, 15, 15, 15, 15, 15, 15, 15, 34, 34, 34, 34, 34, 34, 34, 34, 34, 19, 19, 19, 19, 



{'Female': [0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 25, 25, 25, 25, 25, 25, 25, 25, 25, 27, 27, 27, 27, 27, 27, 27, 27, 27, 17, 17, 17, 17, 17, 17, 17, 17, 17, 12, 12, 12, 12, 12, 12, 12, 12, 12, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 22, 22, 22, 22, 22, 22, 22, 22, 22, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 34, 34, 34, 34, 34, 34, 34, 34, 34, 83, 83, 83, 83, 83, 83, 83, 83, 83, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 11, 11, 11, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 12, 12, 12, 12, 12, 12

{'Female': [7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 19, 19, 19, 19, 19, 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 22, 22, 22, 22, 22, 22, 22, 22, 22, 8, 8, 8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11, 11, 11, 11, 11, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 8, 8, 8, 8, 8, 8, 8, 8, 8, 22, 22, 22, 22, 22, 22, 22, 22, 22, 5, 5, 5, 5, 5, 5, 5, 5, 5, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 1, 1, 1, 1, 1, 1, 1, 1, 1

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/lapra/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-33-23ac4635f166>", line 53, in <module>
    get_metadata_json(movie_name, script_path)
  File "<ipython-input-30-9a0565d35efa>", line 22, in get_metadata_json
    gender_categories = analyze_gender_categories(transcript, gender_dict_by_lines)
  File "<ipython-input-17-ddf5d99e35cc>", line 20, in analyze_gender_categories
    analysis2 = lexicon.analyze(line2['utterance'])
  File "/home/lapra/.local/lib/python3.5/site-packages/empath/core.py", line 46, in analyze
    for t in self.cats[k]: invcats[t].append(k)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/lapra/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 1863, in showtraceback
    stb = value._re

KeyboardInterrupt: 