# Nomination and Predication

Detecting how actors are named in a text and what is attributed to them.  
Detect discrimination by analysing the nomination and predication.

## Orgniastional Part

In [1]:
# notebook imports

# general machine learning imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# spacy packages
import spacy
from spacy import displacy
from spacy import tokenizer
import coreferee
from spacytextblob.spacytextblob import SpacyTextBlob

In [2]:
# initialise spacy
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('merge_entities')
nlp.add_pipe('coreferee')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x1c2c210b650>

In [4]:
# load text data
with open('text/Test_Text_I.txt', 'r') as f:
    text_one = f.read()    
with open('text/Test_Text_II.txt', 'r') as f:
    text_two = f.read()    
with open('text/Test_Text_III.txt', 'r') as f:
    text_three = f.read()
with open('text/Hillary_Clinton.txt', 'r') as f:
    hillary = f.read()
with open('text/Bill_Clinton.txt', 'r') as f:
    bill = f.read()

# convert to SpaCy format
text_one = nlp(text_one)
text_two = nlp(text_two)
text_three = nlp(text_three)
hillary = nlp(hillary)
bill = nlp(bill)

## Nomination

SpaCy automaticaly performes tokenisation, dependency parsing, part of speech tagging and named entity recognition.  
Therefore, we only need to identify the entities that represent actors in the text and co-reference all passages that mention the actors.

In [5]:
# nomination functions

# general variables
generic_woman = ['woman', 'women', 'mother', 'mom', 'grandmother', 'grandma', 'lady', 'ladies']
generic_man = ['man', 'men', 'father', 'dad', 'grandfather', 'granddad', 'gentleman', 'gentlemen']
# https://en.wikipedia.org/wiki/Category:Pejorative_terms_for_women
# https://en.wikipedia.org/wiki/Category:Pejorative_terms_for_men
# https://genderkit.org.uk/slurs/
# https://en.wiktionary.org/wiki/Category:English_swear_words
abusive_woman = ['battle-axe', 'bimbo', 'bitch', 'boseulachi', 'cougar',
                     'crone', 'cunt', 'dyke', 'feminazi', 'floozie', 'hag',
                     'harpy', 'moll', 'nakusha', 'shiksa', 'shrew', 'skintern',
                     'slut', 'spinster', 'trollop ', 'twat', 'virago', 'WAG',
                     'whore', 'skank']
abusive_man = ['cock', 'cuckold', 'dick', 'fop', 'fag', 'himbo','lothario',
                   'mamil', 'manlet', 'motherfucker','neckbeard', 'prick',
                   'incel', 'pansy', 'wanker', 'bastard', 'pussy', 'bugger', 'dickhead']
abusive_trans = ['tranny', 'hermaphrodite', 'shemale', 'heshe',
                    'chick-with-a-dick', 'transgenderist']
abusive_general = ['arse', 'arsehead', 'arsehole', 'ass', 'asshole', 'cocksucker',
                      'fatherfucker', 'fucker', 'motherfucker', 'pigfucker',
                      'sisterfucker', 'brotherfucker', 'turd']

# returns all named entities that are persons and generic names
def get_actors(text):
    actor_dict = {}
    for token in text:
        if token.dep_ == 'compound' and token.head.ent_type_ == 'PERSON':
            if token.head.text in actor_dict:
                actor_dict[token.head.text].append(token)
            else: actor_dict[token.head.text] = [token]
        elif token.ent_type_ == 'PERSON' and "'" not in token.text:
            if token.text in actor_dict:
                actor_dict[token.text].append(token)
            else: actor_dict[token.text] = [token]
    return actor_dict



# scan actor list if key is part of another actor
def combine_names(actor_dict):
    flagged_keys = []

    for key in actor_dict:
        for second_key in actor_dict:
            if key != second_key:
                if key in second_key:
                    actor_dict[second_key].extend(actor_dict[key])
                    flagged_keys.append(key)
    for key in flagged_keys:
        if key in actor_dict:
            del actor_dict[key]
    return actor_dict


def get_generic_names(text, actor_dict):
    generic_names = generic_woman + generic_man + abusive_woman + abusive_man + abusive_trans
    for token in text:
        if token.text.lower() in generic_names:
                if token.text in actor_dict:
                    actor_dict[token.text.lower()].append(token)
                else: actor_dict[token.text.lower()] = [token]
    return actor_dict


# given a text, extract the pronouns and gender of each actor
def get_pronouns_gender(knowledgebase, text):
    woman = ['she', 'her', 'hers', 'herself']
    man = ['he', 'his', 'him', 'himself']
    # https://uwm.edu/lgbtrc/support/gender-pronouns/
    nonBinary = ['they', 'them', 'thier', 'thiers', 'themself',
                  'zie', 'zim', 'zir', 'zis', 'zieself',
                  'sie', 'hir', 'hirs', 'hirself',
                  'ey', 'em', 'eir', 'eirs', 'eirself',
                  've', 'ver', 'vis', 'vers', 'verself',
                  'tey', 'ter', 'tem', 'ters', 'terself',
                  'e', 'emself',
                  'fae', 'faer', 'faers', 'faerself',
                  'ae', 'aer', 'aers', 'aerself',
                  'per', 'pers', 'perself',
                  'xe', 'xem', 'xyr', 'xyrs', 'xemself',
                  'ze', 'hir', 'hirs', 'hirself']
    knowledgebase['pronoun'] = [list() for x in range(len(knowledgebase.index))]
    knowledgebase['pronoun_text'] = [list() for x in range(len(knowledgebase.index))]
    knowledgebase['gender'] = 'unknown'
    # iterate over all pronuns in text
    for token in text:
        if token.pos_ == 'PRON':
           # if pronoun check if it can be resolved to an actor
            resolved_actor = text._.coref_chains.resolve(text[token.i])
            if resolved_actor is not None and len(resolved_actor) == 1:
                for index, value in knowledgebase['nomination'].items():
                    if resolved_actor[0] in value:
                        knowledgebase['pronoun'][index].append(token)
                        knowledgebase['pronoun_text'][index].append(token.text.lower())
                        break
    # resolve pronouns to one gender
    # assign woman or man if majority of pronouns referes to one gender
    # at least 5 pronouns are neccessary
    majorety_percent = 0.7
    for index, value in knowledgebase['pronoun_text'].items():
        if len(knowledgebase['pronoun_text'][index]) > 4:
            woman_count = 0
            man_count = 0
            for pronoun in value:
                if pronoun in woman:
                    woman_count += 1
                elif pronoun in man:
                    man_count += 1
            if woman_count != 0:
                if woman_count / len(knowledgebase['pronoun_text'][index]) >= majorety_percent:
                    knowledgebase['gender'][index] = 'woman'
            if man_count != 0 :
                if man_count / len(knowledgebase['pronoun_text'][index]) >= majorety_percent:
                    knowledgebase['gender'][index] = 'man'
            if set(value).issubset(set(nonBinary)):
                knowledgebase['gender'][index] = 'non-binary'
        elif len(knowledgebase['pronoun_text'][index]) != 0:
            if set(value).issubset(set(woman)):
                knowledgebase['gender'][index] = 'woman'
            if set(value).issubset(set(man)):
                knowledgebase['gender'][index] = 'man'
            if set(value).issubset(set(nonBinary)):
                knowledgebase['gender'][index] = 'non-binary'
    del knowledgebase['pronoun_text']
    # resolve generic names to generic pronouns
    for index in knowledgebase.index:
        if index in generic_woman or index in abusive_woman:
            knowledgebase['gender'][index] = 'woman'
        elif index in generic_man or index in abusive_man:
            knowledgebase['gender'][index] = 'man'
    return knowledgebase

# build the knowledge base as a pandas data frame
def build_knowledgebase_nomination(text):
    # extract all actors and co-reference them
    actors = combine_names(get_actors(text))
    actors = get_generic_names(text, actors)
    # build the initial knowledgebase and tidy it a bit
    knowledgebase = pd.Series(actors).to_frame()
    knowledgebase.rename(columns={0:'nomination'},inplace=True)
    # add pronouns and gender of the actors
    knowledgebase = get_pronouns_gender(knowledgebase, text)
    return knowledgebase

## Predication

Extract all sentences that contain an actor, if more than one actor in sentence assign the corresponding snetence parts to the actors.

In [6]:
# predication functions

# get sentences that contain an actor
def get_sentences(actor_nomination, actor_predication, text):
    actor_sentences = []
    actor_text = []
    for name in actor_nomination:
        token_span = text[name.i:name.i+1]
        name_text = token_span.sent.text.strip()
        if name_text not in actor_text:
            actor_text.append(name_text)
            actor_sentences.append(token_span.sent)
    for pronoun in actor_predication:
        token_span = text[pronoun.i:pronoun.i+1]
        pronoun_text = token_span.sent.text.strip()
        if pronoun_text not in actor_text:
            actor_text.append(pronoun_text)
            actor_sentences.append(token_span.sent)
    return actor_sentences

def get_predication(knowledgebase, text):
    knowledgebase['predication'] = [list() for x in range(len(knowledgebase.index))]
    for key in knowledgebase.index:
        predication = []
        predication = get_sentences(knowledgebase['nomination'][key], knowledgebase['pronoun'][key], text)
        knowledgebase['predication'][key] = predication

    return knowledgebase

## Discrimination Detection
Compute the following measures:  
- Sentiment and subjectivity per actor  
- Female/male coded words in predication
- How often are actors mentioned (per gender)
- How often are actors mentioned in a positive/negative way (per gender)
- How often is derogatory language used per gender

In [7]:
# discrimination detection functions

# returns -1 for very negative sentiment, 0 for a neutral one and 1 for a very positive sentiment
def get_sentiment(text_list):
    sentiment_list = []
    for text in text_list:
        sentiment_list.append(text._.polarity)
    return sentiment_list


def add_sentiment(knowledgebase):
    knowledgebase['sentiment'] = [list() for x in range(len(knowledgebase.index))]
    for key, value in knowledgebase['predication'].items():
        knowledgebase['sentiment'][key].extend(get_sentiment(value))
    return knowledgebase

# detecting gender coded words in text
def get_gender_words(text, gender):
    feminin_coded = ['agree', 'affectionate', 'child', 'cheer', 'collab',
                    'commit', 'communal', 'compassion', 'connect', 'considerate',
                    'cooperat', 'co-operat', 'depend', 'emotiona', 'empath',
                    'feel', 'flatterable', 'gentle', 'honest', 'interpersonal',
                    'interdependen', 'interpersona', 'inter-personal',
                    'inter-dependen', 'inter-persona', 'kind', 'kinship', 'tender',
                    'together', 'trust', 'understand', 'warm', 'whin', 'enthusias',
                    'inclusive', 'yield', 'share', 'sharin']
    masculin_coded = ['active', 'adventurous', 'aggress', 'ambitio', 'analy', 'assert',
                  'athlet', 'autonom', 'battle', 'boast', 'challeng', 'champion',
                  'compet', 'confident', 'courag', 'decid', 'decision', 'decisive',
                  'persist', 'principle', 'reckless', 'self-confiden',
                  'self-relian', 'self-sufficien', 'selfconfiden', 'selfrelian',
                  'selfsufficien', 'stubborn', 'superior', 'unreasonab']
    coded_words_in_text = []
    if gender == 'woman':
        for stem in feminin_coded:
            if stem in text.text:
                coded_words_in_text.append(stem)
    elif gender == 'man':
        for stem in masculin_coded:
            if stem in text.text.lower():
                coded_words_in_text.append(stem)
    return coded_words_in_text

def add_gender_words(knowledgebase):
    knowledgebase['feminin-coded'] = [list() for x in range(len(knowledgebase.index))]
    knowledgebase['masculin-coded'] = [list() for x in range(len(knowledgebase.index))]
    for key, value in knowledgebase['predication'].items():
        for sentence in value:
            knowledgebase['feminin-coded'][key].extend(get_gender_words(sentence, 'woman'))
            knowledgebase['masculin-coded'][key].extend(get_gender_words(sentence, 'man'))
    return knowledgebase

def count_mentions_actor(knowledgebase, key):
    number_mentions = 0
    number_mentions += len(knowledgebase['nomination'][key])
    number_mentions += len(knowledgebase['pronoun'][key])
    return number_mentions

def count_mentions_gender(knowledgebase, gender):
    number_mentions = 0
    for key in knowledgebase.index:
        if knowledgebase['gender'][key] == gender:
            number_mentions += count_mentions_actor(knowledgebase, key)
    return number_mentions

def find_abusive_terms(predication):
    abusive_terms = []
    degatory_nomination = abusive_woman + abusive_man + abusive_trans + abusive_general
    for sentence in predication:
        for token in sentence:
            if token.text in degatory_nomination:
                abusive_terms.append(token)
    return abusive_terms


# compile report about a text
def compile_report(text):
    report = {}
    # build knowledgebase
    knowledgebase = build_knowledgebase_nomination(text)
    knowledgebase = get_predication(knowledgebase, text)
    knowledgebase = add_sentiment(knowledgebase)
    knowledgebase = add_gender_words(knowledgebase)

    visualisation_frame_overall = pd.DataFrame()
    visualisation_frame_actor = pd.DataFrame()
    visualisation_frame_actor['gender'] = knowledgebase.loc[:, 'gender']
    visualisation_frame_actor['mention_count'] = 0
    visualisation_frame_actor['sentiment'] = 0
    visualisation_frame_actor['feminin-coded_word_count'] = 0
    visualisation_frame_actor['masculin-coded_word_count'] = 0

    # temporary variables
    number_woman = 0
    number_man = 0
    number_nonBinary = 0
    number_unknowns = 0
    woman_actor_mentions = []
    man_actor_mentions = []
    nonBinary_actor_mentions = []
    unknown_actor_mentions = []
    woman_sentiment = 0
    man_sentiment = 0
    nonBinary_sentiment = 0
    unknown_sentiment = 0
    woman_actor_sentiment = []
    man_actor_sentiment = []
    nonBinary_actor_sentiment = []
    unknown_actor_sentiment = []
    woman_feminin_coded = 0
    man_feminin_coded = 0
    nonBinary_feminin_coded = 0
    unknown_feminin_coded = 0
    woman_masculin_coded = 0
    man_masculin_coded = 0
    nonBinary_masculin_coded = 0
    unknown_masculin_coded = 0
    woman_actor_feminin_coded = []
    man_actor_feminin_coded = []
    nonBinary_actor_feminin_coded = []
    unknown_actor_feminin_coded = []
    woman_actor_masculin_coded = []
    man_actor_masculin_coded = []
    nonBinary_actor_masculin_coded = []
    unknown_actor_masculin_coded = []
    woman_abusive_terms = []
    man_abusive_terms = []
    nonBinary_abusive_terms = []
    unknown_abusive_terms = []
    woman_actor_abusive_terms = []
    man_actor_abusive_terms = []
    nonBinary_actor_abusive_terms = []
    unknown_actor_abusive_terms = []
    frame_gender = []
    frame_actor_count = []
    frame_mention_count = []
    frame_sentiment = []
    frame_feminin_coded = []
    frame_masculin_coded = []

    for key, value in knowledgebase['gender'].items():
        if value == 'woman':
            number_woman += 1
            woman_actor_mentions.append([key, count_mentions_actor(knowledgebase, key)])
            woman_sentiment += sum(knowledgebase['sentiment'][key])
            woman_actor_sentiment.append([key, sum(knowledgebase['sentiment'][key]) / len(knowledgebase['sentiment'][key])])
            woman_feminin_coded += len(knowledgebase['feminin-coded'][key])
            woman_actor_feminin_coded.append([key, len(knowledgebase['feminin-coded'][key])])
            woman_masculin_coded += len(knowledgebase['masculin-coded'][key])
            woman_actor_masculin_coded.append([key, len(knowledgebase['masculin-coded'][key])])
            woman_abusive_terms.extend(find_abusive_terms(knowledgebase['predication'][key]))
            woman_actor_abusive_terms.append([key, find_abusive_terms(knowledgebase['predication'][key])])
        elif value == 'man':
            number_man += 1
            man_actor_mentions.append([key, count_mentions_actor(knowledgebase, key)])
            man_sentiment += sum(knowledgebase['sentiment'][key])
            man_actor_sentiment.append([key, sum(knowledgebase['sentiment'][key]) / len(knowledgebase['sentiment'][key])])
            man_feminin_coded += len(knowledgebase['feminin-coded'][key])
            man_actor_feminin_coded.append([key, len(knowledgebase['feminin-coded'][key])])
            man_masculin_coded += len(knowledgebase['masculin-coded'][key])
            man_actor_masculin_coded.append([key, len(knowledgebase['masculin-coded'][key])])
            man_abusive_terms.extend(find_abusive_terms(knowledgebase['predication'][key]))
            man_actor_abusive_terms.append([key, find_abusive_terms(knowledgebase['predication'][key])])
        elif value == 'nonBinary':
            number_nonBinary += 1
            nonBinary_actor_mentions.append([key, count_mentions_actor(knowledgebase, key)])
            nonBinary_sentiment += sum(knowledgebase['sentiment'][key])
            nonBinary_actor_sentiment.append([key, sum(knowledgebase['sentiment'][key]) / len(knowledgebase['sentiment'][key])])
            nonBinary_feminin_coded += len(knowledgebase['feminin-coded'][key])
            nonBinary_actor_feminin_coded.append([key, len(knowledgebase['feminin-coded'][key])])
            nonBinary_masculin_coded += len(knowledgebase['masculin-coded'][key])
            nonBinary_actor_masculin_coded.append([key, len(knowledgebase['masculin-coded'][key])])
            nonBinary_abusive_terms.extend(find_abusive_terms(knowledgebase['predication'][key]))
            nonBinary_actor_abusive_terms.append([key, find_abusive_terms(knowledgebase['predication'][key])])
        else:
            number_unknowns += 1
            unknown_actor_mentions.append([key, count_mentions_actor(knowledgebase, key)])
            unknown_sentiment += sum(knowledgebase['sentiment'][key])
            unknown_actor_sentiment.append([key, sum(knowledgebase['sentiment'][key]) / len(knowledgebase['sentiment'][key])])
            unknown_feminin_coded += len(knowledgebase['feminin-coded'][key])
            unknown_actor_feminin_coded.append([key, len(knowledgebase['feminin-coded'][key])])
            unknown_masculin_coded += len(knowledgebase['masculin-coded'][key])
            unknown_actor_masculin_coded.append([key, len(knowledgebase['masculin-coded'][key])])
            unknown_abusive_terms.extend(find_abusive_terms(knowledgebase['predication'][key]))
            unknown_actor_abusive_terms.append([key, find_abusive_terms(knowledgebase['predication'][key])])
        visualisation_frame_actor.at[key, 'mention_count'] = count_mentions_actor(knowledgebase, key)
        visualisation_frame_actor.at[key, 'sentiment'] = sum(knowledgebase['sentiment'][key]) / len(knowledgebase['sentiment'][key])
        visualisation_frame_actor.at[key, 'feminin-coded_word_count'] = len(knowledgebase['feminin-coded'][key])
        visualisation_frame_actor.at[key, 'masculin-coded_word_count'] = len(knowledgebase['masculin-coded'][key])

    report['overall_woman'] = number_woman
    report['overall_man'] = number_man
    report['overall_nonBinary'] = number_nonBinary
    report['overall_unknown'] = number_unknowns

    report['overall_mentions_woman'] = count_mentions_gender(knowledgebase, 'woman')
    report['overall_mentions_man'] = count_mentions_gender(knowledgebase, 'man')
    report['overall_mentions_nonBinary'] = count_mentions_gender(knowledgebase, 'nonBinary')
    report['overall_mentions_unknown'] = count_mentions_gender(knowledgebase, 'unknown')

    report['overall_feminin_coded_woman'] = woman_feminin_coded
    report['overall_feminin_coded_man'] = man_feminin_coded
    report['overall_feminin_coded_nonBinary'] = nonBinary_feminin_coded
    report['overall_feminin_coded_unknown'] = unknown_feminin_coded

    report['overall_masculin_coded_woman'] = woman_masculin_coded
    report['overall_masculin_coded_man'] = man_masculin_coded
    report['overall_masculin_coded_nonBinary'] = nonBinary_masculin_coded
    report['overall_masculin_coded_unknown'] = unknown_masculin_coded

    report['overall_abusive_terms_woman'] = woman_abusive_terms
    report['overall_abusive_terms_man'] = man_abusive_terms
    report['overall_abusive_terms_nonBinary'] = nonBinary_abusive_terms
    report['overall_abusive_terms_unknown'] = unknown_abusive_terms

    if number_woman == 0:
        report['overall_sentiment_woman'] = 'not mentioned'
    else:
        report['overall_sentiment_woman'] = woman_sentiment / count_mentions_gender(knowledgebase, 'woman')
        frame_gender.append('woman')
        frame_actor_count.append(report['overall_woman'])
        frame_mention_count.append(report['overall_mentions_woman'])
        frame_sentiment.append(report['overall_sentiment_woman'])
        frame_feminin_coded.append(report['overall_feminin_coded_woman'])
        frame_masculin_coded.append(report['overall_masculin_coded_woman'])

    if number_man == 0:
        report['overall_sentiment_man'] = 'not mentioned'
    else:
        report['overall_sentiment_man'] = man_sentiment / count_mentions_gender(knowledgebase, 'man')
        frame_gender.append('man')
        frame_actor_count.append(report['overall_man'])
        frame_mention_count.append(report['overall_mentions_man'])
        frame_sentiment.append(report['overall_sentiment_man'])
        frame_feminin_coded.append(report['overall_feminin_coded_man'])
        frame_masculin_coded.append(report['overall_masculin_coded_man'])

    if number_nonBinary == 0:
        report['overall_sentiment_nonBinary'] = 'not mentioned'
    else:
        report['overall_sentiment_nonBinary'] = nonBinary_sentiment / count_mentions_gender(knowledgebase, 'nonBinary')
        frame_gender.append('nonBinary')
        frame_actor_count.append(report['overall_nonBinary'])
        frame_mention_count.append(report['overall_mentions_nonBinary'])
        frame_sentiment.append(report['overall_sentiment_nonBinary'])
        frame_feminin_coded.append(report['overall_feminin_coded_nonBinary'])
        frame_masculin_coded.append(report['overall_masculin_coded_nonBinary'])

    if number_unknowns == 0:
        report['overall_sentiment_unknown'] = 'not mentioned'
    else:
        report['overall_sentiment_unknown'] = unknown_sentiment / count_mentions_gender(knowledgebase, 'unknown')
        frame_gender.append('unknown')
        frame_actor_count.append(report['overall_unknown'])
        frame_mention_count.append(report['overall_mentions_unknown'])
        frame_sentiment.append(report['overall_sentiment_unknown'])
        frame_feminin_coded.append(report['overall_feminin_coded_unknown'])
        frame_masculin_coded.append(report['overall_masculin_coded_unknown'])

    visualisation_frame_overall['gender'] = frame_gender
    visualisation_frame_overall['actor_count'] = frame_actor_count
    visualisation_frame_overall['mention_count'] = frame_mention_count
    visualisation_frame_overall['sentiment'] = frame_sentiment
    visualisation_frame_overall['feminin-coded_word_count'] = frame_feminin_coded
    visualisation_frame_overall['masculin-coded_word_count'] = frame_masculin_coded

    report['actor_mentions_woman'] = woman_actor_mentions
    report['actor_mentions_man'] = man_actor_mentions
    report['actor_mentions_nonBinary'] = nonBinary_actor_mentions
    report['actor_mentions_unknown'] = unknown_actor_mentions

    report['actor_sentiment_woman'] =  nonBinary_actor_sentiment
    report['actor_sentiment_man'] =  man_actor_sentiment
    report['actor_sentiment_nonBinary'] =  nonBinary_actor_sentiment
    report['actor_sentiment_unknown'] =  unknown_actor_sentiment

    report['actor_feminin_coded_woman'] = woman_actor_feminin_coded
    report['actor_feminin_coded_man'] = man_actor_feminin_coded
    report['actor_feminin_coded_nonBInary'] = nonBinary_actor_feminin_coded
    report['actor_feminin_coded_unknown'] = unknown_actor_feminin_coded

    report['actor_masculin_coded_woman'] = woman_actor_masculin_coded
    report['actor_masculin_coded_man'] = man_actor_masculin_coded
    report['actor_masculin_coded_nonBinary'] = nonBinary_actor_masculin_coded
    report['actor_masculin_coded_unknown'] = unknown_actor_masculin_coded

    report['actor_abusive_terms_woman'] = woman_abusive_terms
    report['actor_abusive_terms_ma'] = man_actor_abusive_terms
    report['actor_abusive_terms_nonBInary'] = nonBinary_actor_abusive_terms
    report['actor_abusive_terms_unknown'] = unknown_actor_abusive_terms

    return [report, visualisation_frame_overall, visualisation_frame_actor, knowledgebase]

## Compile Report

In [12]:
results = compile_report(hillary)
report = results[0]
overall = results[1]
actors = results[2]
knowledgebase = results[3]
overall_max =overall['mention_count'].max()
actors_max = actors['mention_count'].max()

  visualisation_frame_actor.at[key, 'sentiment'] = sum(knowledgebase['sentiment'][key]) / len(knowledgebase['sentiment'][key])


In [10]:
report

{'overall_woman': 3,
 'overall_man': 2,
 'overall_nonBinary': 0,
 'overall_unknown': 5,
 'overall_mentions_woman': 29,
 'overall_mentions_man': 34,
 'overall_mentions_nonBinary': 0,
 'overall_mentions_unknown': 10,
 'overall_feminin_coded_woman': 4,
 'overall_feminin_coded_man': 0,
 'overall_feminin_coded_nonBinary': 0,
 'overall_feminin_coded_unknown': 2,
 'overall_masculin_coded_woman': 0,
 'overall_masculin_coded_man': 0,
 'overall_masculin_coded_nonBinary': 0,
 'overall_masculin_coded_unknown': 0,
 'overall_abusive_terms_woman': [],
 'overall_abusive_terms_man': [],
 'overall_abusive_terms_nonBinary': [],
 'overall_abusive_terms_unknown': [],
 'overall_sentiment_woman': 0.04436781609195402,
 'overall_sentiment_man': 0.013333333333333334,
 'overall_sentiment_nonBinary': 'not mentioned',
 'overall_sentiment_unknown': 0.05700000000000001,
 'actor_mentions_woman': [['Christiane Amanpour', 6],
  ['Robbie Vorhaus', 3],
  ['Hillary Clinton', 20]],
 'actor_mentions_man': [['Matthew Dempsey

In [13]:
overall.head()

Unnamed: 0,gender,actor_count,mention_count,sentiment,feminin-coded_word_count,masculin-coded_word_count
0,woman,3,29,0.044368,4,0
1,man,2,34,0.013333,0,0
2,unknown,5,10,0.057,2,0


In [14]:
actors.head()

Unnamed: 0,gender,mention_count,sentiment,feminin-coded_word_count,masculin-coded_word_count
Matthew Dempsey,man,7,0.221667,0,0
Christiane Amanpour,woman,6,0.027778,0,0
Donald Trump,man,27,-0.017639,0,0
Robbie Vorhaus,woman,3,-0.1,0,0
Marsha Blackburn,unknown,2,0.2,0,0


In [15]:
knowledgebase.head()

Unnamed: 0,nomination,pronoun,gender,predication,sentiment,feminin-coded,masculin-coded
Matthew Dempsey,"[scientist, Matthew Dempsey, scientist, Matthe...","[he, he]",man,"[("", Academics, have, long, studied, whether, ...","[-0.01, 0.17500000000000002, 0.5]",[],[]
Christiane Amanpour,"[anchor, Christiane Amanpour, anchor, Christia...","[she, her]",woman,"[("", \n\n, Clinton, made, the, remarks, in, a,...","[0.16666666666666666, 0.0, -0.08333333333333333]",[],[]
Donald Trump,"[Donald Trump, Donald Trump, Donald Trump, Tru...","[He, himself, He, himself, him, He, himself, H...",man,"[("", \n\n, Clinton, made, the, remarks, in, a,...","[0.16666666666666666, -0.16666666666666666, -0...",[],[]
Robbie Vorhaus,"[expert, Robbie Vorhaus]",[she],woman,"[("", How, did, calling, Trump, followers, ', d...","[-0.3, 0.10000000000000002]",[],[]
Marsha Blackburn,"[Sen., Marsha Blackburn]",[],unknown,"[(Sen., Marsha Blackburn, ,, R-Tenn., ,, poste...",[0.2],[],[]


## Report Visualisation

In [None]:
# sentiment overall
plt.ylim(-1, 1)
sns.barplot(x='gender', y='sentiment', data=overall)
# plt.savefig('figures/overall_sentiment.png')

In [None]:
# overall count actors and mentions
fig, axs = plt.subplots(ncols=2, layout='constrained')
axs[0].set(ylim=(0, overall_max))
axs[1].set(ylim=(0, overall_max))
sns.barplot(x='gender', y='actor_count', data=overall, ax=axs[0])
sns.barplot(x='gender', y='mention_count', data=overall, ax=axs[1])
# plt.savefig('figures/overall_actor_mention.png')

In [None]:
# overall feminin-coded words and masculin-coded words
fig, axs = plt.subplots(ncols=2, layout='constrained')
axs[0].set(ylim=(0, overall_max))
axs[1].set(ylim=(0, overall_max))
sns.barplot(x='gender', y='feminin-coded_word_count', data=overall, ax=axs[0])
sns.barplot(x='gender', y='masculin-coded_word_count', data=overall, ax=axs[1])
# plt.savefig('figures/overall_gender_coded_words.png')

In [None]:
# overall wordclouds
woman_token = []
man_token = []
nonBinary_token = []
unknown_token = []

woman_text = ''
man_text = ''
nonBinary_text = ''
unknown_text = ''

for key, value in knowledgebase['gender'].items():
    if value == 'woman':
        woman_token.extend(knowledgebase['predication'][key])
    elif value == 'man':
        man_token.extend(knowledgebase['predication'][key])
    elif value == 'nonBinary':
        nonBinary_token.extend(knowledgebase['predication'][key])
    else:
        unknown_token.extend(knowledgebase['predication'][key])
for sentence in woman_token:
    woman_text += sentence.text
for sentence in man_token:
    man_text += sentence.text
for sentence in nonBinary_token:
    nonBinary_text += sentence.text
for sentence in unknown_token:
    unknown_text += sentence.text

In [None]:
if len(woman_text) > 0:
    wordcloud = WordCloud(background_color="white").generate(woman_text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # wordcloud.to_file('/figures/woman.png')

In [None]:
if len(man_text) > 0:
    wordcloud = WordCloud(background_color="white").generate(man_text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # wordcloud.to_file('figures/man.png')

In [None]:
if len(nonBinary_text) > 0:
    wordcloud = WordCloud(background_color="white").generate(nonBinary_text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # wordcloud.to_file('figures/nonBinary.png')

In [None]:
plt.xticks(rotation=90)
plt.ylim(-1, 1)
sns.barplot(x=actors.index, y='sentiment', data=actors, hue='gender')
# plt.savefig('figures/single_sentiment.png', bbox_inches='tight')

In [None]:
fig, axs = plt.subplots(ncols=2, layout='constrained')
axs[0].tick_params(axis='x', rotation=90)
axs[1].tick_params(axis='x', rotation=90)
axs[0].set(ylim=(0, actors_max))
axs[1].set(ylim=(0, actors_max))
sns.barplot(x=actors.index, y='female-coded_word_count',hue='gender' , data=actors, ax=axs[0])
sns.barplot(x=actors.index, y='male-coded_word_count', hue='gender', data=actors, ax=axs[1])
# plt.savefig('figures/singel_gender_coded_words.png')

In [None]:
# actor wordclouds

for key, value in knowledgebase['predication'].items():
    text = ''
    for sentence in value:
        text += sentence.text
    print(key)
    wordcloud = WordCloud(background_color="white").generate(text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # wordcloud.to_file('figures/' + key + '.png')