# Nomination and Predication

Detecting how actors are named in a text and what is attributed to them.  
Detect discrimination by analysing the nomination and predication.

## Organisational Part

In [1]:
# notebook imports

# general machine learning imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# spacy packages
import spacy
from spacy import displacy
from spacy import tokenizer
import coreferee
from spacytextblob.spacytextblob import SpacyTextBlob

In [None]:
# initialise spacy
# load the large English pipeline; it is the base for every component added below
nlp = spacy.load('en_core_web_lg')
# merge every named entity into a single token, so a multi-word name is one token
nlp.add_pipe('merge_entities')
# coreferee supplies `doc._.coref_chains`, used in get_pronouns_gender to resolve pronouns
nlp.add_pipe('coreferee')
# spacytextblob supplies the `._.polarity` score read in get_sentiment
nlp.add_pipe('spacytextblob')

In [None]:
# load text data
def _read_text(path):
    """Return the complete content of the text file at ``path``."""
    with open(path, 'r') as f:
        return f.read()


# read all raw texts first, so a missing file fails before any parsing starts
text_one = _read_text('Texte/Test_Text_I.txt')
text_two = _read_text('Texte/Test_Text_II.txt')
text_three = _read_text('Texte/Test_Text_III.txt')
hillary = _read_text('Texte/Hillary_Clinton.txt')
bill = _read_text('Texte/Bill_Clinton.txt')

# convert to SpaCy format (each name is rebound from the raw string to the parsed document)
text_one = nlp(text_one)
text_two = nlp(text_two)
text_three = nlp(text_three)
hillary = nlp(hillary)
bill = nlp(bill)

## Nomination

spaCy automatically performs tokenisation, dependency parsing, part-of-speech tagging and named entity recognition.  
Therefore, we only need to identify the entities that represent actors in the text and co-reference all passages that mention these actors.

In [4]:
# nomination functions

# general variables
# Word lists used to find actors that are not named entities and to flag
# derogatory language. Entries are compared against single token texts, so an
# entry must not contain leading or trailing whitespace.
generic_female = ['woman', 'women', 'mother', 'mom', 'grandmother', 'grandma', 'lady', 'ladies']
generic_male = ['man', 'men', 'father', 'dad', 'grandfather', 'granddad', 'gentleman', 'gentlemen']
# https://en.wikipedia.org/wiki/Category:Pejorative_terms_for_women
# https://en.wikipedia.org/wiki/Category:Pejorative_terms_for_men
# https://genderkit.org.uk/slurs/
# https://en.wiktionary.org/wiki/Category:English_swear_words
abusive_female = ['battle-axe', 'bimbo', 'bitch', 'boseulachi', 'cougar',
                     'crone', 'cunt', 'dyke', 'feminazi', 'floozie', 'hag',
                     'harpy', 'moll', 'nakusha', 'shiksa', 'shrew', 'skintern',
                     'slut', 'spinster', 'trollop', 'twat', 'virago', 'WAG',
                     'whore', 'skank']
abusive_male = ['cock', 'cuckold', 'dick', 'fop', 'fag', 'himbo','lothario',
                   'mamil', 'manlet', 'motherfucker','neckbeard', 'prick',
                   'incel', 'pansy', 'wanker', 'bastard', 'pussy', 'bugger', 'dickhead']
abusive_trans = ['tranny', 'hermaphrodite', 'shemale', 'heshe',
                    'chick-with-a-dick', 'transgenderist']
abusive_general = ['arse', 'arsehead', 'arsehole', 'ass', 'asshole', 'cocksucker',
                      'fatherfucker', 'fucker', 'motherfucker', 'pigfucker',
                      'sisterfucker', 'brotherfucker', 'turd']

# collects the tokens that name a person, grouped by the person's entity text
def get_actors(text):
    """Group the tokens of ``text`` that belong to a PERSON entity.

    A token is filed under the text of its head when it is a ``compound``
    modifier of a PERSON token, and under its own text when it is itself a
    PERSON token that contains no apostrophe.

    Returns a dict that maps the entity text to the list of matching tokens,
    in document order.
    """
    actor_dict = {}
    for token in text:
        if token.dep_ == 'compound' and token.head.ent_type_ == 'PERSON':
            name = token.head.text
        elif token.ent_type_ == 'PERSON' and "'" not in token.text:
            name = token.text
        else:
            # neither a person nor a modifier of one
            continue
        actor_dict.setdefault(name, []).append(token)
    return actor_dict



def _is_name_part(part_words, full_words):
    """Return True if ``part_words`` occurs as a contiguous run inside ``full_words``."""
    span = len(part_words)
    return any(
        full_words[start:start + span] == part_words
        for start in range(len(full_words) - span + 1)
    )


# scan actor list if key is part of another actor
def combine_names(actor_dict):
    """Merge partial names into the longer names that contain them.

    A name is merged into another one when all of its whitespace-separated
    words occur, in order and as whole words, inside the longer name
    ("Clinton" into "Hillary Clinton"). Matching whole words avoids merging
    unrelated names that only share characters ("Don" and "Donald Trump").

    The mentions of a merged name are appended to every longer name that
    contains it, and the merged name is removed. ``actor_dict`` is modified
    in place and returned.
    """
    # Snapshot the mentions before merging: otherwise a chain such as
    # "Bill" -> "Bill Clinton" -> "Bill Clinton Jr" copies the mentions of
    # "Bill" into the longest name twice, depending on iteration order.
    original_mentions = {name: list(tokens) for name, tokens in actor_dict.items()}
    name_words = {name: name.split() for name in actor_dict}

    flagged_keys = set()
    for key, key_words in name_words.items():
        if not key_words:
            continue
        for second_key, second_words in name_words.items():
            # only a strictly shorter name can be part of another one; this
            # also rules out two names swallowing each other
            if len(key_words) >= len(second_words):
                continue
            if _is_name_part(key_words, second_words):
                actor_dict[second_key].extend(original_mentions[key])
                flagged_keys.add(key)

    for key in flagged_keys:
        del actor_dict[key]
    return actor_dict


def get_generic_names(text, actor_dict):
    """Add generic and derogatory person nouns found in ``text`` to ``actor_dict``.

    Every token whose lower-cased text is in one of the generic or abusive
    word lists (all except ``abusive_general``) is appended to the entry
    stored under that lower-cased text. ``actor_dict`` is modified in place
    and returned.
    """
    generic_names = set(generic_female + generic_male + abusive_female
                        + abusive_male + abusive_trans)
    for token in text:
        name = token.text.lower()
        if name in generic_names:
            # Look up and store under the same lower-cased key. Testing the
            # original spelling made a capitalised "Woman" overwrite the
            # mentions already collected under "woman".
            actor_dict.setdefault(name, []).append(token)
    return actor_dict


def _resolve_gender(pronoun_texts, female, male, neutral, majority_percent=0.7):
    """Return 'female', 'male' or 'neutral' for a list of pronouns, or None if undecided.

    With five or more pronouns a gender is assigned when at least
    ``majority_percent`` of them belong to it; with fewer pronouns all of them
    must belong to the same gender. Purely neutral pronouns always give
    'neutral'.
    """
    if not pronoun_texts:
        return None
    gender = None
    distinct = set(pronoun_texts)
    total = len(pronoun_texts)
    if total > 4:
        if sum(pronoun in female for pronoun in pronoun_texts) / total >= majority_percent:
            gender = 'female'
        if sum(pronoun in male for pronoun in pronoun_texts) / total >= majority_percent:
            gender = 'male'
    else:
        if distinct <= female:
            gender = 'female'
        if distinct <= male:
            gender = 'male'
    if distinct <= neutral:
        gender = 'neutral'
    return gender


# given a text, extract the pronouns and gender of each actor
def get_pronouns_gender(knowledgebase, text):
    """Attach the resolved pronouns and an inferred gender to every actor.

    Parameters
    ----------
    knowledgebase : pandas.DataFrame
        One row per actor with a ``nomination`` column holding the list of
        tokens that name the actor.
    text :
        The parsed document; it must provide ``text._.coref_chains``.

    Returns
    -------
    pandas.DataFrame
        The same frame with two added columns: ``pronoun`` (list of pronoun
        tokens resolved to the actor) and ``gender`` ('female', 'male',
        'neutral' or 'unknown').
    """
    female = {'she', 'her', 'hers', 'herself'}
    male = {'he', 'his', 'him', 'himself'}
    # https://uwm.edu/lgbtrc/support/gender-pronouns/
    neutral = {'they', 'them', 'their', 'theirs', 'themself', 'themselves',
               'zie', 'zim', 'zir', 'zis', 'zieself',
               'sie', 'hir', 'hirs', 'hirself',
               'ey', 'em', 'eir', 'eirs', 'eirself',
               've', 'ver', 'vis', 'vers', 'verself',
               'tey', 'ter', 'tem', 'ters', 'terself',
               'e', 'emself',
               'fae', 'faer', 'faers', 'faerself',
               'ae', 'aer', 'aers', 'aerself',
               'per', 'pers', 'perself',
               'xe', 'xem', 'xyr', 'xyrs', 'xemself',
               'ze'}
    knowledgebase['pronoun'] = [[] for _ in range(len(knowledgebase.index))]
    knowledgebase['gender'] = 'unknown'

    # iterate over all pronouns in text
    for token in text:
        if token.pos_ != 'PRON':
            continue
        # only use pronouns that resolve to exactly one referent
        resolved_actor = text._.coref_chains.resolve(token)
        if resolved_actor is None or len(resolved_actor) != 1:
            continue
        for index, mentions in knowledgebase['nomination'].items():
            if resolved_actor[0] in mentions:
                knowledgebase.at[index, 'pronoun'].append(token)
                break

    # resolve pronouns to one gender
    # `.at` writes into the frame itself; chained `frame[col][row] = value`
    # writes into a temporary copy when pandas copy-on-write is active
    for index, pronouns in knowledgebase['pronoun'].items():
        pronoun_texts = [pronoun.text.lower() for pronoun in pronouns]
        gender = _resolve_gender(pronoun_texts, female, male, neutral)
        if gender is not None:
            knowledgebase.at[index, 'gender'] = gender

    # generic names carry their gender in the word itself; this overrides the pronouns
    for index in knowledgebase.index:
        if index in generic_female or index in abusive_female:
            knowledgebase.at[index, 'gender'] = 'female'
        elif index in generic_male or index in abusive_male:
            knowledgebase.at[index, 'gender'] = 'male'
    return knowledgebase

# build the knowledge base as a pandas data frame
def build_knowledgebase_nomination(text):
    """Create the knowledge base with one row per actor found in ``text``.

    The returned frame is indexed by actor name and has the columns
    ``nomination``, ``pronoun`` and ``gender``.
    """
    # named persons first (partial names merged), then generic person nouns
    named_actors = combine_names(get_actors(text))
    all_actors = get_generic_names(text, named_actors)
    # one row per actor; the unnamed series column becomes 'nomination'
    frame = pd.Series(all_actors).to_frame()
    frame.rename(columns={0: 'nomination'}, inplace=True)
    # add pronouns and gender of the actors
    return get_pronouns_gender(frame, text)

## Predication

Extract all sentences that contain an actor. If a sentence contains more than one actor, assign the corresponding sentence parts to each actor.

In [5]:
# predication functions

# get sentences that contain an actor
def get_sentences(actor_nomination, actor_predication, text):
    """Return the distinct sentences in which an actor is mentioned.

    ``actor_nomination`` and ``actor_predication`` are the tokens that name or
    refer to the actor; ``text`` is the document they were taken from.
    Sentences are returned in order of first occurrence (names first, then
    pronouns), and sentences with the same stripped text are returned once.
    """
    seen_texts = set()
    sentences = []
    for mentions in (actor_nomination, actor_predication):
        for mention in mentions:
            # a one-token span gives access to the enclosing sentence
            sentence = text[mention.i:mention.i + 1].sent
            sentence_text = sentence.text.strip()
            if sentence_text in seen_texts:
                continue
            seen_texts.add(sentence_text)
            sentences.append(sentence)
    return sentences

def get_predication(knowledgebase, text):
    """Add a ``predication`` column with the sentences that mention each actor.

    For every row the sentences are collected with ``get_sentences`` from the
    row's ``nomination`` and ``pronoun`` tokens. The frame is modified in
    place and returned.
    """
    # Build the whole column and assign it once. Writing cell by cell with
    # `knowledgebase['predication'][key] = ...` is chained assignment, which
    # does not reach the frame when pandas copy-on-write is active.
    knowledgebase['predication'] = [
        get_sentences(nomination, pronouns, text)
        for nomination, pronouns in zip(knowledgebase['nomination'], knowledgebase['pronoun'])
    ]
    return knowledgebase

## Discrimination Detection
Compute the following measures:  
- Sentiment and subjectivity per actor  
- Female/male coded words in predication
- How often are actors mentioned (per gender)
- How often are actors mentioned in a positive/negative way (per gender)
- How often is derogatory language used per gender

In [6]:
# discrimination detection functions

# polarity per text: -1 is very negative, 0 is neutral and 1 is very positive
def get_sentiment(text_list):
    """Return the ``._.polarity`` value of every element of ``text_list``, in order."""
    return [text._.polarity for text in text_list]

def add_sentiment(knowledgebase):
    """Add a ``sentiment`` column with one polarity score per predication sentence.

    The frame is modified in place and returned.
    """
    row_count = len(knowledgebase.index)
    knowledgebase['sentiment'] = [[] for _ in range(row_count)]
    # knowledgebase['subjectivity'] = [list() for x in range(len(knowledgebase.index))]
    for key, sentences in knowledgebase['predication'].items():
        scores = get_sentiment(sentences)
        # the cell holds a list, so it is filled in place
        knowledgebase['sentiment'][key].extend(scores)
        # knowledgebase['subjectivity'][key].extend(get_subjectivity(value))
    return knowledgebase

# detecting gender coded words in text
def get_gender_words(text, gender):
    """Return the gender-coded word stems that occur in ``text``.

    Parameters
    ----------
    text :
        Any object with a ``text`` attribute holding the string to search.
    gender : str
        ``'female'`` or ``'male'`` selects the stem list; any other value
        yields an empty list.

    Returns
    -------
    list of str
        The stems of the selected list contained in the lower-cased text, in
        list order. Each stem is reported at most once per call.
    """
    female_coded = ['agree', 'affectionate', 'child', 'cheer', 'collab',
                    'commit', 'communal', 'compassion', 'connect', 'considerate',
                    'cooperat', 'co-operat', 'depend', 'emotiona', 'empath',
                    'feel', 'flatterable', 'gentle', 'honest', 'interpersonal',
                    'interdependen', 'interpersona', 'inter-personal',
                    'inter-dependen', 'inter-persona', 'kind', 'kinship', 'tender',
                    'together', 'trust', 'understand', 'warm', 'whin', 'enthusias',
                    'inclusive', 'yield', 'share', 'sharin']
    male_coded = ['active', 'adventurous', 'aggress', 'ambitio', 'analy', 'assert',
                  'athlet', 'autonom', 'battle', 'boast', 'challeng', 'champion',
                  'compet', 'confident', 'courag', 'decid', 'decision', 'decisive',
                  'persist', 'principle', 'reckless', 'self-confiden',
                  'self-relian', 'self-sufficien', 'selfconfiden', 'selfrelian',
                  'selfsufficien', 'stubborn', 'superior', 'unreasonab']
    stems = {'female': female_coded, 'male': male_coded}.get(gender)
    if stems is None:
        return []
    # Lower-case for both genders: the stems are lower-case, and the female
    # list used to be matched case-sensitively, missing e.g. "Kind".
    searched_text = text.text.lower()
    return [stem for stem in stems if stem in searched_text]

def add_gender_words(knowledgebase):
    """Add the columns ``female-coded`` and ``male-coded`` to the knowledge base.

    Each cell lists the coded word stems found in the actor's predication
    sentences, sentence by sentence. The frame is modified in place and
    returned.
    """
    row_count = len(knowledgebase.index)
    knowledgebase['female-coded'] = [[] for _ in range(row_count)]
    knowledgebase['male-coded'] = [[] for _ in range(row_count)]
    for key, sentences in knowledgebase['predication'].items():
        # the cells hold lists, so they are filled in place
        female_hits = knowledgebase['female-coded'][key]
        male_hits = knowledgebase['male-coded'][key]
        for sentence in sentences:
            female_hits.extend(get_gender_words(sentence, 'female'))
            male_hits.extend(get_gender_words(sentence, 'male'))
    return knowledgebase

def count_mentions_actor(knowledgebase, key):
    """Return how often the actor ``key`` is mentioned, by name or by pronoun."""
    names = knowledgebase['nomination'][key]
    pronouns = knowledgebase['pronoun'][key]
    return len(names) + len(pronouns)

def count_mentions_gender(knowledgebase, gender):
    """Return the total number of mentions of all actors whose gender is ``gender``."""
    return sum(
        count_mentions_actor(knowledgebase, key)
        for key in knowledgebase.index
        if knowledgebase['gender'][key] == gender
    )

def find_abusive_terms(predication):
    """Return every token in the given sentences that is a derogatory term.

    ``predication`` is an iterable of sentences, each an iterable of tokens
    with a ``text`` attribute. A token matches when its text, or its
    lower-cased text, is in one of the abusive word lists. Upper-case entries
    such as 'WAG' therefore still require the exact spelling.
    """
    # strip the entries so stray whitespace in a list cannot prevent a match
    derogatory_terms = {
        term.strip()
        for term in abusive_female + abusive_male + abusive_trans + abusive_general
    }
    abusive_terms = []
    for sentence in predication:
        for token in sentence:
            # the lower-cased comparison also catches capitalised words,
            # e.g. at the start of a sentence
            if token.text in derogatory_terms or token.text.lower() in derogatory_terms:
                abusive_terms.append(token)
    return abusive_terms

# compile report about a text
def compile_report(text):
    """Build the knowledge base for ``text`` and summarise it per gender and per actor.

    Parameters
    ----------
    text :
        The parsed document to analyse.

    Returns
    -------
    list
        ``[report, visualisation_frame_overall, visualisation_frame_actor, knowledgebase]``

        * ``report``: dict with, for each of 'female', 'male', 'neutral' and
          'unknown', the overall figures (``overall_*``) and the per-actor
          figures (``actor_*``, lists of ``[actor, value]`` pairs).
        * ``visualisation_frame_overall``: one row per gender that has at
          least one actor.
        * ``visualisation_frame_actor``: one row per actor.
        * ``knowledgebase``: the full knowledge base frame.
    """
    genders = ('female', 'male', 'neutral', 'unknown')

    # build knowledgebase
    knowledgebase = build_knowledgebase_nomination(text)
    knowledgebase = get_predication(knowledgebase, text)
    knowledgebase = add_sentiment(knowledgebase)
    knowledgebase = add_gender_words(knowledgebase)

    visualisation_frame_actor = pd.DataFrame()
    visualisation_frame_actor['gender'] = knowledgebase.loc[:, 'gender']
    visualisation_frame_actor['mention_count'] = 0
    # float from the start: the column receives mean polarities, and writing
    # floats into an integer column is deprecated in pandas
    visualisation_frame_actor['sentiment'] = 0.0
    visualisation_frame_actor['female-coded_word_count'] = 0
    visualisation_frame_actor['male-coded_word_count'] = 0

    # running totals per gender
    overall = {
        gender: {'actors': 0, 'sentiment': 0, 'female_coded': 0,
                 'male_coded': 0, 'abusive_terms': []}
        for gender in genders
    }
    # [actor, value] pairs per gender
    per_actor = {
        gender: {'mentions': [], 'sentiment': [], 'female_coded': [],
                 'male_coded': [], 'abusive_terms': []}
        for gender in genders
    }

    for key, value in knowledgebase['gender'].items():
        # anything that is not a known gender is counted as 'unknown'
        gender = value if value in genders else 'unknown'

        mention_count = count_mentions_actor(knowledgebase, key)
        scores = knowledgebase.at[key, 'sentiment']
        sentiment_sum = sum(scores)
        # an actor without any scored sentence counts as neutral
        mean_sentiment = sentiment_sum / len(scores) if scores else 0.0
        female_coded_count = len(knowledgebase.at[key, 'female-coded'])
        male_coded_count = len(knowledgebase.at[key, 'male-coded'])
        abusive_terms = find_abusive_terms(knowledgebase.at[key, 'predication'])

        totals = overall[gender]
        totals['actors'] += 1
        totals['sentiment'] += sentiment_sum
        totals['female_coded'] += female_coded_count
        totals['male_coded'] += male_coded_count
        totals['abusive_terms'].extend(abusive_terms)

        actor_stats = per_actor[gender]
        actor_stats['mentions'].append([key, mention_count])
        actor_stats['sentiment'].append([key, mean_sentiment])
        actor_stats['female_coded'].append([key, female_coded_count])
        actor_stats['male_coded'].append([key, male_coded_count])
        actor_stats['abusive_terms'].append([key, abusive_terms])

        visualisation_frame_actor.at[key, 'mention_count'] = mention_count
        visualisation_frame_actor.at[key, 'sentiment'] = mean_sentiment
        visualisation_frame_actor.at[key, 'female-coded_word_count'] = female_coded_count
        visualisation_frame_actor.at[key, 'male-coded_word_count'] = male_coded_count

    # the report keys are inserted group by group so the printed order is stable
    report = {}
    for gender in genders:
        report[f'overall_{gender}'] = overall[gender]['actors']
    for gender in genders:
        report[f'overall_mentions_{gender}'] = count_mentions_gender(knowledgebase, gender)
    for report_key, statistic in (('overall_female_coded', 'female_coded'),
                                  ('overall_male_coded', 'male_coded'),
                                  ('overall_abusive_terms', 'abusive_terms')):
        for gender in genders:
            report[f'{report_key}_{gender}'] = overall[gender][statistic]

    overall_columns = {'gender': [], 'actor_count': [], 'mention_count': [],
                       'sentiment': [], 'female-coded_word_count': [],
                       'male-coded_word_count': []}
    for gender in genders:
        if overall[gender]['actors'] == 0:
            report[f'overall_sentiment_{gender}'] = 'not mentioned'
            continue
        # NOTE(review): the summed sentence polarities are divided by the
        # number of mentions, not by the number of sentences - confirm that
        # this is the intended average.
        report[f'overall_sentiment_{gender}'] = (
            overall[gender]['sentiment'] / report[f'overall_mentions_{gender}']
        )
        overall_columns['gender'].append(gender)
        overall_columns['actor_count'].append(report[f'overall_{gender}'])
        overall_columns['mention_count'].append(report[f'overall_mentions_{gender}'])
        overall_columns['sentiment'].append(report[f'overall_sentiment_{gender}'])
        overall_columns['female-coded_word_count'].append(report[f'overall_female_coded_{gender}'])
        overall_columns['male-coded_word_count'].append(report[f'overall_male_coded_{gender}'])

    visualisation_frame_overall = pd.DataFrame()
    for column, values in overall_columns.items():
        visualisation_frame_overall[column] = values

    for report_key, statistic in (('actor_mentions', 'mentions'),
                                  ('actor_sentiment', 'sentiment'),
                                  ('actor_female_coded', 'female_coded'),
                                  ('actor_male_coded', 'male_coded'),
                                  ('actor_abusive_terms', 'abusive_terms')):
        for gender in genders:
            report[f'{report_key}_{gender}'] = per_actor[gender][statistic]

    return [report, visualisation_frame_overall, visualisation_frame_actor, knowledgebase]

## Compile Report

In [None]:
# analyse the Hillary Clinton article and unpack the four result objects
results = compile_report(hillary)
report, overall, actors, knowledgebase = results
# largest mention counts, used as shared upper axis limits in the plots below
overall_max = overall['mention_count'].max()
actors_max = actors['mention_count'].max()

In [8]:
# display the complete report dictionary
report

{'overall_female': 3,
 'overall_male': 2,
 'overall_neutral': 0,
 'overall_unknown': 5,
 'overall_mentions_female': 29,
 'overall_mentions_male': 34,
 'overall_mentions_neutral': 0,
 'overall_mentions_unknown': 10,
 'overall_female_coded_female': 4,
 'overall_female_coded_male': 0,
 'overall_female_coded_neutral': 0,
 'overall_female_coded_unknown': 2,
 'overall_male_coded_female': 0,
 'overall_male_coded_male': 0,
 'overall_male_coded_neutral': 0,
 'overall_male_coded_unknown': 0,
 'overall_abusive_terms_female': [],
 'overall_abusive_terms_male': [],
 'overall_abusive_terms_neutral': [],
 'overall_abusive_terms_unknown': [],
 'overall_sentiment_female': 0.04436781609195402,
 'overall_sentiment_male': 0.013333333333333334,
 'overall_sentiment_neutral': 'not mentioned',
 'overall_sentiment_unknown': 0.05700000000000001,
 'actor_mentions_female': [['Christiane Amanpour', 6],
  ['Robbie Vorhaus', 3],
  ['Hillary Clinton', 20]],
 'actor_mentions_male': [['Matthew Dempsey', 7], ['Donald Tr

In [9]:
# per-gender summary (one row per gender with at least one actor)
overall.head()

Unnamed: 0,gender,actor_count,mention_count,sentiment,female-coded_word_count,male-coded_word_count
0,female,3,29,0.044368,4,0
1,male,2,34,0.013333,0,0
2,unknown,5,10,0.057,2,0


In [10]:
# per-actor summary (first five actors)
actors.head()

Unnamed: 0,gender,mention_count,sentiment,female-coded_word_count,male-coded_word_count
Matthew Dempsey,male,7,0.221667,0,0
Christiane Amanpour,female,6,0.027778,0,0
Donald Trump,male,27,-0.017639,0,0
Robbie Vorhaus,female,3,-0.1,0,0
Marsha Blackburn,unknown,2,0.2,0,0


In [11]:
# raw knowledge base behind the summaries (first five actors)
knowledgebase.head()

Unnamed: 0,nomination,pronoun,gender,predication,sentiment,female-coded,male-coded
Matthew Dempsey,"[scientist, Matthew Dempsey, scientist, Matthe...","[he, he]",male,"[("", Academics, have, long, studied, whether, ...","[-0.01, 0.17500000000000002, 0.5]",[],[]
Christiane Amanpour,"[anchor, Christiane Amanpour, anchor, Christia...","[she, her]",female,"[("", \n\n, Clinton, made, the, remarks, in, a,...","[0.16666666666666666, 0.0, -0.08333333333333333]",[],[]
Donald Trump,"[Donald Trump, Donald Trump, Donald Trump, Tru...","[He, himself, He, himself, him, He, himself, H...",male,"[("", \n\n, Clinton, made, the, remarks, in, a,...","[0.16666666666666666, -0.16666666666666666, -0...",[],[]
Robbie Vorhaus,"[expert, Robbie Vorhaus]",[she],female,"[("", How, did, calling, Trump, followers, ', d...","[-0.3, 0.10000000000000002]",[],[]
Marsha Blackburn,"[Sen., Marsha Blackburn]",[],unknown,"[(Sen., Marsha Blackburn, ,, R-Tenn., ,, poste...",[0.2],[],[]


## Report Visualisation

In [None]:
# sentiment overall
# fix the y-axis to the full polarity range (-1 to 1) instead of scaling to the data
plt.ylim(-1, 1)
# one bar per gender showing the overall sentiment
sns.barplot(x='gender', y='sentiment', data=overall)
# plt.savefig('figures/overall_sentiment.png')

In [None]:
# overall count actors and mentions
fig, axs = plt.subplots(ncols=2, layout='constrained')
# both panels share the same upper limit: the largest per-gender mention count
axs[0].set(ylim=(0, overall_max))
axs[1].set(ylim=(0, overall_max))
# left: number of actors per gender, right: number of mentions per gender
sns.barplot(x='gender', y='actor_count', data=overall, ax=axs[0])
sns.barplot(x='gender', y='mention_count', data=overall, ax=axs[1])
# plt.savefig('figures/overall_actor_mention.png')

In [None]:
# overall female-coded words and male-coded words
fig, axs = plt.subplots(ncols=2, layout='constrained')
# NOTE(review): the upper limit is the largest mention count, not the largest
# coded-word count - presumably to keep the scale of the count plots; confirm
axs[0].set(ylim=(0, overall_max))
axs[1].set(ylim=(0, overall_max))
# left: female-coded words per gender, right: male-coded words per gender
sns.barplot(x='gender', y='female-coded_word_count', data=overall, ax=axs[0])
sns.barplot(x='gender', y='male-coded_word_count', data=overall, ax=axs[1])
# plt.savefig('figures/overall_gender_coded_words.png')

In [None]:
# overall wordclouds
female_token = []
male_token = []
neutral_token = []
unknown_token = []

# collect the predication sentences of every actor under the actor's gender;
# any gender other than female, male or neutral goes to the unknown group
_sentences_by_gender = {'female': female_token, 'male': male_token, 'neutral': neutral_token}
for key, value in knowledgebase['gender'].items():
    _sentences_by_gender.get(value, unknown_token).extend(knowledgebase['predication'][key])

# Join with a space: a sentence's text carries no trailing whitespace, so plain
# concatenation can fuse the last word of one sentence with the first word of
# the next into a single word in the cloud.
female_text = ' '.join(sentence.text for sentence in female_token)
male_text = ' '.join(sentence.text for sentence in male_token)
neutral_text = ' '.join(sentence.text for sentence in neutral_token)
unknown_text = ' '.join(sentence.text for sentence in unknown_token)

In [None]:
# word cloud of everything said about female actors (skipped without text)
if female_text:
    wordcloud = WordCloud(background_color="white")
    wordcloud.generate(female_text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # wordcloud.to_file('figures/female.png')

In [None]:
# word cloud of everything said about male actors (skipped without text)
if male_text:
    wordcloud = WordCloud(background_color="white")
    wordcloud.generate(male_text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # wordcloud.to_file('figures/male.png')

In [None]:
# word cloud of everything said about neutral actors (skipped without text)
if neutral_text:
    wordcloud = WordCloud(background_color="white")
    wordcloud.generate(neutral_text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # wordcloud.to_file('figures/neutral.png')

In [None]:
# sentiment per actor, coloured by gender
# rotate the actor names so that long names do not overlap
plt.xticks(rotation=90)
# fix the y-axis to the full polarity range (-1 to 1)
plt.ylim(-1, 1)
sns.barplot(x=actors.index, y='sentiment', data=actors, hue='gender')
# plt.savefig('figures/single_sentiment.png', bbox_inches='tight')

In [None]:
# female-coded (left) and male-coded (right) word counts per actor
fig, axs = plt.subplots(ncols=2, layout='constrained')
# rotate the actor names so that long names do not overlap
axs[0].tick_params(axis='x', rotation=90)
axs[1].tick_params(axis='x', rotation=90)
# both panels use the largest per-actor mention count as upper limit
axs[0].set(ylim=(0, actors_max))
axs[1].set(ylim=(0, actors_max))
sns.barplot(x=actors.index, y='female-coded_word_count',hue='gender' , data=actors, ax=axs[0])
sns.barplot(x=actors.index, y='male-coded_word_count', hue='gender', data=actors, ax=axs[1])
# plt.savefig('figures/singel_gender_coded_words.png')

In [None]:
# actor wordclouds

for key, value in knowledgebase['predication'].items():
    # join with a space so that words of adjacent sentences cannot fuse
    text = ' '.join(sentence.text for sentence in value)
    print(key)
    if not text:
        # same guard as the per-gender clouds: without text there is nothing to plot
        continue
    wordcloud = WordCloud(background_color="white").generate(text)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # wordcloud.to_file('figures/' + key + '.png')