# Count of moral words 

Here, I am going to try to compile a moral dictionary and count occurrences of words within it in the fragments of text around the SVOs.

I begin by loading the necessary libraries. 

In [1]:
import spacy 
from spacy.matcher import Matcher
import textacy
import pandas as pd 
import numpy as np 
import re
import string 

# Load the small English spaCy pipeline once; `nlp` is reused below to tokenize each article.
nlp = spacy.load('en_core_web_sm')

from spacy.symbols import NOUN, PROPN, VERB
from spacy.tokens import Doc, Span, Token
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from nltk.corpus import wordnet

Compile the dictionary. I start from the 'general morality' section in the MFT dictionary (https://moralfoundations.org/wp-content/uploads/files/downloads/moral%20foundations%20dictionary.dic) and look for synonyms to cast a wider net.

In [2]:
# Seed vocabulary of morally loaded words: MFT-dictionary terms plus hand-picked synonyms.
# Entries are stemmed before matching, so only one surface form per word family is required.
# NOTE(review): "debaucherie" is not the standard spelling ("debauchery") -- confirm it stems as intended.
moral_dictionary = ["harm", "suffer", "war", "warlord", "fight", "violent", "hurt", "kill", "killer", "endanger", 
"cruel", "brutal", "abuse", "damage", "ruin", "ravage", "detriment", "crush", 
"attack", "annihilate", "destroy", "stomp", "abandon", "spurn", "impair", "exploit", "wound", "unfair", "unequal", "bias", "unjust", "injustice", "bigot", "discriminate", "disproportionate", "inequitable", "prejudice", "dishonest", "unscrupulous", "dissociate", "preference","favoritism", "segregate", "exclusion", "exclude", "foreign", "enemy", "betray", "treason", "traitor", "treachery", "disloyal", "individual", "apostasy", "apostate", "deserted", "deserter", "deceive", "jilt", "imposter", "miscreant", "spy", "sequester", "renegade", "terrorism", "immigration", "defiant", "rebel", "dissent", "subversive", "disrespect", "disobey", "agitator", "insubordinate", "illegal", "lawless", "insurgent", "mutinous", "defy", "dissident", "unfaithful", "alienate", "defector", "heretic", "nonconformist", "oppose", "protest", "refuse", "denounce", "remonstrate", "riot", "obstruct", "disgust", "deprave", "disease", "unclean", "contagion", "indecent", "sin", "sinful", "sinner", "sinned", "slut", "whore", "dirty", "impiety", "impious", "profane", "gross", "repulsive", "sick", "promiscuous", "lewd", "adulterer", "debaucherie", "defile", "tramp", "prostitute", "unchaste", "intemperate", "wanton", "profligate", "filth", "trashy", "obscene", "lax", "taint", "stain", "tarnish", "debase", "desecrate", "wicked","blemish", "exploitation", "pervert", "wretched", "righteous", "moral", "ethic", "value", "upstanding", "good", "goodness", "principle", "blameless", "exemplary", "lesson", "canon", "doctrine", "noble", "worth", "ideal", "praiseworthy", "commendable", "character", "proper", "laudable", "correct", "wrong", "evil", "immoral", "bad", "offend", "offensive", "transgress"]


Stem these words to get the roots. This will make finding modifications easier. 

In [3]:
from nltk.stem.snowball import SnowballStemmer

# One shared English Snowball stemmer, reused for the dictionary and for every excerpt.
stemmer = SnowballStemmer(language='english')

# Stem of each dictionary entry, in dictionary order (duplicates are possible
# when several entries share a root).
stemmed_moral = [stemmer.stem(entry) for entry in moral_dictionary]

I managed to construct the dictionary. Now, I need to check whether these words (or similar) occur around the triplets of interest. 

Let's import the dataframes of interest. 

In [4]:
# Full articles; rows with no cleaned text are discarded up front.
# NOTE(review): rows are later looked up positionally (.iloc) by the triplets'
# 'Document' value -- confirm those positions refer to the frame *after* dropna.
vox = pd.read_csv(
    "~/Documents/moral_templates/Data/vox_articles.csv"
).dropna(subset=['clean_strings'])

# Previously extracted SVO triplets.
already_known = pd.read_csv(
    "~/Documents/moral_templates/Data/known_triplets_sentiments.csv"
)

Define function for extracting text and test it 

In [7]:
# Just the extract text function for now 
def extract_text(row, padding):
    """Return the text surrounding one known triplet, plus the window bounds.

    Parameters
    ----------
    row : int
        Positional index of the triplet in ``already_known``.
    padding : int
        Number of tokens to include on each side of the triplet span.

    Returns
    -------
    tuple
        ``(excerpt, start, end)`` where ``excerpt`` is the text of
        ``doc[start:end]`` and ``start``/``end`` are token offsets into the
        parsed article, clamped to ``[0, len(doc)]``.
    """
    triplet = already_known.iloc[row]
    # The whole article is re-parsed on every call.
    doc = nlp(vox.iloc[triplet['Document']]['clean_strings'])

    # Left edge: never go before the first token.
    padded_start = triplet['start'] - padding
    start = 0 if padded_start <= 0 else padded_start

    # Right edge: never run past the last token.
    # NOTE(review): the extra +2 is undocumented -- presumably it covers the
    # span's own width or an exclusive end offset; confirm.
    padded_end = triplet['end'] + padding + 2
    end = len(doc) if padded_end >= len(doc) else padded_end

    excerpt = f"{doc[start:end]}"
    return (excerpt, start, end)

# Smoke test: triplet at position 2 with a 50-token window on each side.
# `text` is the full (excerpt, start, end) tuple; element 0 is the excerpt string.
text = extract_text(2, padding=50)
text[0]

"the Drug Enforcement Agency should relax its classification for marijuana, which is currently marked as more dangerous than cocaine, to support more research. The problem isn't limited to just epilepsy, however. It's long been difficult to study marijuana due to federal restrictions. While this list is limited to six examples, it could certainly grow as legalization encourages doctors and researchers to take another look at a drug once marked as taboo."

In [8]:
# Strip ASCII punctuation, then split on whitespace.
# re.escape is required: unescaped, the "\]" inside string.punctuation is read
# as an escaped "]" and the backslash itself is never removed.
res = re.sub('[' + re.escape(string.punctuation) + ']', '', text[0]).split()
print(res)


['the', 'Drug', 'Enforcement', 'Agency', 'should', 'relax', 'its', 'classification', 'for', 'marijuana', 'which', 'is', 'currently', 'marked', 'as', 'more', 'dangerous', 'than', 'cocaine', 'to', 'support', 'more', 'research', 'The', 'problem', 'isnt', 'limited', 'to', 'just', 'epilepsy', 'however', 'Its', 'long', 'been', 'difficult', 'to', 'study', 'marijuana', 'due', 'to', 'federal', 'restrictions', 'While', 'this', 'list', 'is', 'limited', 'to', 'six', 'examples', 'it', 'could', 'certainly', 'grow', 'as', 'legalization', 'encourages', 'doctors', 'and', 'researchers', 'to', 'take', 'another', 'look', 'at', 'a', 'drug', 'once', 'marked', 'as', 'taboo']


It seems to be working fine.

Now, let's stem the fragment and look for coincidences.

In [9]:
# Stem every token of the excerpt so it can be compared with the stemmed dictionary.
res_stemmed = [stemmer.stem(token) for token in res]


In [10]:
# Number of distinct dictionary stems that appear in the excerpt.
len(set(res_stemmed) & set(stemmed_moral))

0

No coincidences in this excerpt: the count above is 0.

Let's write a function that can do this iteratively. 

In [12]:
def count_moral_vocab(row, padding):
    """Count distinct moral-dictionary stems in the text around one triplet.

    The excerpt and its bounds come from ``extract_text`` so the window logic
    lives in one place. Punctuation is stripped, the remaining
    whitespace-separated tokens are stemmed, and the result is intersected
    with ``stemmed_moral``.

    Parameters
    ----------
    row : int
        Positional index of the triplet in ``already_known``.
    padding : int
        Number of tokens to include on each side of the triplet span.

    Returns
    -------
    tuple
        ``(count, start, end)``. ``count`` is the number of *distinct*
        dictionary stems present in the excerpt (a stem that occurs several
        times is counted once); ``start``/``end`` are the excerpt's token
        offsets as returned by ``extract_text``.
    """
    excerpt, start, end = extract_text(row, padding)

    # re.escape keeps every punctuation character literal inside the class;
    # unescaped, the "\]" in string.punctuation is parsed as an escaped "]"
    # and backslashes are never stripped.
    punctuation_class = '[' + re.escape(string.punctuation) + ']'
    tokens = re.sub(punctuation_class, '', excerpt).split()

    excerpt_stems = {stemmer.stem(token) for token in tokens}
    return (len(excerpt_stems.intersection(stemmed_moral)), start, end)

Iterate over all the rows of the already known triplets.

In [14]:
# One (count, start, end) tuple per known triplet, in row order.
list_moral_vocab = []

for row_idx in range(len(already_known)):
    row_result = count_moral_vocab(row=row_idx, padding=50)
    # Progress marker every 100 rows.
    if row_idx % 100 == 0:
        print(f'working on row {row_idx}')
    list_moral_vocab.append(row_result)

working on row 0
working on row 100
working on row 200
working on row 300
working on row 400
working on row 500
working on row 600
working on row 700
working on row 800
working on row 900
working on row 1000
working on row 1100
working on row 1200
working on row 1300
working on row 1400
working on row 1500
working on row 1600
working on row 1700
working on row 1800
working on row 1900
working on row 2000
working on row 2100
working on row 2200
working on row 2300
working on row 2400
working on row 2500
working on row 2600
working on row 2700
working on row 2800
working on row 2900
working on row 3000
working on row 3100
working on row 3200
working on row 3300
working on row 3400
working on row 3500
working on row 3600
working on row 3700
working on row 3800
working on row 3900
working on row 4000
working on row 4100
working on row 4200
working on row 4300
working on row 4400
working on row 4500
working on row 4600
working on row 4700
working on row 4800
working on row 4900
working on r

Add column of moral count to the 'already known' dataframe

In [25]:
# Unpack each (count, start, end) tuple into its own column, in that order.
for position, column in enumerate(['moral_count', 'excerpt_start', 'excerpt_end']):
    already_known[column] = [result[position] for result in list_moral_vocab]

Save our new dataframe

In [26]:
# Persist the triplets together with the new moral_count / excerpt_start / excerpt_end columns.
# NOTE: to_csv is called with its defaults, so the DataFrame index is written as an extra leading column.
already_known.to_csv('~/Documents/moral_templates/Data/triplets_mfd.csv')