Reviewers asked for sentiment-based analyses. For every word in the dataset we add sentiment scores from the following lexicons:

    ✓ LIWC
    ✓ SentiWordNet (NLTK  https://github.com/aesuli/SentiWordNet)
    ✓ GI (General Inquirer)
    ✓ Sentic https://sentic.net/
    ? ANEW --> requested
    ? EL


### 1. Load wordlist

The code in this section is copied from the first code blocks of 1_Paper.ipynb.

In [23]:
import pandas as pd
import spacy 
from spacy.util import compile_infix_regex
from spacy.tokenizer import Tokenizer
import enchant

data_file_path   = "./data/survey-all.csv"                      # filepath of the survey CSV handed to load_data below
numberbatch_path = "counterfitting/numberbatch-counterfitted"  # ConceptNet model (path given to spacy.load)
# Load the spaCy pipeline from that path; it is later passed to process_text.
nlp = spacy.load(numberbatch_path)


# Avoid splitting of dashes: build a replacement tokenizer for the pipeline.
def custom_tokenizer(nlp):
    """Return a ``Tokenizer`` on ``nlp.vocab`` with all search/match hooks set to None.

    Args:
        nlp: Loaded spaCy pipeline; its default infix patterns and its vocab
            are read.

    Returns:
        A ``Tokenizer`` constructed with ``prefix_search``, ``suffix_search``,
        ``infix_finditer``, ``token_match`` and ``rules`` all None.

    Raises:
        ValueError: From ``list.remove`` if the numeric-operator pattern is not
            among ``nlp.Defaults.infixes``.

    NOTE(review): ``infix_re`` is compiled below but never handed to the
    ``Tokenizer`` (``infix_finditer=None``), so the infix edits have no effect
    on the returned tokenizer. Presumably the all-None tokenizer splits on
    whitespace only, which would also keep dashed words intact -- confirm
    whether that, or ``infix_finditer=infix_re.finditer``, was intended.
    """
    inf = list(nlp.Defaults.infixes)               # Default infixes
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")    # Remove the generic op between numbers or between a number and a -
    inf = tuple(inf)                               # Convert inf to tuple
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])  # Add the removed rule after subtracting (?<=[0-9])-(?=[0-9]) pattern
    infixes = [x for x in infixes if '-|–|—|--|---|——|~' not in x] # Remove - between letters rule
    infix_re = compile_infix_regex(infixes)        # Compiled, but unused by the return below (see NOTE above)

    return Tokenizer(nlp.vocab, prefix_search=None,
                                suffix_search=None,
                                infix_finditer=None,
                                token_match=None,
                                rules=None)
# Replace the pipeline's tokenizer with the custom one.
nlp.tokenizer = custom_tokenizer(nlp)

from lib.addons_enric import censor_ips
from lib.hard_launch import load_data, remove_anomaly, process_text, construct_participant_level_df

# Cleaning pipeline; the helpers come from lib.hard_launch / lib.addons_enric.
# (The original note here read "less for testing"; nothing in this cell subsamples.)
df = censor_ips(remove_anomaly(load_data(data_file_path)))
# process_text takes the spaCy pipeline and returns four values; only `df` and
# `val_word_cnt` are used further down in this cell.
df, invalid_word, val_word_cnt, df_corrected_words = process_text(df, nlp)
df = construct_participant_level_df(df)
# Word list = keys of val_word_cnt (presumably word -> count; verify in lib.hard_launch).
word_list = [*val_word_cnt.keys()]

Ignoring IP block and distribution channel
Original data has 6145 rows.
After deletion of rows with too few non-repetitive answers: 6145 rows
Current cutoff is set to remove any count < 5
Unrecognized word ratio (# nan / # cells): 14.55%
After deletion of rows with too few (<=0) non-recognized answers: 6143 rows


## 2. Add sentiment

In [81]:
from lib.hard_launch import P1_lemma, P2_lemma

def combine(g):
    """Aggregate one group by calling the group's own ``sum`` method."""
    total = g.sum()
    return total

# numpy is only imported in a later cell; import it here so this cell also
# runs in a fresh kernel executed top to bottom (np.unique is used below).
import numpy as np

# Per Source: aggregate every P1/P2 lemma column with `combine`, then sum
# across the lemma columns so each Source ends up with a single value.
# (Presumably each cell holds a list of lemmas, so summing concatenates -- verify.)
df_words = df[["Source"] + P1_lemma + P2_lemma].groupby("Source").agg(combine).T
df_words = df_words.sum()
# Keep each distinct entry once per Source (np.unique also sorts them).
df_words = df_words.apply(lambda x: np.unique(x))

# Flatten to one row per (Source, word): Source becomes the index, the word
# lands in the single column 0.
srcs, vals = [], []
for src, unique_words in df_words.items():
    for w in unique_words:
        srcs.append(src)
        vals.append(w)

df_words = pd.DataFrame(vals, index=srcs)

In [115]:
#### NLTK / SentiWordNet
from nltk.corpus import sentiwordnet as swn
import numpy as np

def score_sentiwordnet(word):
    """Return ``[neg_score, pos_score]`` of the first SentiWordNet synset for ``word``.

    Only the first synset returned by ``swn.senti_synsets`` is used; a word
    with no synsets scores ``[0, 0]``.
    """
    synsets = list(swn.senti_synsets(word))
    if not synsets:
        return [0, 0]

    first = synsets[0]
    return [first.neg_score(), first.pos_score()]

#### LIWC
import liwc
from collections import Counter

# Token parser and category names loaded from the LIWC 2015 dictionary file.
parse, category_names = liwc.load_token_parser('./data/sentiment/LIWC2015_English.dic')


def score_LIWC(word):
    """Return ``[negemo count, posemo count]`` among the categories ``parse`` yields for ``word``."""
    categories = list(parse(word))
    negative = categories.count('negemo (Negative Emotions)')
    positive = categories.count('posemo (Positive Emotions)')
    return [negative, positive]

#### General Inquirer
#  http://www.wjh.harvard.edu/~inquirer/
#  https://github.com/cran/SentimentAnalysis/blob/master/R/data.R
GIpos = list(pd.read_csv('./data/sentiment/GIpos.csv')['x'])
GIneg = list(pd.read_csv('./data/sentiment/GIneg.csv')['x'])
# Set copies give O(1) membership tests per scored word; the original lists
# are kept unchanged for any other cell that reads them.
_GI_POS = frozenset(GIpos)
_GI_NEG = frozenset(GIneg)


def score_GI(word):
    """Return ``[neg, pos]`` membership flags for ``word`` in the General Inquirer lists.

    The word is lower-cased before lookup. A word in the positive list gives
    ``[0, 1]`` (checked first), a word in the negative list gives ``[1, 0]``,
    and a word in neither list gives ``[nan, nan]``.
    """
    key = word.lower()
    if key in _GI_POS:
        return [0, 1]
    if key in _GI_NEG:
        return [1, 0]
    return [np.nan, np.nan]

##### SenticNet
from senticnet.senticnet import SenticNet
sn = SenticNet()


def score_senticnet(word):
    """Return ``[neg, pos]`` derived from SenticNet's polarity value for ``word``.

    A polarity above zero is reported as ``[0, polarity]``; any other value is
    reported as ``[polarity, 0]``, so the "neg" entry keeps the sign SenticNet
    gave it. If the lookup or the float conversion fails, ``[nan, nan]`` is
    returned.

    NOTE(review): the other scorers in this file return non-negative numbers in
    their neg slot; confirm downstream code expects a signed value here.
    """
    try:
        polarity_value = float(sn.polarity_value(word))
    except Exception:
        # Best-effort lookup: any lookup/conversion error means "no score".
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate so a long scoring run can still be interrupted.
        return [np.nan, np.nan]

    if polarity_value > 0:
        return [0, polarity_value]
    return [polarity_value, 0]

####### ANEW
# Lookup table built from the `word` and `score` columns of the ANEW sheet.
anew_list = pd.read_excel("data/sentiment/ANEW.xlsx")
anew_list = dict(zip(anew_list.word, anew_list.score))


def score_anew(word):
    """Return the ANEW score stored for ``word``, or NaN when the word is not listed."""
    return anew_list.get(word, np.nan)

###### Evaluative Lexicon
# Lookup table built from the `Word` and `Valence` columns of the lexicon CSV.
EL_list = pd.read_csv("data/sentiment/EvaluativeLexicon20.csv")
EL_list = dict(zip(EL_list.Word, EL_list.Valence))


def score_el(word):
    """Return the Evaluative Lexicon valence for ``word``, or NaN when the word is not listed."""
    return EL_list.get(word, np.nan)

def all_scores(word):
    """Return one flat array with every lexicon's score for the lower-cased ``word``.

    The scorers run in the order LIWC, SentiWordNet, GI, SenticNet, ANEW, EL,
    which is the column order listed in ``sent_vars``.
    """
    lowered = word.lower()
    scorers = (
        score_LIWC,
        score_sentiwordnet,
        score_GI,
        score_senticnet,
        score_anew,
        score_el,
    )
    # np.r_ receives the same tuple of lists/scalars as a comma-separated index would.
    return np.r_[tuple(scorer(lowered) for scorer in scorers)]


# Column names matching the layout of the array returned by all_scores.
sent_vars = [
    "LIWC_neg", "LIWC_pos",
    "SentiWordNet_neg", "SentiWordNet_pos",
    "GI_neg", "GI_pos",
    "SenticNet_neg", "SenticNet_pos",
    "ANEW",
    "EL",
]

Apply the score functions to every word in `df_words` and write the resulting scores to CSV.

In [116]:
# One word per row of df_words (its first column), scored by every lexicon.
words = [row[0] for row in df_words.values]
scores = list(map(all_scores, words))
df_scored = pd.DataFrame(data=scores, index=words, columns=sent_vars)
# Carry over the Source labels, which are the index of df_words.
df_scored["Source"] = df_words.index
df_scored.to_csv("output/corpus_sentiments.csv")

In [122]:
# Save the sorted, de-duplicated words (np.unique) as a single-column CSV without index or header row.
pd.DataFrame(np.unique(words)).to_csv("output/unique_words.csv", index=False, header=False)