# Count pejorative nouns

Based on the paper:
- Palmer, Alexis, Melissa Robinson, and Kristy Phillips. 2017. “Illegal Is Not a Noun: Linguistic Form for Detection of Pejorative Nominalizations.” Pp. 91–100 in *Proceedings of the First Workshop on Abusive Language Online.* Vancouver. Retrieved February 17, 2018 (http://www.aclweb.org/anthology/W17-3014).


In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import ipyparallel
import os
import sys
import nltk

# Connect to the ipyparallel cluster; no arguments are passed, so the
# client's default connection settings are used.
c = ipyparallel.Client()
# Load-balanced view over the cluster's engines; the processing loop later in
# this notebook submits DataFrame chunks through it.
view = c.load_balanced_view()

In [2]:
#breaks the dataframe into chunks that can be processed separately
def chunker(seq, size):
    """Lazily split *seq* into consecutive slices of at most *size* items.

    Works on anything that supports ``len()`` and slicing (lists, strings,
    DataFrames). The final slice is shorter when ``len(seq)`` is not a
    multiple of *size*; an empty *seq* yields nothing.

    Returns a generator of slices.
    """
    # Start offsets of every chunk: 0, size, 2*size, ...
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

#counts pejorative nouns
def process_df(df):
    """Add a ``pej_nouns`` column counting pejorative nominalizations.

    For every value in ``df.body`` the text is sentence- and word-tokenized
    with NLTK, part-of-speech tagged, and each token tagged as a noun (tag
    starting with ``NN``) is checked for the target words. Matching is by
    substring, so plural forms such as "illegals" are counted, and it is
    case-insensitive, so capitalised forms such as "Illegals" are counted
    too. A noun containing several target words contributes once per word.

    Empty bodies, missing values (which stringify to ``'nan'``) and the
    ``[deleted]`` / ``[removed]`` placeholders get a count of 0.

    Parameters:
        df: DataFrame with a ``body`` column of comment text.

    Returns:
        The same DataFrame object, with the integer ``pej_nouns`` column
        added in place.
    """
    # Imported inside the function so the name is available wherever the
    # function body actually runs (it is submitted to ipyparallel engines).
    import nltk

    # Words the paper identified as usually pejorative when used as nouns.
    bad_nouns = ('female', 'gay', 'illegal', 'poor')
    # Bodies that carry no text worth tagging.
    skip_bodies = {'', 'nan', '[deleted]', '[removed]'}

    def get_pej_noun(txt):
        """Return the number of pejorative-noun hits in one comment body."""
        txt = str(txt)
        if txt in skip_bodies:
            return 0

        # Tokenize into sentences, then into words within each sentence.
        sentences = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(txt)]

        count = 0
        for sent in nltk.pos_tag_sents(sentences):
            for word, pos in sent:
                # Keep only noun tags (NN, NNS, NNP, NNPS).
                if not pos.startswith('NN'):
                    continue
                # Lower-case before matching: sentence-initial or otherwise
                # capitalised nouns would be missed by a case-sensitive test.
                lowered = word.lower()
                count += sum(1 for b in bad_nouns if b in lowered)
        return count

    df['pej_nouns'] = df.body.apply(get_pej_noun)

    return df

In [None]:
def get_files_by_file_size(dirname, reverse=False):
    """Return the base names of the regular files in *dirname*, sorted by size.

    Parameters:
        dirname: Directory to list. A trailing path separator is allowed.
        reverse: If true, largest files come first; otherwise smallest first.

    Returns:
        A list of file names (without the directory part). Sub-directories
        and other non-regular entries are skipped. Files of equal size keep
        the order in which ``os.listdir`` reported them.
    """
    sized = []
    for basename in os.listdir(dirname):
        path = os.path.join(dirname, basename)
        if os.path.isfile(path):
            # Keep the base name itself rather than slicing it back out of
            # the joined path: slicing by len(dirname) + 1 drops the first
            # character of the name when dirname ends with a separator.
            sized.append((basename, os.path.getsize(path)))

    sized.sort(key=lambda item: item[1], reverse=reverse)
    return [basename for basename, _ in sized]
           
def get_jobs(sampled_dir='../../sampled',
             done_dir='../data/pej_nouns/',
             input_dir='/home/jwlock/research/reddit/sampled/'):
    """Build the list of subreddit files that still need processing.

    Lists the ``.tsv`` files in *sampled_dir* (smallest first) and skips any
    whose name already exists in *done_dir*. A status line is printed for
    every ``.tsv`` file considered.

    Parameters:
        sampled_dir: Directory scanned for input ``.tsv`` files.
        done_dir: Directory holding finished outputs; must already exist.
        input_dir: Directory prefix used to build each job's ``'file'`` path.
            NOTE(review): presumably the absolute path of the same directory
            as *sampled_dir* -- confirm.

    Returns:
        A list of dicts with keys ``'file'`` (path to read) and
        ``'subreddit'`` (file name without the ``.tsv`` extension), in
        ascending file-size order.
    """
    suffix = '.tsv'

    files = get_files_by_file_size(sampled_dir, reverse=False)
    # To prioritise a particular file, move it to the front of `files` here.

    # A set gives constant-time "already done?" checks.
    done = set(os.listdir(done_dir))

    jobs = []
    for f in files:
        # Require the full extension (dot included) so that stripping it
        # below always removes exactly the suffix.
        if not f.endswith(suffix):
            continue
        if f in done:
            print('already finished', f)
            continue

        print('adding', f)
        jobs.append({
            'file': os.path.join(input_dir, f),
            'subreddit': f[:-len(suffix)],
        })

    return jobs

# Build the initial work list; get_jobs() prints a status line for each
# candidate file as a side effect.
jobs = get_jobs()

already finished demsocialist.tsv
already finished puppies.tsv
already finished republicans.tsv
already finished CatGifs.tsv
already finished cats.tsv
already finished OhioStateFootball.tsv
already finished GreenParty.tsv
already finished MichiganWolverines.tsv
already finished msu.tsv
already finished StartledCats.tsv
already finished uofm.tsv
already finished PussyPass.tsv
already finished CatsStandingUp.tsv
already finished dogpictures.tsv
already finished OSU.tsv
already finished communism.tsv
already finished TrollXChromosomes_short.tsv
already finished democrats.tsv
already finished Liberal.tsv
adding dogs_short.tsv
adding socialism_short.tsv
adding TheRedPill_short.tsv
adding TwoXChromosomes_short.tsv
adding MensRights_short.tsv
adding Libertarian_short.tsv
adding progressive.tsv
adding pussypassdenied.tsv
adding FULLCOMMUNISM.tsv
adding Republican.tsv
adding Dogtraining.tsv
adding NeutralPolitics.tsv
adding socialism.tsv
adding dogs.tsv
adding TheRedPill.tsv
adding TrollXChromo

In [None]:
# Work through the pending subreddits one at a time until none are left.
while jobs:
    j = jobs[0]
    print('Working on', j['subreddit'])

    # Only the comment id and its text are loaded.
    df = pd.read_csv(j['file'], usecols=['id', 'body'], sep='\t')

    # Submit 1000-row chunks through the load-balanced view and block
    # (with a progress display) until every chunk has been processed.
    chunks = chunker(df, 1000)
    result = view.map_async(process_df, chunks)
    result.wait_interactive()

    # Reassemble the processed chunks and write the per-subreddit output.
    df = pd.concat(result)
    out_path = '../data/pej_nouns/' + j['subreddit'] + '.tsv'
    df.to_csv(out_path, index=False, sep='\t')

    # Re-scan instead of popping: the file just written is now listed in the
    # output directory, so get_jobs() no longer returns it.
    jobs = get_jobs()

print('Done!')

1064/3991 tasks finished after  490 s

In [None]:
# NOTE(review): looks like a scratch cell with no effect beyond printing --
# confirm before removing.
print('test')