# Code for merging together various comment scores

In [1]:
import pandas as pd
import numpy as np
import os

### Directories with data files in them

In [10]:
perspective_files = os.listdir('../data/perspective/')
politeness_files = os.listdir('../data/politeness/')
sentiment_files = os.listdir('../data/sentiment/')
pn_files = os.listdir('../data/pej_nouns/')
raw_files = os.listdir('../data/raw/')
merged_files = os.listdir('../data/merged')

#all_files

### Figure out which scores we have for which subreddits

In [11]:
to_merge = {}

for f in perspective_files:#raw_files:
    if f.endswith('.tsv'):
        if f in merged_files:
            print("Already finished", f)
        else:
            tmp = []
            if f in raw_files:
                tmp.append('../data/raw/')
            if f in perspective_files:
                tmp.append('../data/perspective/')
            if f in politeness_files:
                tmp.append('../data/politeness/')
            if f in sentiment_files:
                tmp.append('../data/sentiment/')
            if f in pn_files:
                tmp.append('../data/pej_nouns/')
            to_merge[f] = tmp
to_merge

Already finished CatGifs.tsv
Already finished TwoXChromosomes.tsv
Already finished NeutralPolitics.tsv
Already finished OhioStateFootball.tsv
Already finished FULLCOMMUNISM.tsv
Already finished history.tsv
Already finished listentothis.tsv
Already finished demsocialist.tsv
Already finished dogs_short.tsv
Already finished LifeProTips.tsv
Already finished Libertarian_short.tsv
Already finished TwoXChromosomes_short.tsv
Already finished gadgets.tsv
Already finished StartledCats.tsv
Already finished Dogtraining.tsv
Already finished Jokes.tsv
Already finished food.tsv
Already finished GetMotivated.tsv
Already finished cats.tsv
Already finished personalfinance.tsv
Already finished DIY.tsv
Already finished TheRedPill.tsv
Already finished InternetIsBeautiful.tsv
Already finished Republican.tsv
Already finished tifu.tsv
Already finished Art.tsv
Already finished Liberal.tsv
Already finished socialism.tsv
Already finished TrollXChromosomes_short.tsv
Already finished books.tsv
Already finished Sho

{'announcements.tsv': ['../data/raw/',
  '../data/perspective/',
  '../data/pej_nouns/']}

In [7]:
#just_one = 'announcements.tsv'
#tmp = {just_one: to_merge[just_one]}
#to_merge = tmp
#to_merge

### Do  the merging

In [8]:
# columns we're not interested in
cols_to_drop = set(['approved_by', 'author_cakeday', 'author_flair_css_class', 
                    'author_flair_text', 'banned_at_utc', 'banned_by', 
                    'can_gild', 'can_mod_post', 'collapsed_reason', 'created',
                    'distinguished', 'downs', 'gilded', 'likes', 'link_id', 
                    'num_reports', 'removal_reason', 'report_reasons',
                    'retrieved_on', 'saved', 'score_hidden', 'subreddit_id', 
                    'Unnamed: 0', 'approved_at_utc', 'name', 'gilded'])

# final column order desired
out_order = ['date', 'author', 'body', 'politeness', 'sentiment', 
               'controversiality', 'TOXICITY', 'ATTACK_ON_COMMENTER', 
               'INFLAMMATORY', 'LIKELY_TO_REJECT', 'OBSCENE', 
               'SEVERE_TOXICITY', 'ATTACK_ON_AUTHOR', 'SPAM', 
               'UNSUBSTANTIAL', 'INCOHERENT', 'pej_nouns', 'ups', 'edited', 'id', 
               'is_submitter', 'link_id', 'parent_id', 'replies', 
               'score', 'subreddit', 'deleted', 'stickied',  
             'archived', 'collapsed']

def is_del(txt):
    result = 0
    if isinstance(txt, str):
        if txt == '[deleted]':
            result = 1
        elif txt == '[removed]':
            result = 1
    return result

# iterates over each subreddit we have data for
for k,v in to_merge.items():
    print(k)
    df = pd.DataFrame()
    # iterates over each file we need to merge for that subreddit
    for p in v:
        print(p)
        #read data
        tmp = pd.read_csv(p+k, sep='\t')
        #figure out which columns we don't already have
        keep = set(tmp.columns.values) - set(df.columns.values)
        keep.add('id') #add ID to merge on, even though we have it already
        keep = list(keep - cols_to_drop) # ignore cols we don't want
        tmp = tmp[keep]
        if df.shape[1]==0: #first iteration just take the dataframe
            df=tmp
        else: # future iterations we merge them
            df = df.merge(tmp, on='id')
    
    # make our dates real dates
    df['date'] = pd.to_datetime(df.created_utc, unit='s')
    
    df['deleted'] = df.body.apply(is_del)
    
    #figure out which scores we don't have yet
    missing = set(out_order) - set(df.columns.values) 
    for m in missing:
        out_order.remove(m)
        
    #reorder the columns 
    df = df[out_order]
    
    #save the merged scores
    df.to_csv('../data/merged/'+k, sep='\t', index=False)
    

announcements.tsv
../data/perspective/
../data/pej_nouns/


AttributeError: 'DataFrame' object has no attribute 'created_utc'

In [9]:
df.head()

Unnamed: 0,INCOHERENT,OBSCENE,SEVERE_TOXICITY,INFLAMMATORY,ATTACK_ON_AUTHOR,SPAM,LIKELY_TO_REJECT,ATTACK_ON_COMMENTER,id,TOXICITY,UNSUBSTANTIAL,pej_nouns,body
0,0.113447,0.983264,0.14462,0.628963,0.025246,0.444484,0.945748,0.02057,cis30kl,0.644397,0.674075,0,Another 5 days since my last post. This change...
1,0.106087,0.009525,0.004739,0.020235,0.008418,0.19254,0.127043,0.015885,cis6ool,0.027712,0.43578,0,"I realize I'm very late, but I've been doing t..."
2,0.658489,0.041463,0.020216,0.057895,0.024516,0.9541,0.924818,0.020601,c3q5d1n,0.056376,0.83077,0,hw can i comment on my page
3,0.531398,0.066519,0.019977,0.121415,0.086791,0.672203,0.406133,0.250606,d1u1sv0,0.055501,0.786852,0,Afaik the windows bridge has been closed and d...
4,0.608255,0.06914,0.009655,0.057709,0.025363,0.990298,0.243013,0.032384,d1u1sv3,0.032827,0.677549,0,installed app thinking it would be better than...


In [None]:
print('done!')