# Code for merging together various comment scores

In [None]:
import pandas as pd
import numpy as np
import os

### Directories with data files in them

In [None]:
# File names found in each score directory and in the raw data directory.
perspective_files = os.listdir('../data/perspective/')
politeness_files = os.listdir('../data/politeness/')
sentiment_files = os.listdir('../data/sentiment/')
pn_files = os.listdir('../data/pej_nouns/')
raw_files = os.listdir('../data/raw/')

# Candidate files to merge: anything that has a perspective or a politeness
# score file.
# NOTE(review): files that only appear under sentiment/ or pej_nouns/ are not
# candidates -- confirm that this is intended.
all_files = set(perspective_files).union(politeness_files)
# all_files  (uncomment to inspect)

### Figure out which scores we have for which subreddits

In [None]:
# (directory, listing) pairs, in the order the files are later merged.
sources = [
    ('../data/raw/', raw_files),
    ('../data/perspective/', perspective_files),
    ('../data/politeness/', politeness_files),
    ('../data/sentiment/', sentiment_files),
    ('../data/pej_nouns/', pn_files),
]

# Map each candidate .tsv file name to the directories that contain a file
# with that name.
to_merge = {
    fname: [folder for folder, listing in sources if fname in listing]
    for fname in all_files
    if fname.endswith('.tsv')
}
to_merge

#### Show example columns

In [None]:
# Load an already-merged file and show which columns it contains.
merged_example = '../data/merged/TwoXChromosomes.tsv'
tmp = pd.read_csv(merged_example, delimiter='\t')
tmp.columns.values

In [None]:
# Load one of the perspective score files and show which columns it contains.
perspective_example = '../data/perspective/TwoXChromosomes.tsv'
tmp = pd.read_csv(perspective_example, delimiter='\t')
tmp.columns.values

In [None]:
# Preview the first five rows of the frame loaded above.
tmp.head(5)

### Do the merging

In [None]:
# Columns we're not interested in; they are removed from every input file
# before merging.
# NOTE(review): 'link_id' is listed here and also in out_order below, so it is
# always dropped and never reaches the output -- confirm which is intended.
cols_to_drop = {
    'approved_by', 'author_cakeday', 'author_flair_css_class',
    'author_flair_text', 'banned_at_utc', 'banned_by',
    'can_gild', 'can_mod_post', 'collapsed_reason', 'created',
    'distinguished', 'downs', 'gilded', 'likes', 'link_id',
    'num_reports', 'removal_reason', 'report_reasons',
    'retrieved_on', 'saved', 'score_hidden', 'subreddit_id',
    'Unnamed: 0', 'approved_at_utc', 'name',
}

# Final column order desired. Columns that a given subreddit does not have
# are skipped for that subreddit only; this list itself is never modified.
out_order = ['date', 'author', 'body', 'politeness', 'sentiment',
             'controversiality', 'TOXICITY', 'ATTACK_ON_COMMENTER',
             'INFLAMMATORY', 'LIKELY_TO_REJECT', 'OBSCENE',
             'SEVERE_TOXICITY', 'ATTACK_ON_AUTHOR', 'SPAM',
             'UNSUBSTANTIAL', 'INCOHERENT', 'pej_nouns', 'ups', 'edited', 'id',
             'is_submitter', 'link_id', 'parent_id', 'replies',
             'score', 'subreddit', 'stickied', 'archived', 'collapsed']

# Iterate over each subreddit file we have data for.
for fname, folders in to_merge.items():
    print(fname)
    df = pd.DataFrame()
    # Iterate over each directory holding a file to merge for that subreddit.
    for folder in folders:
        print(folder)
        tmp = pd.read_csv(folder + fname, sep='\t')
        # Keep only the columns we do not already have and do not want to
        # drop, plus 'id', which is always needed as the merge key.
        keep = set(tmp.columns.values) - set(df.columns.values) - cols_to_drop
        keep.add('id')
        tmp = tmp[list(keep)]
        if df.shape[1] == 0:
            # First file for this subreddit: nothing to merge with yet.
            df = tmp
        else:
            # Later files: join on the comment id (inner join, the default).
            df = df.merge(tmp, on='id')

    # Make our dates real dates (created_utc is read as seconds since epoch).
    df['date'] = pd.to_datetime(df.created_utc, unit='s')

    # Select the desired columns that this subreddit actually has. This is
    # computed per subreddit instead of removing entries from out_order, so a
    # score missing for one subreddit does not get dropped from the output of
    # every subreddit processed after it.
    present = [col for col in out_order if col in df.columns]
    df = df[present]

    # Save the merged scores.
    df.to_csv('../data/merged/' + fname, sep='\t', index=False)
    

In [None]:
# Marker cell: prints a message once the cells above have finished running.
print('done!')