In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Directory containing the input files to analyse.
base_dir = 'data/merged/'
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore column / plot-colour order) reproducible across runs.
files = sorted(os.listdir(base_dir))
print(len(files))
files

In [None]:
# Columns retained from every input file.
keep_cols = [
    'controversiality',
    'num_reports',
    'ATTACK_ON_AUTHOR',
    'downs',
    'ATTACK_ON_COMMENTER',
    'INFLAMMATORY',
    'LIKELY_TO_REJECT',
    'INCOHERENT',
    'UNSUBSTANTIAL',
    'OBSCENE',
    'SEVERE_TOXICITY',
    'TOXICITY',
    'politeness',
    'sentiment',
]

# Metrics that get plotted: every kept column except 'num_reports' and 'downs',
# in the same relative order as keep_cols.
scores = [col for col in keep_cols if col not in ('num_reports', 'downs')]

In [None]:
# Build, for every column in keep_cols, two DataFrames indexed by month with
# one column per input file: the monthly mean (means[k]) and the monthly
# standard error of the mean (sems[k]).
#
# The per-file Series are collected first and joined once after the loop,
# rather than outer-merging into a growing DataFrame on every iteration.
mean_parts = {k: {} for k in keep_cols}
sem_parts = {k: {} for k in keep_cols}

for f in files:
    path = os.path.join(base_dir, f)
    # Skip sub-directories and hidden entries (e.g. .ipynb_checkpoints,
    # .DS_Store) that os.listdir reports alongside the data files.
    if f.startswith('.') or not os.path.isfile(path):
        continue
    # Drop the extension whatever its length instead of assuming 4 characters.
    # A later file with the same stem replaces the earlier one.
    name = os.path.splitext(f)[0]
    print('working on', name)
    raw = pd.read_csv(path, sep='\t', index_col=0)
    raw['date'] = pd.to_datetime(raw.created_utc, unit='s')
    # Columns missing from this file are added as all-NaN so that every file
    # yields the same set of aggregated columns.
    for c in set(keep_cols) - set(raw.columns):
        raw[c] = np.nan

    # NOTE(review): newer pandas releases prefer the 'ME' alias for month-end
    # resampling; 'M' is kept for compatibility with older versions - confirm
    # against the pandas version in use.
    g = raw.resample('M', on='date')[keep_cols]
    gm = g.mean()
    ge = g.sem()
    for k in keep_cols:
        mean_parts[k][name] = gm[k]
        sem_parts[k][name] = ge[k]


def _join_columns(parts):
    """Outer-join a {column name: Series} mapping into one date-sorted DataFrame."""
    if not parts:
        # No usable input files: keep the "empty frame" result instead of
        # letting pd.concat raise on an empty list.
        return pd.DataFrame()
    return pd.concat(list(parts.values()), axis=1, keys=list(parts)).sort_index()


means = {k: _join_columns(mean_parts[k]) for k in keep_cols}
sems = {k: _join_columns(sem_parts[k]) for k in keep_cols}

In [None]:
def plot_v1(means, sems, cols=None, title='Mean rating over time', top=None, bottom=0):
    """Plot one line per column with a shaded band of +/- 2 * sems around it.

    Parameters
    ----------
    means : DataFrame-like
        Values to plot; its index supplies the x-axis and each selected
        column becomes one line.
    sems : DataFrame-like
        Same layout as ``means``; used for the width of the shaded band.
    cols : iterable of column labels, optional
        Columns to draw. ``None`` draws every column of ``means``.
    title : str
        Title placed above the axes.
    top, bottom :
        Y-axis limits handed to ``set_ylim``; ``None`` leaves that limit to
        matplotlib's autoscaling.
    """
    fig, axs = plt.subplots(figsize=(14,10))

    if cols is None:
        cols = means.columns.values
    for c in cols:
        # The label is what populates the legend; without it legend() finds
        # no handles and the lines cannot be told apart.
        axs.plot(means.index, means[c], label=c)
        axs.fill_between(sems.index,
                         means[c]-(2*sems[c]),
                         means[c]+(2*sems[c]),
                         alpha=0.2)
    axs.legend()
    axs.set_title(title)
    axs.set_ylim(bottom=bottom, top=top)

    plt.show()

def metaplot(topic='main_animals', metric='TOXICITY', top=None):
    """Plot one metric over time for a named group of subreddits.

    Looks up ``metric`` in the module-level ``means`` and ``sems`` dicts and
    passes the resulting frames to ``plot_v1``, restricted to the columns
    registered for ``topic`` (the 'all' topic selects every column).

    Parameters
    ----------
    topic : str
        Key of the subreddit group to draw.
    metric : str
        Key into ``means`` / ``sems``.
    top :
        Upper y-axis limit forwarded to ``plot_v1``.
    """
    topic_members = {
        'animals': ['CatGifs', 'StartledCats', 'cats', 'Dogtraining',
                    'CatsStandingUp', 'puppies', 'dogpictures'],
        'main_animals': ['dogs_short', 'cats'],
        'parties': ['democrats', 'demsocialist', 'GreenParty',
                    'Republican', 'Libertarian_short'],
        'Dem_Rep': ['democrats', 'Republican'],
        'other_politics': ['Liberal', 'progressive', 'FULLCOMMUNISM',
                           'communism', 'socialism_short'],
        'Feminist': ['TwoXChromosomes_short', 'TrollXChromosomes_short'],
        'MI_Uni': ['uofm', 'OSU', 'msu'],
        'all': None,
    }

    heading = ''.join(['Mean ', metric, ' in subreddits about ', topic, ' with 95% CI'])

    # 'sentiment' gets an automatic lower y-limit; every other metric starts at 0.
    y_floor = None if metric == 'sentiment' else 0

    plot_v1(means[metric], sems[metric],
            cols=topic_members[topic],
            title=heading,
            top=top,
            bottom=y_floor)


print('done')

In [None]:
# One figure per score metric for the 'main_animals' group.
for score in scores:
    metaplot(topic='main_animals', metric=score)

In [None]:
# One figure per score metric for the 'animals' group.
for score in scores:
    metaplot(topic='animals', metric=score)

In [None]:
# One figure per score metric for the 'MI_Uni' group.
for score in scores:
    metaplot(topic='MI_Uni', metric=score)

In [None]:
# One figure per score metric for the 'Feminist' group.
for score in scores:
    metaplot(topic='Feminist', metric=score)

In [None]:
# One figure per score metric for the 'parties' group.
for score in scores:
    metaplot(topic='parties', metric=score)

In [None]:
# One figure per score metric for the 'Dem_Rep' group.
for score in scores:
    metaplot(topic='Dem_Rep', metric=score)

In [None]:
# One figure per score metric for the 'other_politics' group, with the
# upper y-limit fixed at 1.
for score in scores:
    metaplot(topic='other_politics', metric=score, top=1)