# Header


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import ipyparallel
import os
import sys
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

# Create an ipyparallel client (no arguments, so default connection settings)
# and a load-balanced view; `view` is what later cells use to submit work.
c = ipyparallel.Client()
view = c.load_balanced_view()

In [None]:
def get_sentiment(txt):
    """Return the mean VADER compound score over the sentences of ``txt``.

    The text is split with ``sent_tokenize`` and every sentence is scored
    separately with ``SentimentIntensityAnalyzer.polarity_scores``.

    Args:
        txt: Text to score.

    Returns:
        The mean of the per-sentence ``'compound'`` scores, or ``nan`` when
        the tokenizer yields no sentences.
    """
    sentences = sent_tokenize(txt)
    if not sentences:
        # np.mean([]) would also give nan, but only after emitting a
        # RuntimeWarning; return the same value explicitly instead.
        return np.nan
    analyzer = SentimentIntensityAnalyzer()
    return np.mean([analyzer.polarity_scores(s)['compound'] for s in sentences])

def chunker(seq, size):
    """Lazily yield consecutive slices of ``seq`` that are ``size`` long.

    The final slice is shorter when ``len(seq)`` is not a multiple of
    ``size``. ``seq`` only needs to support ``len()`` and slicing.
    """
    total = len(seq)
    offsets = range(0, total, size)
    return (seq[start:start + size] for start in offsets)

In [None]:
# Quick sanity check: score a short multi-sentence sample with get_sentiment.
# The bare call on the last line is what the notebook cell displays.
test = "Here is some sample text. THe text makes me happy. Much like my amazing cat. But not like that garbage windows."
get_sentiment(test)

In [None]:
def process_df(df):
    """Add a ``sentiment`` column with the mean VADER score of each ``body``.

    The imports and the scoring helper are kept inside the function so that it
    is self-contained when handed to ``view.map_async`` elsewhere in this
    notebook.

    Args:
        df: DataFrame with a ``body`` column. Each value is passed through
            ``str()`` before tokenizing, so non-string entries are scored as
            their string form.

    Returns:
        The same DataFrame object, with a ``sentiment`` column assigned in
        place. A row whose text yields no sentences gets ``nan``.
    """
    from nltk.tokenize import sent_tokenize
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    import numpy as np

    # Build the analyzer once per call instead of once per row; scoring does
    # not depend on a fresh instance, so every row can share it.
    analyzer = SentimentIntensityAnalyzer()

    def get_sentiment(txt):
        sentences = sent_tokenize(str(txt))
        if not sentences:
            # Same value np.mean([]) would produce, without the RuntimeWarning.
            return np.nan
        return np.mean([analyzer.polarity_scores(s)['compound'] for s in sentences])

    df['sentiment'] = df.body.apply(get_sentiment)

    return df

In [None]:
def get_files_by_file_size(dirname, reverse=False):
    """Return the names of the regular files in ``dirname`` sorted by size.

    Args:
        dirname: Directory to list (not recursive). It may or may not end
            with a path separator.
        reverse: If true, largest files come first; otherwise smallest first.

    Returns:
        List of file names relative to ``dirname`` (not full paths). Files of
        equal size keep the order in which ``os.listdir`` returned them.
    """
    sized = []
    for basename in os.listdir(dirname):
        path = os.path.join(dirname, basename)
        if os.path.isfile(path):
            # Keep the bare name alongside the size; slicing the joined path
            # by len(dirname) + 1 breaks when dirname ends with a separator.
            sized.append((basename, os.path.getsize(path)))
    sized.sort(key=lambda item: item[1], reverse=reverse)

    return [basename for basename, _ in sized]
           
def get_jobs():
    """Build the list of input files that still need sentiment scoring.

    Lists ``../sampled`` smallest file first, moves ``TwoXChromosomes.tsv`` to
    the front when it is present, and skips every file whose name already
    appears in ``data/sentiment/``. A line is printed for each candidate.

    Returns:
        List of dicts with the keys ``'file'`` (input path) and
        ``'subreddit'`` (the file name minus its last four characters).
    """
    candidates = get_files_by_file_size('../sampled', reverse=False)
    priority = 'TwoXChromosomes.tsv'
    if priority in candidates:
        # Pull the priority file out of its size-ordered slot and put it first.
        candidates.insert(0, candidates.pop(candidates.index(priority)))
    finished = os.listdir('data/sentiment/')

    pending = []
    for name in candidates:
        if not name.endswith('tsv'):
            continue
        if name in finished:
            print('already finished', name)
            continue
        print('adding', name)
        pending.append({
            'file': '/home/jwlock/research/reddit/sampled/' + name,
            'subreddit': name[:-4],
        })

    return pending

# Initial scan for pending files; the processing loop below consumes `jobs`.
jobs = get_jobs()

In [None]:
# Process pending files one at a time until get_jobs() reports nothing left.
while len(jobs)>0:
    j = jobs[0]
    print('Working on', j['subreddit'])
    df = pd.read_csv(j['file'], sep='\t')
    # Split the frame into slices of 500 rows, one task per slice.
    chunks = chunker(df, 500)
    # Submit process_df for every slice through the load-balanced view and
    # wait for the whole batch to finish.
    result = view.map_async(process_df, chunks)
    result.wait_interactive()
    # Stitch the processed slices back together and write <subreddit>.tsv.
    df = pd.concat(result)
    df.to_csv('data/sentiment/'+j['subreddit']+'.tsv', sep='\t', index=False)
    # Re-scan: the file just written is now listed in data/sentiment/, so
    # get_jobs() skips it and the next pending file becomes jobs[0].
    jobs = get_jobs()
    
print('Done!')