In [1]:
import csv
import json
import time

# Input CSVs, one per cohort; each is parsed by read_file() further down.
sfp_file, td_file, neither_file = (
    '../sample_data/sfp_data_v4.csv',
    '../sample_data/td_data_v4.csv',
    '../sample_data/neither_data_v4.csv',
)

In [2]:
def read_file(data, cutoff_subreddit, time_cutoff = 1483228800):
    """Parse a CSV of posts into per-author post history and cutoff timestamps.

    The first row of the file is treated as a header and skipped. Every other
    non-blank row must hold the author in column 0, the subreddit in column 1
    and the creation time (an integer Unix timestamp) in column 3.

    Args:
        data: Path of the CSV file to read.
        cutoff_subreddit: Subreddit whose earliest post by an author lowers
            that author's cutoff. It is compared against the lower-cased
            subreddit column, so it must be passed in lower case.
        time_cutoff: Cutoff assigned to every author by default; it is only
            ever replaced by an earlier timestamp. The default, 1483228800,
            is 2017-01-01 00:00:00 UTC.

    Returns:
        A tuple ``(author_history, author_cutoff)``:
        * ``author_history`` maps each author to a list of
          ``(subreddit, timestamp)`` tuples, one per post, in file order.
          Every timestamp is an ``int``.
        * ``author_cutoff`` maps each author to the timestamp of their
          earliest post in ``cutoff_subreddit``, or ``time_cutoff``,
          whichever is earlier.

    Raises:
        IndexError: If a non-blank row has fewer than four columns.
        ValueError: If a timestamp cannot be converted to ``int``.
    """
    author_history = {}
    author_cutoff = {}

    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for file objects.
    with open(data, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no-op on an empty file)

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue

            author = row[0]
            subreddit = row[1]
            # Convert once so every stored timestamp has the same type,
            # including the first post recorded for an author.
            created_utc = int(row[3])

            author_history.setdefault(author, []).append((subreddit, created_utc))

            # Every author starts at the default cutoff; a post in the cutoff
            # subreddit can only move it earlier.
            current_cutoff = author_cutoff.setdefault(author, time_cutoff)
            if subreddit.lower() == cutoff_subreddit and created_utc < current_cutoff:
                author_cutoff[author] = created_utc

    return author_history, author_cutoff

In [3]:
# Parse the sfp file with 'sandersforpresident' as the cutoff subreddit,
# then store both resulting dictionaries as JSON.
sfp_author_history, sfp_author_cutoff = read_file(sfp_file, 'sandersforpresident')

for out_path, payload in (
    ("stored_variables/sfp_author_history.json", sfp_author_history),
    ("stored_variables/sfp_author_cutoff.json", sfp_author_cutoff),
):
    with open(out_path, "w") as f:
        json.dump(payload, f)

In [4]:
# Parse the td file with 'the_donald' as the cutoff subreddit,
# then store both resulting dictionaries as JSON.
td_author_history, td_author_cutoff = read_file(td_file, 'the_donald')

for out_path, payload in (
    ("stored_variables/td_author_history.json", td_author_history),
    ("stored_variables/td_author_cutoff.json", td_author_cutoff),
):
    with open(out_path, "w") as f:
        json.dump(payload, f)

In [5]:
# Parse the "neither" file, then store both resulting dictionaries as JSON.
# NOTE(review): '!!!!!' is presumably a placeholder that matches no subreddit,
# leaving every author at the default time cutoff - confirm.
neither_author_history, neither_author_cutoff = read_file(neither_file, '!!!!!')

for out_path, payload in (
    ("stored_variables/neither_author_history.json", neither_author_history),
    ("stored_variables/neither_author_cutoff.json", neither_author_cutoff),
):
    with open(out_path, "w") as f:
        json.dump(payload, f)

In [6]:
def get_subreddits_by_author(author_history, author_cutoff):
    """Tally each author's posts per subreddit, counting only pre-cutoff posts.

    Args:
        author_history: Maps author -> iterable of (subreddit, timestamp)
            pairs. Timestamps are passed through ``int()`` before comparison.
        author_cutoff: Maps author -> cutoff timestamp. Only posts with a
            timestamp strictly below the author's cutoff are counted.

    Returns:
        A tuple ``(subreddits_by_author, total_posts)``:
        * ``subreddits_by_author`` maps author -> {subreddit: post count};
          an author with no counted posts maps to an empty dict.
        * ``total_posts`` maps author -> number of counted posts.
    """
    per_author_counts = {}
    post_totals = {}

    for author, posts in list(author_history.items()):
        # The cutoff is looked up per post, so an author with no posts
        # does not need an entry in author_cutoff.
        counted = [
            subreddit
            for subreddit, timestamp in posts
            if int(timestamp) < author_cutoff[author]
        ]

        # subreddit -> number of counted posts, in order of first appearance
        counts = {}
        for subreddit in counted:
            counts[subreddit] = counts.get(subreddit, 0) + 1

        per_author_counts[author] = counts
        post_totals[author] = len(counted)

    return per_author_counts, post_totals

In [7]:
# Per-author subreddit counts and post totals for each of the three cohorts.
sfp_subreddits_by_author, sfp_total_posts = get_subreddits_by_author(
    sfp_author_history, sfp_author_cutoff
)
td_subreddits_by_author, td_total_posts = get_subreddits_by_author(
    td_author_history, td_author_cutoff
)
neither_subreddits_by_author, neither_total_posts = get_subreddits_by_author(
    neither_author_history, neither_author_cutoff
)

In [8]:
# Store the per-author subreddit counts and post totals of every cohort as JSON.
for out_path, payload in (
    ("stored_variables/sfp_subreddits_by_author.json", sfp_subreddits_by_author),
    ("stored_variables/sfp_total_posts.json", sfp_total_posts),
    ("stored_variables/td_subreddits_by_author.json", td_subreddits_by_author),
    ("stored_variables/td_total_posts.json", td_total_posts),
    ("stored_variables/neither_subreddits_by_author.json", neither_subreddits_by_author),
    ("stored_variables/neither_total_posts.json", neither_total_posts),
):
    with open(out_path, "w") as f:
        json.dump(payload, f)