In [1]:
import csv
import json
import pickle
import time
from datetime import datetime
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix

In [2]:
def read_file(data, cutoff_subreddit, first_row = True, author_row=0, subreddit_row=1, created_row=3, time_cutoff = 1483228800):
    with open(data) as f:
        reader = csv.reader(f)

        # author_history is a dictionary where each key is an author,
        # and each value is a list of (subreddit, timestamp) tuples
        # corresponding with posts made by that author
        author_history = {}

        # first post in sandersforpresident or 2017, whichever is earlier
        author_cutoff = {}

        for row in reader:
            if first_row:
                first_row = False
                continue

            author = row[author_row]
            subreddit = row[subreddit_row]
            created_utc = row[created_row]

            if author in author_history.keys():
                author_history[author].append((subreddit, int(created_utc)))
            else:
                author_history[author] = [(subreddit, int(created_utc))]

            if author not in author_cutoff.keys():
                author_cutoff[author] = time_cutoff

            if subreddit.lower() == cutoff_subreddit and int(created_utc) < author_cutoff[author]:
                author_cutoff[author] = int(created_utc)
        
    return author_history, author_cutoff

In [6]:
sfp_file = '../sample_data/sfp_data_v4.csv'
td_file = '../sample_data/td_data_v4.csv'
neither_file = '../sample_data/neither_data_v4.csv'

In [3]:
sfp_author_history, sfp_author_cutoff = read_file(sfp_file, 'sandersforpresident')

with open("stored_variables/sfp_author_history.json", "w") as f:
    json.dump(sfp_author_history, f)

with open("stored_variables/sfp_author_cutoff.json", "w") as f:
    json.dump(sfp_author_cutoff, f)

ValueError: invalid literal for int() with base 10: '2006-06-15 20:29:57-04'

In [4]:
td_author_history, td_author_cutoff = read_file(td_file, 'the_donald')

with open("stored_variables/td_author_history.json", "w") as f:
    json.dump(td_author_history, f)

with open("stored_variables/td_author_cutoff.json", "w") as f:
    json.dump(td_author_cutoff, f)

In [5]:
neither_author_history, neither_author_cutoff = read_file(neither_file, '!!!!!')

with open("stored_variables/neither_author_history.json", "w") as f:
    json.dump(neither_author_history, f)

with open("stored_variables/neither_author_cutoff.json", "w") as f:
    json.dump(neither_author_cutoff, f)

In [3]:
def get_subreddits_by_author(author_history, author_cutoff):
    subreddits_by_author = {}
    total_posts = {}

    for author in list(author_history.keys()):
        # each entry is a dictionary of subreddit:number of posts in that subreddit by the author
        subreddits_by_author[author] = {}
        total = 0

        for subreddit, timestamp in author_history[author]:

            if int(timestamp) < author_cutoff[author]:                
                total += 1

                if subreddit in subreddits_by_author[author].keys():
                    subreddits_by_author[author][subreddit] += 1
                else:
                    subreddits_by_author[author][subreddit] = 1 

        total_posts[author] = total
    
    return subreddits_by_author, total_posts

In [7]:
sfp_subreddits_by_author, sfp_total_posts =\
    get_subreddits_by_author(sfp_author_history, sfp_author_cutoff)
td_subreddits_by_author, td_total_posts =\
    get_subreddits_by_author(td_author_history, td_author_cutoff)
neither_subreddits_by_author, neither_total_posts =\
    get_subreddits_by_author(neither_author_history, neither_author_cutoff)

In [8]:
with open("stored_variables/sfp_subreddits_by_author.json", "w") as f:
    json.dump(sfp_subreddits_by_author, f)
    
with open("stored_variables/sfp_total_posts.json", "w") as f:
    json.dump(sfp_total_posts, f)

with open("stored_variables/td_subreddits_by_author.json", "w") as f:
    json.dump(td_subreddits_by_author, f)
    
with open("stored_variables/td_total_posts.json", "w") as f:
    json.dump(td_total_posts, f)
    
with open("stored_variables/neither_subreddits_by_author.json", "w") as f:
    json.dump(neither_subreddits_by_author, f)
    
with open("stored_variables/neither_total_posts.json", "w") as f:
    json.dump(neither_total_posts, f)

## /r/Politics

In [4]:
# politics_file = '../sample_data/politics_auth_comments.csv'
politics_file = '../sample_data/politics_auth_comments2.csv'


politics_author_history, politics_author_cutoff = read_file(politics_file, '!!!', first_row = False, created_row=4, time_cutoff = 9483228800) #no cutoff

In [5]:
politics_subreddit_by_author, politics_total_posts =\
    get_subreddits_by_author(politics_author_history, politics_author_cutoff)

In [6]:
# with open("stored_variables/politics_author_history.json", "w") as f:
#     json.dump(politics_author_history, f)
with open("stored_variables/politics_author_history2.json", "w") as f:
    json.dump(politics_author_history, f)

In [7]:
# with open("stored_variables/politics_subreddits_by_author.json", "w") as f:
#     json.dump(politics_subreddit_by_author, f)
with open("stored_variables/politics_subreddits_by_author2.json", "w") as f:
    json.dump(politics_subreddit_by_author, f)

## By month

In [4]:
with open("stored_variables/politics_author_history2.json", "r") as f:
    politics_author_history = json.load(f)

In [7]:
months = [int(datetime(2015, i, 1).timestamp()) for i in range(1,13)] + [int(datetime(2016, i, 1).timestamp()) for i in range(1,7)]

In [8]:
def month_number(timestamp):
    for i in range(len(months)):
        if timestamp < months[i]:
            return i - 1
    return -1

In [9]:
with open("stored_variables/user_subreddit_tf_auths.pickle", "rb") as f:
    author_names = pickle.load(f)

In [10]:
with open("stored_variables/user_subreddit_tf_subreddit_names.pickle", "rb") as f:
    subreddit_reverse_dict = pickle.load(f)

In [23]:
politics_author_history[author_names[0]][0]

['reddit.com', 1136486353]

In [11]:
tf_indptr_lst = []
tf_indices_lst = []
tf_data_lst = []

for i in range(len(months)):
    tf_indptr_lst.append([0])
    tf_indices_lst.append([])
    tf_data_lst.append([])

In [12]:
for i in range(len(author_names)):
    if i % 1000 == 0:
        print(i)
            
    for subreddit, timestamp in politics_author_history[author_names[i]]:
        month_num = month_number(timestamp)
        if month_num > -1 and subreddit.lower() in subreddit_reverse_dict.keys():
            index = subreddit_reverse_dict[subreddit.lower()]
            tf_indices_lst[month_num].append(index)
            tf_data_lst[month_num].append(1)
        
    for j in range(len(months)):
        tf_indptr_lst[j].append(len(tf_indices_lst[j]))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000


In [13]:
subreddit_tf_by_month = []

In [14]:
for i in range(len(months)):
    subreddit_tf_by_month.append(csr_matrix((tf_data_lst[i], tf_indices_lst[i], tf_indptr_lst[i]), 
                                            shape = (len(author_names), len(subreddit_reverse_dict))))

In [15]:
subreddit_tf_by_month = [csr_matrix(coo_matrix(k)) for k in subreddit_tf_by_month]

In [16]:
# for i in range(len(months)):
#     with open("stored_variables/subreddit_tf" + str(i) + ".pickle", 'wb') as f:
#         pickle.dump(subreddit_tf_by_month[i], f)