In [6]:
import json
import pickle
import re
import pandas as pd
import numpy as np
from scipy.sparse import bsr_matrix, coo_matrix, csr_matrix, lil_matrix, identity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from datetime import datetime

In [18]:
chunk_size = 10 ** 6
removed_str = '[removed]'
deleted_str = '[deleted]'
thankyou_str = 'thank you for participating'

In [16]:
comment_auths = []
comment_tokens = []
comment_timestamps = []

In [12]:
months = [datetime(2015, i, 1).timestamp() for i in range(1,13)] + [datetime(2016, i, 1).timestamp() for i in range(1,7)]

In [14]:
def remove_bad_comments(chunk, subreddit='politics'):
    politics_comments = chunk.loc[chunk['subreddit'] == subreddit]
    politics_notna = politics_comments.loc[politics_comments['body'].notna()]
    
    # Time from January 1 2015 to June 1 2016
    politics_time_range = politics_notna.loc[politics_notna['created_utc'] >= 1420131600 ]
    politics_time_range = politics_time_range.loc[politics_time_range['created_utc'] < 1464753600]
    comments = politics_time_range['body'].values
    auths = politics_time_range['author'].values
    timestamps = politics_time_range['created_utc'].values
    
    good_indices = [i for i in range(len(comments)) if comments[i] != deleted_str 
                    and comments[i] != removed_str
                    and 'thank you for participating' not in comments[i].lower()]
        
    return (list(comments[good_indices]), list(auths[good_indices]), list(timestamps[good_indices]))

In [15]:
def tokenize_comments(data):
    # remove hyperlinks
    data1 = [re.sub(r'https?:\/\/[\S]+', ' ', i, flags=re.MULTILINE) for i in data]
    
    # common escape sequences
    data1 = [re.sub(r'\n', '', i) for i in data1]
    data1 = [re.sub(r'&gt;', '', i) for i in data1]
    
    # convert to lowercase
    data1 = [i.lower() for i in data1]
    
    # keep user and subreddit tags
    data1 = [re.sub(r'/u/', '_user_', i) for i in data1]
    data1 = [re.sub(r'/r/', '_subreddit_', i) for i in data1]
    
    data_tokenized = [re.findall(r'\w+', i.lower()) for i in data1]
    
    # remove numbers
    data_tokenized = [[token for token in doc if not token.isnumeric()] for doc in data_tokenized]
    
    return data_tokenized

In [19]:
iter_num = 0

for chunk in pd.read_csv("../sample_data/politics_auth_comments2.csv", 
                         header=None, 
                         names = ['author', 'subreddit', 'body', 'score', 'created_utc', 'link_id', 'parent_id'],
                         usecols = ['author','body', 'subreddit', 'created_utc'],
                         index_col = False,
                         chunksize = chunk_size
                         ):
    iter_num += 1
    comments, auths, timestamps = remove_bad_comments(chunk)
    comment_toks = tokenize_comments(comments)
    
    comment_auths = comment_auths + auths
    comment_tokens = comment_tokens + comment_toks
    comment_timestamps = comment_timestamps + timestamps
    
    print(iter_num * chunk_size)
    

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000


In [21]:
print(len(comment_auths))
print(len(comment_tokens))
print(len(comment_timestamps))
print(sum([len(i) for i in comment_tokens]))

561573
561573
561573
23463031


In [22]:
comment_tokens_by_month = []
comment_auths_by_month = []

for i in range(17):
    idxs = [j for j in range(len(comment_timestamps)) 
            if comment_timestamps[j] >= months[i] and comment_timestamps[j] < months[i+1] ]
    
    comment_tokens_by_month.append([comment_tokens[j] for j in idxs])
    comment_auths_by_month.append([comment_auths[j] for j in idxs])
    

In [27]:
for i in range(17):
    print(len(comment_tokens_by_month[i]))
    
print()
for i in range(17):
    print(len(comment_auths_by_month[i]))

12885
10362
13376
13967
12345
14627
17254
19120
20064
29191
27542
32123
42738
71798
88621
71175
64385

12885
10362
13376
13967
12345
14627
17254
19120
20064
29191
27542
32123
42738
71798
88621
71175
64385


In [23]:
for i in range(17):
    with open("stored_variables/politics_tokens_auth_sample_month" + str(i) + ".json", 'w') as f:
        json.dump(comment_tokens_by_month[i], f)

In [24]:
for i in range(17):
    with open("stored_variables/auth_sample_r_politics_authors_month" + str(i) + ".json", 'w') as f:
        json.dump(comment_auths_by_month[i], f)