In [2]:
import json
import pickle
import re
import pandas as pd
import numpy as np
from scipy.sparse import bsr_matrix, coo_matrix, csr_matrix, lil_matrix, identity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
chunk_size = 10 ** 4
removed_str = '[removed]'
deleted_str = '[deleted]'
thankyou_str = 'thank you for participating'

In [35]:
num_removed = 0
num_deleted = 0
num_ty = 0
iter_num = 0

for chunk in pd.read_csv("../sample_data/politics_auth_comments2.csv", 
                         header=None, 
                         names = ['author', 'subreddit', 'body', 'score', 'created_utc', 'link_id', 'parent_id'],
                         usecols = ['body', 'subreddit'],
                         index_col = False,
                         chunksize = chunk_size
                         ):
    iter_num += 1
    politics_comments = chunk.loc[chunk['subreddit'] == 'politics']
    arr = politics_comments.loc[politics_comments['body'].notna()]['body'].values
    num_removed += len([i for i in arr if i == removed_str])
    num_deleted += len([i for i in arr if i == deleted_str])
    num_ty += len([i for i in arr if 'thank you for participating' in i.lower()])
    
    if iter_num % 100 == 0:
        print(num_removed)
        print(num_deleted)
        print(num_ty)
        print()

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
1

0
1
2

0
1
107

0
1
974

0
1
1764

0
1
2155

0
1
2765

0
1
3347

0
2
3856

0
2
4619

0
2
4952

0
2
5271

0
2
5924

0
2
5924



In [5]:
comment_auths = []
comment_tokens = []

In [3]:
def remove_bad_comments(chunk, subreddit='politics'):
    politics_comments = chunk.loc[chunk['subreddit'] == subreddit]
    politics_notna = politics_comments.loc[politics_comments['body'].notna()]
    # Time from January 1 2015 to June 1 2016
    politics_time_range = politics_notna.loc[politics_notna['created_utc'] >= 1420131600 ]
    politics_time_range = politics_time_range.loc[politics_time_range['created_utc'] < 1464753600]
    comments = politics_time_range['body'].values
    auths = politics_time_range['author'].values
    
    good_indices = [i for i in range(len(comments)) if comments[i] != deleted_str 
                    and comments[i] != removed_str
                    and 'thank you for participating' not in comments[i].lower()]
        
    return (list(comments[good_indices]), list(auths[good_indices]))

In [4]:
def modify_words(data):
    # remove hyperlinks
    data1 = [re.sub(r'https?:\/\/[\S]+', ' ', i, flags=re.MULTILINE) for i in data]
    
    # common escape sequences
    data1 = [re.sub(r'\n', '', i) for i in data1]
    data1 = [re.sub(r'&gt;', '', i) for i in data1]
    
    # convert to lowercase
    data1 = [i.lower() for i in data1]
    
    # keep user and subreddit tags
    data1 = [re.sub(r'/u/', '_user_', i) for i in data1]
    data1 = [re.sub(r'/r/', '_subreddit_', i) for i in data1]
    
    return data1

In [24]:
def tokenize_comments(data):
    # remove hyperlinks
    data1 = [re.sub(r'https?:\/\/[\S]+', ' ', i, flags=re.MULTILINE) for i in data]
    
    # common escape sequences
    data1 = [re.sub(r'\n', '', i) for i in data1]
    data1 = [re.sub(r'&gt;', '', i) for i in data1]
    
    # convert to lowercase
    data1 = [i.lower() for i in data1]
    
    # keep user and subreddit tags
    data1 = [re.sub(r'/u/', '_user_', i) for i in data1]
    data1 = [re.sub(r'/r/', '_subreddit_', i) for i in data1]
    
    data_tokenized = [re.findall(r'\w+', i.lower()) for i in data1]
    
    # remove numbers
    data_tokenized = [[token for token in doc if not token.isnumeric()] for doc in data_tokenized]
    
    return data_tokenized

In [46]:
iter_num = 0

for chunk in pd.read_csv("../sample_data/politics_auth_comments2.csv", 
                         header=None, 
                         names = ['author', 'subreddit', 'body', 'score', 'created_utc', 'link_id', 'parent_id'],
                         usecols = ['author','body', 'subreddit', 'created_utc'],
                         index_col = False,
                         chunksize = chunk_size
                         ):
    iter_num += 1
    comments, auths = remove_bad_comments(chunk)
    comment_toks = tokenize_comments(comments)
    
    comment_auths = comment_auths + auths
    comment_tokens = comment_tokens + comment_toks
    
    if iter_num % 100 == 0:
        print(iter_num * chunk_size)
    

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000


In [51]:
print(len(comment_auths))
print(len(comment_tokens))
print(sum([len(i) for i in comment_tokens]))

561573
561573
23463031


In [48]:
with open("stored_variables/politics_tokens_auth_sample.json", 'w') as f:
    json.dump(comment_tokens, f)

In [49]:
with open("stored_variables/auth_sample_r_politics_authors.json", 'w') as f:
    json.dump(comment_auths, f)

## CountVectorizer

In [5]:
comment_vocab = set()

In [11]:
chunk_size = 10 ** 6

In [7]:
iter_num = 0

for chunk in pd.read_csv("../sample_data/politics_auth_comments2.csv", 
                         header=None, 
                         names = ['author', 'subreddit', 'body', 'score', 'created_utc', 'link_id', 'parent_id'],
                         usecols = ['author','body', 'subreddit', 'created_utc'],
                         index_col = False,
                         chunksize = chunk_size
                         ):
    iter_num += 1
    comments, _ = remove_bad_comments(chunk)
    
    if(len(comments) > 0):
        curr_cv = CountVectorizer()
        curr_cv.fit(comments)
        curr_vocab = curr_cv.get_feature_names()
        comment_vocab = comment_vocab.union(curr_vocab)
        print("vocab size: " + str(len(comment_vocab)))

    print(iter_num * chunk_size)

1000000


KeyboardInterrupt: 

In [8]:
all_comments = []

In [12]:
iter_num = 0

for chunk in pd.read_csv("../sample_data/politics_auth_comments2.csv", 
                         header=None, 
                         names = ['author', 'subreddit', 'body', 'score', 'created_utc', 'link_id', 'parent_id'],
                         usecols = ['author','body', 'subreddit', 'created_utc'],
                         index_col = False,
                         chunksize = chunk_size
                         ):
    iter_num += 1
    comments, _ = remove_bad_comments(chunk)
    
    all_comments = all_comments + list(modify_words(comments))

    print(iter_num * chunk_size)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000


In [13]:
comment_tv = TfidfVectorizer(min_df = 10, max_df = 0.5)
comment_tfidf = comment_tv.fit_transform(all_comments)

In [14]:
comment_tfidf

<561573x26903 sparse matrix of type '<class 'numpy.float64'>'
	with 16284787 stored elements in Compressed Sparse Row format>

In [15]:
with open("stored_variables/politics_tokens_text_tfidf.pickle", 'wb') as f:
    pickle.dump(comment_tfidf, f)