In [67]:
import re
import json
from pathlib import Path
import datetime as dt

import pandas as pd
import tldextract

In [53]:
# ---- Configuration ----

# Root folder that every raw/processed data path in this notebook is built from
DATA = Path('../../reddit/data/')

# Subreddits whose posts/comments are pulled in
subreddits = [
    'Conservative',
    'Democrats',
    'Liberal',
    'Politics',
    'The_Donald',
]

# Candidate label -> lowercase first-name / last-name tokens
candidates = {
    'JoeBiden':        ['joe', 'biden'],
    'ElizabethWarren': ['elizabeth', 'warren'],
    'BernieSanders':   ['bernie', 'sanders'],
    'DonaldTrump':     ['donald', 'trump'],
}

# Filtering thresholds (the post filter keeps rows strictly above each value)
score_cutoff = 100  # score
min_len = 10        # title length, in characters

### 1. Posts

#### Consolidate posts from all subreddits

In [83]:
%%time

# Columns kept from each raw subreddit dump; every other column is dropped.
cols_reqd = ['id', 'author', 'subreddit', 'created_utc', 'full_link', 'score', 'title', 'url',
             'subreddit_subscribers']

# One filtered frame per subreddit; concatenated once after the loop.
filtered_frames = []

for subreddit in subreddits:
    # usecols avoids parsing the columns that would be discarded anyway. It does
    # not control column order, so the frame is re-selected with cols_reqd.
    posts = pd.read_csv(DATA/f'posts/raw/{subreddit}.tsv', sep='\t', encoding='utf-8',
                        usecols=cols_reqd)
    posts = posts[cols_reqd]

    # Keep posts scoring strictly above the cutoff. The explicit copy makes the
    # result an independent frame, so adding a column below does not raise
    # SettingWithCopyWarning.
    posts = posts.loc[posts['score'] > score_cutoff].copy()

    # Title length in characters; a missing title counts as length 0.
    posts['length'] = posts['title'].fillna('').astype(str).str.len()
    posts = posts.loc[posts['length'] > min_len]

    filtered_frames.append(posts)

# Single frame with a fresh 0..n-1 index across all subreddits.
allposts = pd.concat(filtered_frames).reset_index(drop=True)

CPU times: user 1min 8s, sys: 3.31 s, total: 1min 12s
Wall time: 56.3 s


#### Preprocess post titles.

In [84]:
def remove_URL(text):
    """Replace every URL-like span in ``text`` with a single space.

    A span begins at ``www.``, ``http://`` or ``https://`` and runs through
    the following non-whitespace characters (at least one is required).

    Args:
        text: String to clean.

    Returns:
        A new string with each matched span replaced by one space.
    """
    # Raw string: in a plain literal '\.' and '\s' are invalid escape sequences,
    # which recent Python versions warn about at compile time.
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)

def remove_numbers(text):
    """Replace each digit character in ``text`` with a single space.

    Digits are replaced one by one, so a run of N digits becomes N spaces.

    Args:
        text: String to clean.

    Returns:
        A new string of the same length with every digit turned into a space.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence, which
    # recent Python versions warn about at compile time.
    return re.sub(r'\d', ' ', text)
    
def preprocess_post(post):
    """Clean a post title: strip URLs first, then blank out digits."""
    without_urls = remove_URL(post)
    return remove_numbers(without_urls)

In [85]:
%%time

# Run every title through the cleaning pipeline
allposts['title'] = allposts['title'].map(preprocess_post)

CPU times: user 12 s, sys: 0 ns, total: 12 s
Wall time: 12 s


#### Extract domain from URL

In [86]:
%%time

def _domain_of(link):
    """Return the `domain` component that tldextract parses out of `link`."""
    return tldextract.extract(link).domain


allposts['domain'] = allposts['url'].map(_domain_of)

CPU times: user 11.1 s, sys: 0 ns, total: 11.1 s
Wall time: 11.1 s


In [95]:
# Persist the filtered, cleaned posts as a tab-separated file.
processed_posts_path = DATA/'posts/processed/filtered_posts.tsv'

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
processed_posts_path.parent.mkdir(parents=True, exist_ok=True)

# index=False is the documented boolean form for leaving the row index out.
allposts.to_csv(processed_posts_path, sep='\t', encoding='utf-8', index=False)

### 2. Comments