# Project 3: Reddit Scraping & NLP

## Part 1 - Scraping

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import requests
import json
import csv
import time
import datetime as dt
import math
import itertools

In [2]:
# pushshift url template
# https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}
# max request size is 100!

### Data Collection

#### Scraping data from mentalhealth subreddit

In [3]:
# Target subreddit for this scrape
subreddit = 'mentalhealth'

# Collection window, expressed as Unix epoch seconds
after = 1611140400   # 2021-01-20 11:00:00 UTC, i.e. 6am at GMT-05:00 (EST)
before = 1618912800  # 2021-04-20 10:00:00 UTC, i.e. 6am at GMT-04:00 (EDT)

# h/t stack overflow
# One independent, empty list per submission field that is kept
posts_data_mh = {
    field: []
    for field in (
        'created_utc',
        'url',
        'full_link',
        'id',
        'num_comments',
        'title',
        'selftext',
        'subreddit',
    )
}

# HTTP headers sent with every API request
headers = {'User-agent': 'Reddit Post Collector'}

# Set up function to return submission data
def get_submissions(**kwargs):
    """Send one search request to the Pushshift submission endpoint.

    Parameters
    ----------
    **kwargs
        Query-string parameters, forwarded unchanged to the API
        (e.g. subreddit, size, after, before, sort, sort_type).

    Returns
    -------
    The value stored under the ``'data'`` key of the JSON response when the
    status code is 200. For any other status code the code is printed and
    ``None`` is returned.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within the timeout.
    """
    res = requests.get("https://api.pushshift.io/reddit/submission/search/",
                       params=kwargs,
                       headers=headers,
                       timeout=30)  # without a timeout a stalled connection blocks forever
    if res.status_code == 200:
        data = res.json()
        return data['data']

    # Report the failing status; callers treat a falsy result as "stop"
    print(res.status_code)
    return None

count = 0

# Collect up to 2,000 posts as long as there are posts to collect.
# Progress is measured in unique `created_utc` values, so posts that share a
# timestamp count only once toward the 1,900 threshold.
while len(set(posts_data_mh['created_utc'])) <= 1900:
    print(count)
    count += 100

    posts = get_submissions(subreddit=subreddit,
                            size=100,
                            after=after, #pulls submissions only after this date
                            before=before, #pulls submissions only before this date
                            sort='asc', #returns data with earliest date first
                            sort_type='created_utc')
    # An empty batch or a failed request (None) ends the collection
    if not posts:
        break

    for post in posts:
        # Keep track of position for the next call in while loop
        after = post['created_utc']

        # Read every field before appending anything, so a post with an
        # unexpected missing key cannot leave the lists with unequal lengths
        row = {field: post[field] for field in posts_data_mh if field != 'selftext'}
        # Posts without a `selftext` key get the string placeholder "NaN",
        # matching how the r/CoronavirusUS scrape handles them
        row['selftext'] = post.get('selftext', "NaN")

        # Append info to posts_data dict
        for field, value in row.items():
            posts_data_mh[field].append(value)

    # Pause between requests
    time.sleep(1)

# Save posts to dataframe: each key of posts_data_mh becomes a column.
# pandas raises a ValueError here if the lists have different lengths.
mentalhealth = pd.DataFrame(posts_data_mh)

# Create `timestamp` column with `created_utc` translated into readable time
def get_date(created, tz=None):
    """Convert a Unix epoch timestamp (seconds) into a ``datetime``.

    Parameters
    ----------
    created : int or float
        Seconds since the Unix epoch, e.g. a ``created_utc`` value.
    tz : datetime.tzinfo, optional
        Timezone of the result. With the default ``None`` the result is a
        naive datetime in the local timezone of the machine running the
        code. Pass ``dt.timezone.utc`` for a timezone-aware UTC value that
        does not depend on the machine.

    Returns
    -------
    datetime.datetime
    """
    return dt.datetime.fromtimestamp(created, tz=tz)

# get_date is applied without a timezone, so the values are naive datetimes
# in the local timezone of the machine that runs this cell
_timestamp = mentalhealth['created_utc'].apply(get_date)
# assign returns a new dataframe that carries the extra `timestamp` column
mentalhealth = mentalhealth.assign(timestamp = _timestamp)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900


In [4]:
# Confirm how many posts (rows) and fields (columns) were collected
mentalhealth.shape

(2000, 9)

In [5]:
# Check r/mentalhealth dataframe: show the last rows that were collected
mentalhealth.tail()

Unnamed: 0,created_utc,url,full_link,id,num_comments,title,selftext,subreddit,timestamp
1995,1611781802,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,l6eh5e,4,I can't do my math homework because it's causi...,"It's only 8 problems, but he keeps adding step...",mentalhealth,2021-01-27 16:10:02
1996,1611781905,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,l6ei65,4,Venting,I hate my family with every inch of my body. I...,mentalhealth,2021-01-27 16:11:45
1997,1611782375,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,l6emkj,2,Could I be suffering from Bipolar Disorder?,"Hello, I've been struggling with my mental hea...",mentalhealth,2021-01-27 16:19:35
1998,1611783192,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,l6ewwu,2,Covid linked to risk of mental illness and bra...,[https://amp.theguardian.com/world/2021/jan/25...,mentalhealth,2021-01-27 16:33:12
1999,1611783581,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,l6f2vj,4,my mom just called my anxiety annoying,[deleted],mentalhealth,2021-01-27 16:39:41


In [6]:
# Check the data type pandas inferred for each column
mentalhealth.dtypes

created_utc              int64
url                     object
full_link               object
id                      object
num_comments             int64
title                   object
selftext                object
subreddit               object
timestamp       datetime64[ns]
dtype: object

In [7]:
# # export to csv
# filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
# mentalhealth.to_csv('/Users/ericrodriguez/Documents/Projects/project 3 - nlp reddit/data/{}_{}.csv'.format(subreddit, filetime), index=False)

#### Scraping data from CoronavirusUS subreddit

In [10]:
# Target subreddit for this scrape
subreddit = 'CoronavirusUS'

# Collection window, expressed as Unix epoch seconds
after = 1611140400   # 2021-01-20 11:00:00 UTC, i.e. 6am at GMT-05:00 (EST)
before = 1618912800  # 2021-04-20 10:00:00 UTC, i.e. 6am at GMT-04:00 (EDT)

# h/t stack overflow
# One independent, empty list per submission field that is kept
posts_data_cv = {
    field: []
    for field in (
        'created_utc',
        'url',
        'full_link',
        'id',
        'num_comments',
        'title',
        'selftext',
        'subreddit',
    )
}

# HTTP headers sent with every API request
headers = {'User-agent': 'Reddit Post Collector'}

# Set up function to return submission data
def get_submissions(**kwargs):
    """Send one search request to the Pushshift submission endpoint.

    Parameters
    ----------
    **kwargs
        Query-string parameters, forwarded unchanged to the API
        (e.g. subreddit, size, after, before, sort, sort_type).

    Returns
    -------
    The value stored under the ``'data'`` key of the JSON response when the
    status code is 200. For any other status code the code is printed and
    ``None`` is returned.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within the timeout.
    """
    res = requests.get("https://api.pushshift.io/reddit/submission/search/",
                       params=kwargs,
                       headers=headers,
                       timeout=30)  # without a timeout a stalled connection blocks forever
    if res.status_code == 200:
        data = res.json()
        return data['data']

    # Report the failing status; callers treat a falsy result as "stop"
    print(res.status_code)
    return None

count = 0

# Collect up to 2,000 posts as long as there are posts to collect.
# Progress is measured in unique `created_utc` values, so posts that share a
# timestamp count only once toward the 1,900 threshold.
while len(set(posts_data_cv['created_utc'])) <= 1900:
    print(count)
    count += 100

    posts = get_submissions(subreddit=subreddit,
                            size=100,
                            after=after, #pulls submissions only after this date
                            before=before, #pulls submissions only before this date
                            sort='asc', #returns data with earliest date first
                            sort_type='created_utc')
    # An empty batch or a failed request (None) ends the collection
    if not posts:
        break

    for post in posts:
        # Keep track of position for the next call in while loop
        after = post['created_utc']

        # Read every field before appending anything, so a post with an
        # unexpected missing key cannot leave the lists with unequal lengths
        row = {field: post[field] for field in posts_data_cv if field != 'selftext'}
        # Posts without a `selftext` key get the string placeholder "NaN"
        row['selftext'] = post.get('selftext', "NaN")

        # Append info to posts_data dict
        for field, value in row.items():
            posts_data_cv[field].append(value)

    # Pause between requests
    time.sleep(1)

# Save posts to dataframe: each key of posts_data_cv becomes a column.
# pandas raises a ValueError here if the lists have different lengths.
coronavirus = pd.DataFrame(posts_data_cv)

# Create `timestamp` column with `created_utc` translated into readable time
def get_date(created, tz=None):
    """Convert a Unix epoch timestamp (seconds) into a ``datetime``.

    Parameters
    ----------
    created : int or float
        Seconds since the Unix epoch, e.g. a ``created_utc`` value.
    tz : datetime.tzinfo, optional
        Timezone of the result. With the default ``None`` the result is a
        naive datetime in the local timezone of the machine running the
        code. Pass ``dt.timezone.utc`` for a timezone-aware UTC value that
        does not depend on the machine.

    Returns
    -------
    datetime.datetime
    """
    return dt.datetime.fromtimestamp(created, tz=tz)

# get_date is applied without a timezone, so the values are naive datetimes
# in the local timezone of the machine that runs this cell
_timestamp = coronavirus['created_utc'].apply(get_date)
# assign returns a new dataframe that carries the extra `timestamp` column
coronavirus = coronavirus.assign(timestamp = _timestamp)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900


In [None]:
# if arrays are unequal lengths
# coronavirus = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in posts_data_cv.items()]))

In [11]:
# Confirm how many posts (rows) and fields (columns) were collected
coronavirus.shape

(1999, 9)

In [13]:
# Evaluate r/CoronavirusUS dataframe: show the first rows that were collected
coronavirus.head()

Unnamed: 0,created_utc,url,full_link,id,num_comments,title,selftext,subreddit,timestamp
0,1611140877,https://denver.cbslocal.com/2021/01/19/doublin...,https://www.reddit.com/r/CoronavirusUS/comment...,l174fa,22,Doubling Up Masks Creates ‘Obstacle Course’ Fo...,,CoronavirusUS,2021-01-20 06:07:57
1,1611141286,https://www.reddit.com/r/CoronavirusUS/comment...,https://www.reddit.com/r/CoronavirusUS/comment...,l177oa,5,Covid 19 sweating,Every night since after the first week of covi...,CoronavirusUS,2021-01-20 06:14:46
2,1611148694,https://sweepmama.com/2021-how-to-stay-safe-fr...,https://www.reddit.com/r/CoronavirusUS/comment...,l18zuu,0,COVID-19 2021 - How to stay safe from COVID-19...,,CoronavirusUS,2021-01-20 08:18:14
3,1611151980,https://v.redd.it/adcbnpbdwec61,https://www.reddit.com/r/CoronavirusUS/comment...,l19xor,11,64% of Doomers Don’t Believe Fauci Said This:,,CoronavirusUS,2021-01-20 09:13:00
4,1611153348,https://www.freep.com/story/news/health/2021/0...,https://www.reddit.com/r/CoronavirusUS/comment...,l1acqx,54,"Nearly 12,000 doses of Moderna COVID-19 vaccin...",,CoronavirusUS,2021-01-20 09:35:48


In [14]:
# Check the data type pandas inferred for each column
coronavirus.dtypes

created_utc              int64
url                     object
full_link               object
id                      object
num_comments             int64
title                   object
selftext                object
subreddit               object
timestamp       datetime64[ns]
dtype: object

In [16]:
# # export to csv
# filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
# coronavirus.to_csv('/Users/ericrodriguez/Documents/Projects/project 3 - nlp reddit/data/{}_{}.csv'.format(subreddit, filetime), index=False)