# Project 3: Reddit data scraping

## Part 1 - Scraping

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import requests
import json
import csv
import time
import datetime as dt
import math
import itertools

In [2]:
# Pushshift submission-search URL template (the values are passed as query parameters):
# https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}
# Note: the maximum request size is 100 results per call.

## Data Collection

### Scraping data from mentalhealth subreddit

In [3]:
# Name of the subreddit whose submissions are collected below
subreddit = "mentalhealth"

In [4]:
# Time window for the scrape, expressed as Unix epoch seconds.
# Each bound is derived from an explicit UTC datetime so it can be checked by eye.
# 1611140400 = 2021-01-20 11:00 UTC (06:00 at UTC-05:00)
after = int(dt.datetime(2021, 1, 20, 11, 0, tzinfo=dt.timezone.utc).timestamp())
# 1618912800 = 2021-04-20 10:00 UTC (06:00 at UTC-04:00)
before = int(dt.datetime(2021, 4, 20, 10, 0, tzinfo=dt.timezone.utc).timestamp())

In [17]:
# h/t stack overflow
# Accumulator for the scrape: one independent, initially empty list per
# submission field that is kept.
posts_data = {
    field: []
    for field in (
        'created_utc',
        'url',
        'full_link',
        'id',
        'num_comments',
        'title',
        'selftext',
        'subreddit',
    )
}

# Request headers sent with every API call
headers = {'User-agent': 'Reddit Post Collector'}

# Set up function to return submission data
def get_submissions(**kwargs):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        **kwargs: Sent unchanged as URL query parameters (this notebook
            passes subreddit, size, after, before, sort and sort_type).

    Returns:
        The value stored under the ``data`` key of the JSON response when
        the server answers with status 200. For any other status the code
        is printed and ``None`` is returned.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    res = requests.get("https://api.pushshift.io/reddit/submission/search/",
                       params=kwargs,
                       headers=headers,
                       timeout=30)  # without a timeout a stalled connection blocks forever
    if res.status_code != 200:
        print(res.status_code)
        return None
    return res.json()['data']

count = 0

# Upper bound on the number of posts to collect
max_posts = 2000

# Page through the window with a separate cursor so that `after` keeps the
# configured start of the window. Overwriting `after` would make later
# cells (and re-runs of this one) start where this scrape stopped.
cursor = after

# Collect up to 2,000 posts as long as there are posts to collect.
# Rows are counted directly: counting distinct timestamps would let posts
# that share a second push the total past the cap.
while len(posts_data['id']) < max_posts:
    print(count)
    count += 100

    posts = get_submissions(subreddit=subreddit,
                            size=100,
                            after=cursor, #pulls submissions only after this date
                            before=before, #pulls submissions only before this date
                            sort='asc', #returns data with earliest date first
                            sort_type='created_utc')
    if not posts:
        break

    for post in posts:
        # Append info to posts_data dict; a field absent from a record is
        # stored as None instead of aborting the whole scrape
        for field in posts_data:
            posts_data[field].append(post.get(field))

    # Results arrive oldest first, so the last record marks where the next
    # request resumes
    cursor = posts[-1]['created_utc']

    # Pause between requests
    time.sleep(1)

# Save posts to dataframe
posts_data = pd.DataFrame(posts_data)

# Create `timestamp` column with `created_utc` translated into readable time
def get_date(created, tz=None):
    """Convert a Unix epoch timestamp into a ``datetime``.

    Args:
        created: Seconds since the Unix epoch.
        tz: Optional ``tzinfo``. When omitted, the result is a naive datetime
            in the local timezone of the machine running the code, so the
            same epoch value renders differently on different machines.
            Pass ``dt.timezone.utc`` for an aware, machine-independent value.

    Returns:
        The ``datetime`` corresponding to ``created``.
    """
    return dt.datetime.fromtimestamp(created, tz=tz)

# Convert every `created_utc` value with get_date, then attach the result
# to the DataFrame as a new `timestamp` column
_timestamp = posts_data['created_utc'].map(get_date)
posts_data = posts_data.assign(timestamp=_timestamp)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900


In [18]:
# Show the (rows, columns) dimensions of the scraped DataFrame
posts_data.shape

(2000, 9)

In [20]:
# Preview the first rows of the scraped DataFrame
posts_data.head()

Unnamed: 0,created_utc,url,full_link,id,num_comments,title,selftext,subreddit,timestamp
1995,1613341829,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,ljz25o,4,really quite sad and lonely,I feel extremely lonely and sad. I’ve felt lik...,mentalhealth,2021-02-14 17:30:29
1996,1613342047,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,ljz4p5,6,always feel like i'm being watched,which sounds kind of odd. it's... not exactly ...,mentalhealth,2021-02-14 17:34:07
1997,1613342047,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,ljz4p6,6,How can I inform my parent on how to support m...,My father is having difficulties understanding...,mentalhealth,2021-02-14 17:34:07
1998,1613342356,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,ljz882,2,Discrimination of Young People with Mental Hea...,"Hello, I am currently researching stigmas asso...",mentalhealth,2021-02-14 17:39:16
1999,1613342754,https://www.reddit.com/r/mentalhealth/comments...,https://www.reddit.com/r/mentalhealth/comments...,ljzcnn,2,I haven't told anyone this before.,\n\nI am a full-grown adult (33 years old) ...,mentalhealth,2021-02-14 17:45:54


In [None]:
# List the dtype of each column
posts_data.dtypes

In [None]:
# Write the scraped posts to a CSV named after the subreddit plus a
# local-time stamp (yymmdd_HHMMSS); the DataFrame index is not written.
# NOTE(review): the destination is an absolute, machine-specific path.
filetime = time.strftime("%y%m%d_%H%M%S")
posts_data.to_csv(
    '/Users/ericrodriguez/Projects/Submissions/Projects/project_3-nlp_reddit/data/{}_{}.csv'.format(subreddit, filetime),
    index=False,
)

### Scraping data from CoronavirusUS subreddit

In [None]:
# Name of the subreddit whose submissions are collected below
subreddit = "CoronavirusUS"

In [None]:
# h/t stack overflow
# Accumulator for the scrape: one independent, initially empty list per
# submission field that is kept.
posts_data = {
    field: []
    for field in (
        'created_utc',
        'url',
        'full_link',
        'id',
        'num_comments',
        'title',
        'selftext',
        'subreddit',
    )
}

# Request headers sent with every API call
headers = {'User-agent': 'Reddit Post Collector'}

# Set up function to return submission data
def get_submissions(**kwargs):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        **kwargs: Sent unchanged as URL query parameters (this notebook
            passes subreddit, size, after, before, sort and sort_type).

    Returns:
        The value stored under the ``data`` key of the JSON response when
        the server answers with status 200. For any other status the code
        is printed and ``None`` is returned.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    res = requests.get("https://api.pushshift.io/reddit/submission/search/",
                       params=kwargs,
                       headers=headers,
                       timeout=30)  # without a timeout a stalled connection blocks forever
    if res.status_code != 200:
        print(res.status_code)
        return None
    return res.json()['data']

count = 0

# Upper bound on the number of posts to collect
max_posts = 2000

# Page through the window with a separate cursor so that this cell never
# overwrites `after`; re-running it then restarts from the same point.
# NOTE(review): `after` must still hold the start of the time window when
# this cell runs; re-run the time-parameters cell first if it was changed.
cursor = after

# Collect up to 2,000 posts as long as there are posts to collect.
# Rows are counted directly: counting distinct timestamps would let posts
# that share a second push the total past the cap.
while len(posts_data['id']) < max_posts:
    print(count)
    count += 100

    posts = get_submissions(subreddit=subreddit,
                            size=100,
                            after=cursor, #pulls submissions only after this date
                            before=before, #pulls submissions only before this date
                            sort='asc', #returns data with earliest date first
                            sort_type='created_utc')
    if not posts:
        break

    for post in posts:
        # Append info to posts_data dict; a field absent from a record is
        # stored as None instead of aborting the whole scrape
        for field in posts_data:
            posts_data[field].append(post.get(field))

    # Results arrive oldest first, so the last record marks where the next
    # request resumes
    cursor = posts[-1]['created_utc']

    # Pause between requests
    time.sleep(1)

# Save posts to dataframe
posts_data = pd.DataFrame(posts_data)

# Create `timestamp` column with `created_utc` translated into readable time
def get_date(created, tz=None):
    """Convert a Unix epoch timestamp into a ``datetime``.

    Args:
        created: Seconds since the Unix epoch.
        tz: Optional ``tzinfo``. When omitted, the result is a naive datetime
            in the local timezone of the machine running the code, so the
            same epoch value renders differently on different machines.
            Pass ``dt.timezone.utc`` for an aware, machine-independent value.

    Returns:
        The ``datetime`` corresponding to ``created``.
    """
    return dt.datetime.fromtimestamp(created, tz=tz)

# Convert every `created_utc` value with get_date, then attach the result
# to the DataFrame as a new `timestamp` column
_timestamp = posts_data['created_utc'].map(get_date)
posts_data = posts_data.assign(timestamp=_timestamp)

In [None]:
# Show the (rows, columns) dimensions of the scraped DataFrame
posts_data.shape

In [None]:
# Preview the first rows of the scraped DataFrame
posts_data.head()

In [None]:
# Write the scraped posts to a CSV named after the subreddit plus a
# local-time stamp (yymmdd_HHMMSS); the DataFrame index is not written.
# NOTE(review): the destination is an absolute, machine-specific path.
filetime = time.strftime("%y%m%d_%H%M%S")
posts_data.to_csv(
    '/Users/ericrodriguez/Projects/Submissions/Projects/project_3-nlp_reddit/data/{}_{}.csv'.format(subreddit, filetime),
    index=False,
)