# Project 3: Reddit data scraping

## Part 1 - Scraping

In [5]:
# Import libraries
import csv
import datetime as dt
import itertools
import json
import math
import os
import time

import numpy as np
import pandas as pd
import requests

In [3]:
# pushshift url template
# https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}
# max request size is 100!

## Data Collection

### Scraping data from mentalhealth subreddit

In [23]:
# Name of the subreddit whose submissions are requested in this section
subreddit = "mentalhealth"

In [25]:
# h/t stack overflow
# Accumulator for the scrape: one empty list per field kept from each
# submission. Every key starts with its own, separate list.
posts_data = {
    field: []
    for field in ('created_utc', 'url', 'id', 'num_comments',
                  'title', 'selftext', 'subreddit')
}

# Sent with every request so the client identifies itself
headers = {'User-agent': 'Reddit Post Collector'}

def get_submissions(**kwargs):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Parameters
    ----------
    **kwargs
        Query-string parameters, forwarded unchanged to the endpoint
        (this notebook passes subreddit, size, before, sort and sort_type).

    Returns
    -------
    list
        The value stored under the 'data' key of the JSON response. An empty
        list is returned when the status code is not 200 (the code is printed)
        or when the payload has no 'data' key, so callers always get a list.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within 30 seconds; without a timeout a
        stalled connection would block the notebook indefinitely.
    """
    res = requests.get("https://api.pushshift.io/reddit/submission/search/",
                       params=kwargs,
                       headers=headers,
                       timeout=30)
    if res.status_code != 200:
        print(res.status_code)
        return []
    return res.json().get('data', [])

before = None
count = 0

batch_size = 50            # posts requested per call
target_timestamps = 1200   # stop once this many distinct created_utc values are held

# Page backwards through the subreddit, newest first, until the target is
# reached or a request comes back empty.
# NOTE: the stop condition counts distinct `created_utc` values, not rows, so
# posts that share a timestamp can make the final row count exceed the target.
while len(set(posts_data['created_utc'])) < target_timestamps:
    print(count)
    count += batch_size

    posts = get_submissions(subreddit=subreddit,
                            size=batch_size,
                            before=before,
                            sort='desc',
                            sort_type='created_utc')
    if not posts:
        break

    for post in posts:
        # Read every field before appending anything, so a malformed post can
        # never leave the columns with different lengths. `created_utc` and
        # `id` are required; the remaining fields fall back to None when the
        # API response omits them.
        row = {
            'created_utc': post['created_utc'],
            'url': post.get('full_link'),
            'id': post['id'],
            'num_comments': post.get('num_comments'),
            'title': post.get('title'),
            'selftext': post.get('selftext'),
            'subreddit': post.get('subreddit'),
        }
        for column, value in row.items():
            posts_data[column].append(value)

    # Results are sorted newest-first, so the last post of the batch is the
    # oldest one; the next request asks for posts created before it.
    before = posts[-1]['created_utc']

    # Pause 3 seconds between consecutive requests
    time.sleep(3)

# Turn the dict of per-field lists into a DataFrame (one column per key);
# the name `posts_data` is rebound from the dict to the frame.
posts_data = pd.DataFrame(data=posts_data)

def get_date(created):
    """Translate a `created_utc` epoch value into a readable datetime.

    `created` is a Unix timestamp in seconds. The result is a naive
    `datetime` expressed in the local time zone of the machine running the
    notebook (no tz argument is passed to `fromtimestamp`).
    """
    readable = dt.datetime.fromtimestamp(created)
    return readable

# Run get_date over every `created_utc` value, then attach the converted
# values to a new frame as the `timestamp` column.
_timestamp = posts_data['created_utc'].apply(get_date)
posts_data = posts_data.assign(timestamp=_timestamp)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150


In [27]:
# Check how many rows and columns the scrape produced
posts_data.shape

(1200, 8)

In [28]:
# Preview the first rows of the collected posts
posts_data.head()

Unnamed: 0,created_utc,url,id,num_comments,title,selftext,subreddit,timestamp
0,1614791880,https://www.reddit.com/r/mentalhealth/comments...,lwz48u,8,What to do during an anxiety attack?,[deleted],mentalhealth,2021-03-03 12:18:00
1,1614791523,https://www.reddit.com/r/mentalhealth/comments...,lwyz3p,1,Even though I wanted someone to notice how I'm...,[deleted],mentalhealth,2021-03-03 12:12:03
2,1614790739,https://www.reddit.com/r/mentalhealth/comments...,lwyn5a,3,Does anyone ever feel like they have an episod...,"it doesn't happen al of the time, but theres a...",mentalhealth,2021-03-03 11:58:59
3,1614790227,https://www.reddit.com/r/mentalhealth/comments...,lwyfv8,4,Need some help and advice please,Hi everyone I’m 19F and this is my first ever ...,mentalhealth,2021-03-03 11:50:27
4,1614790100,https://www.reddit.com/r/mentalhealth/comments...,lwye2m,1,"Im trying to understand myself, but cant seem ...",I cant tell where to start... Im sorry that I ...,mentalhealth,2021-03-03 11:48:20


In [31]:
# Inspect the dtype pandas assigned to each column
posts_data.dtypes

created_utc              int64
url                     object
id                      object
num_comments             int64
title                   object
selftext                object
subreddit               object
timestamp       datetime64[ns]
dtype: object

In [45]:
# export to csv
# The output folder can be overridden with the REDDIT_DATA_DIR environment
# variable so the notebook also runs on other machines; when the variable is
# unset the original location is used.
data_dir = os.environ.get(
    'REDDIT_DATA_DIR',
    '/Users/ericrodriguez/Bootcamp/Submissions/Projects/project_3-master/data')

# File name is <subreddit>_<yymmdd_HHMMSS>.csv, stamped with the local time of the export
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
posts_data.to_csv(os.path.join(data_dir, '{}_{}.csv'.format(subreddit, filetime)), index=False)

### Scraping data from Coronavirus subreddit

In [65]:
# Name of the subreddit whose submissions are requested in this section
subreddit = "Coronavirus"

In [66]:
# Fresh accumulator for this section: one empty list per field kept from each
# submission. Every key starts with its own, separate list.
posts_data = {
    field: []
    for field in ('created_utc', 'url', 'id', 'num_comments',
                  'title', 'selftext', 'subreddit')
}

# Sent with every request so the client identifies itself
headers = {'User-agent': 'Reddit Post Collector'}

# NOTE: this name is also defined in the mentalhealth section; running this
# cell rebinds it.
def get_submissions(**kwargs):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Parameters
    ----------
    **kwargs
        Query-string parameters, forwarded unchanged to the endpoint
        (this notebook passes subreddit, size, before, sort and sort_type).

    Returns
    -------
    list
        The value stored under the 'data' key of the JSON response. An empty
        list is returned when the status code is not 200 (the code is printed)
        or when the payload has no 'data' key, so callers always get a list.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within 30 seconds; without a timeout a
        stalled connection would block the notebook indefinitely.
    """
    res = requests.get("https://api.pushshift.io/reddit/submission/search/",
                       params=kwargs,
                       headers=headers,
                       timeout=30)
    if res.status_code != 200:
        print(res.status_code)
        return []
    return res.json().get('data', [])

before = None
count = 0

batch_size = 50            # posts requested per call
target_timestamps = 1200   # stop once this many distinct created_utc values are held

# Page backwards through the subreddit, newest first, until the target is
# reached or a request comes back empty.
# NOTE: the stop condition counts distinct `created_utc` values, not rows, so
# posts that share a timestamp can make the final row count exceed the target.
while len(set(posts_data['created_utc'])) < target_timestamps:
    print(count)
    count += batch_size

    posts = get_submissions(subreddit=subreddit,
                            size=batch_size,
                            before=before,
                            sort='desc',
                            sort_type='created_utc')
    if not posts:
        break

    for post in posts:
        # Read every field before appending anything, so a malformed post can
        # never leave the columns with different lengths. `created_utc` and
        # `id` are required; the remaining fields fall back to None when the
        # API response omits them.
        row = {
            'created_utc': post['created_utc'],
            'url': post.get('full_link'),
            'id': post['id'],
            'num_comments': post.get('num_comments'),
            'title': post.get('title'),
            'selftext': post.get('selftext'),
            'subreddit': post.get('subreddit'),
        }
        for column, value in row.items():
            posts_data[column].append(value)

    # Results are sorted newest-first, so the last post of the batch is the
    # oldest one; the next request asks for posts created before it.
    before = posts[-1]['created_utc']

    # Pause 3 seconds between consecutive requests
    time.sleep(3)

# Turn the dict of per-field lists into a DataFrame (one column per key);
# the name `posts_data` is rebound from the dict to the frame.
posts_data = pd.DataFrame(data=posts_data)

# NOTE: this name is also defined in the mentalhealth section; running this
# cell rebinds it.
def get_date(created):
    """Translate a `created_utc` epoch value into a readable datetime.

    `created` is a Unix timestamp in seconds. The result is a naive
    `datetime` expressed in the local time zone of the machine running the
    notebook (no tz argument is passed to `fromtimestamp`).
    """
    readable = dt.datetime.fromtimestamp(created)
    return readable

# Run get_date over every `created_utc` value, then attach the converted
# values to a new frame as the `timestamp` column.
_timestamp = posts_data['created_utc'].apply(get_date)
posts_data = posts_data.assign(timestamp=_timestamp)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200


In [68]:
# Check how many rows and columns the scrape produced
posts_data.shape

(1250, 8)

In [69]:
# Preview the first rows of the collected posts
posts_data.head()

Unnamed: 0,created_utc,url,id,num_comments,title,selftext,subreddit,timestamp
0,1614894046,https://www.reddit.com/r/Coronavirus/comments/...,lxw6ds,0,Are Face Masks Effective? The Evidence.,,Coronavirus,2021-03-04 16:40:46
1,1614793983,https://www.reddit.com/r/Coronavirus/comments/...,lwzyb8,222,Texas vaccine rollout ranks 48th among states ...,,Coronavirus,2021-03-03 12:53:03
2,1614793915,https://www.reddit.com/r/Coronavirus/comments/...,lwzxe4,1,Texas Vaccine Rollout Ranks 48th Among States ...,[deleted],Coronavirus,2021-03-03 12:51:55
3,1614793266,https://www.reddit.com/r/Coronavirus/comments/...,lwznmt,3,"In The Biggest Jump of The Pandemic, Brazil Se...",,Coronavirus,2021-03-03 12:41:06
4,1614793259,https://www.reddit.com/r/Coronavirus/comments/...,lwzniu,0,Texas &amp; Mississippi Both Lift Mask Mandate...,,Coronavirus,2021-03-03 12:40:59


In [70]:
# export to csv
# The output folder can be overridden with the REDDIT_DATA_DIR environment
# variable so the notebook also runs on other machines; when the variable is
# unset the original location is used.
data_dir = os.environ.get(
    'REDDIT_DATA_DIR',
    '/Users/ericrodriguez/Bootcamp/Submissions/Projects/project_3-master/data')

# File name is <subreddit>_<yymmdd_HHMMSS>.csv, stamped with the local time of the export
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
posts_data.to_csv(os.path.join(data_dir, '{}_{}.csv'.format(subreddit, filetime)), index=False)

In [None]:
# TODO: potential updates to this code: make the date range and the subreddit configurable, and work out how to push through any request issues

# present_utc = something; look into UTC time conversion
# do it from the start of COVID to see how the mental health and USA COVID subreddits compare
# then do it up to a final UTC, or cap it at X posts
# don't forget to convert the dictionary to JSON after data collection
