In [1]:
### INSTALL & IMPORT DEPENDENCIES
import numpy as np
import pandas as pd
import datetime as dt
# pip install pmaw
import pmaw
from pmaw import PushshiftAPI

In [2]:
### FIELDS
# Subreddit names are written without the "r/" prefix.
# NOTE(review): "SMU_Singpore" was corrected to "SMU_Singapore"; the misspelt
# name would match no subreddit -- confirm this is the community intended.
subred_list = ["Singapore", "SGExams", "mentalhealthsg", "SingaporeRaw", "nationalservicesg", 
                "askSingapore", "NUS", "NTU", "SMU_Singapore", "SingaporePoly", "singaporefi"]
# Edit this according to the keywords of interest
keywords = ["depression", "mental illness", "unalive", "social anxiety", "loneliness", 
            "stress", "lonely", "isolation", "suicide", "abuse"]
# Submission attributes requested from the API for every post
fields = ["subreddit", "subreddit_subscribers", "title", "id", "author", 
            "created_utc", "num_comments", "score", "selftext", "url", "upvote_ratio"]
# Search window as epoch seconds. The datetimes are pinned to UTC: a naive
# datetime's .timestamp() is interpreted in the machine's local timezone, which
# would shift the window depending on where the notebook runs, while the
# "created_utc" field it is compared against is expressed in UTC.
start = int(dt.datetime(2019, 1, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())
end = int(dt.datetime(2021, 6, 30, 23, 59, tzinfo=dt.timezone.utc).timestamp())

In [3]:
### FIELDS TO TEST CODE
# Reduced configuration (one subreddit, one keyword) for quick test runs.
# Because this cell comes after the full configuration, running the notebook
# top to bottom lets it override the values defined there. Set USE_TEST_CONFIG
# to False to keep the full configuration when running all cells.
USE_TEST_CONFIG = True
if USE_TEST_CONFIG:
    subred_list = ["SGExams"]
    keywords = ["mental illness"]
    fields = ["subreddit", "subreddit_subscribers", "title", "id", "author", 
                "created_utc", "num_comments", "score", "selftext", "url", "upvote_ratio"]
    # Epoch seconds pinned to UTC; a naive datetime's .timestamp() would be
    # interpreted in the machine's local timezone instead.
    start = int(dt.datetime(2019, 1, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())
    end = int(dt.datetime(2021, 6, 30, 23, 59, tzinfo=dt.timezone.utc).timestamp())

In [4]:
### SCRAPE SUBREDDITS POSTS
# Initialize pushshift
# Creates the pmaw PushshiftAPI client with its default arguments. The same
# `reddit` object is passed to scrape_reddit() and is read as a module-level
# name by get_comments(), so this cell must run before either is called.
reddit = PushshiftAPI()
# Scrape function
def scrape_reddit(reddit, subred_list, keywords, fields, start, end):
    """Search every subreddit for every keyword and return the matching posts.

    One ``reddit.search_submissions`` request is issued per
    (subreddit, keyword) pair and the results are combined into one frame.

    Parameters
    ----------
    reddit : object
        API client exposing ``search_submissions(subreddit=, q=, fields=,
        after=, before=)``; the notebook passes a pmaw ``PushshiftAPI``.
    subred_list : list of str
        Subreddit names to search.
    keywords : list of str
        Search terms, passed one at a time as ``q``.
    fields : list of str
        Submission attributes to request for each post.
    start, end : int
        Bounds of the search window, passed as ``after`` and ``before``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct post, indexed 0..n-1. When nothing is found
        (including empty ``subred_list`` or ``keywords``) an empty frame with
        ``fields`` as its columns is returned.
    """
    # Initialize container
    frames = []
    # Get posts with keywords in any part of its contents at every subreddit
    for subred in subred_list:
        for keyword in keywords:
            submissions = reddit.search_submissions(subreddit = subred, q = keyword, 
                                                    fields = fields, 
                                                    after = start, before = end)
            submission_df = pd.DataFrame(submissions)
            # Queries without hits contribute nothing; skipping them also keeps
            # empty frames out of the concatenation below.
            if not submission_df.empty:
                frames.append(submission_df)
    # pd.concat raises ValueError on an empty list, so handle "no results" here
    if not frames:
        return pd.DataFrame(columns = fields)
    # Combine dataframes; ignore_index avoids repeated 0..k labels per query
    posts = pd.concat(frames, ignore_index = True)
    # A post that matches several keywords is returned once per keyword;
    # keep only its first occurrence.
    if "id" in posts.columns:
        posts = posts.drop_duplicates(subset = "id").reset_index(drop = True)
    # Output one dataframe
    return posts

In [5]:
# Run the scraper with the configuration and client defined in the cells above
data = scrape_reddit(
    reddit=reddit,
    subred_list=subred_list,
    keywords=keywords,
    fields=fields,
    start=start,
    end=end,
)
print(data)

27 result(s) available in Pushshift
Total:: Success Rate: 100.00% - Requests: 9 - Batches: 1 - Items Remaining: 0
                  author  created_utc      id  num_comments  score  \
0               neomanui   1560580758  c0ug4a            12     45   
1         247sadboihours   1563543843  cf7q6s            11      3   
2            Shadow_Vamp   1562318917  c9dz9d            14     38   
3             OnlyKotori   1551092311  auk0os             4      1   
4        icebergfeelings   1549273471  an05nz            45      1   
5          ilovecwookies   1576137109  e9knpc             6      1   
6          ilovecwookies   1576131211  e9jrbq             1      1   
7      Testing350500_rdt   1583551617  fepdk3             1      1   
8            shiro_hummm   1581667606  f3p4qb            14      1   
9               Fatepaws   1580791419  eykkqr             6      1   
10         thr0waway1503   1591481862  gy07j0             0      1   
11            spookitkat   1591141537  gvisdj 

In [6]:
### GET COMMENTS FROM THE RESPECTIVE POSTS
def get_comments(data, api=None):
    """Collect the comment bodies for every post in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Posts frame with an "id" column holding the submission ids.
    api : object, optional
        Client exposing ``search_submission_comment_ids(ids=)`` and
        ``search_comments(ids=, fields=)``. When omitted, the notebook-level
        ``reddit`` client is used, which matches the previous behaviour.

    Returns
    -------
    list of list
        One inner list per row of ``data``, in row order, holding the "body"
        value of each comment returned for that post (``None`` for a comment
        record without a "body" key). Posts without comments give ``[]``.
    """
    client = reddit if api is None else api
    # Initialize container
    comments = []
    for post_id in data["id"].tolist():
        comment_ids = list(client.search_submission_comment_ids(ids = post_id))
        # Nothing to look up: record an empty list instead of sending a
        # comment search with an empty id filter.
        if not comment_ids:
            comments.append([])
            continue
        found = client.search_comments(ids = comment_ids, fields = ["body"])
        # We want to get list of lists for "comments" container
        comments.append([comment.get("body") for comment in found])
    return comments

In [7]:
# Fetch the comment bodies for every scraped post and attach them to the
# posts frame as a "comments" column (one list per row); the list is also
# kept under the name `comments`.
comments = data["comments"] = get_comments(data)
print(data)

Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items Remaining: 0
Total:: Success Rate: 100.00% - Requests: 1 - Batches: 1 - Items

In [None]:
### EXPORT POSTS DATA
# Export dataframe as csv. The path is relative, so the file is written to the
# kernel's current working directory and the notebook no longer depends on (or
# exposes) a user-specific absolute path. Edit OUTPUT_CSV to write elsewhere.
OUTPUT_CSV = "reddit_posts.csv"
data.to_csv(OUTPUT_CSV, index = False, header = True)

In [None]:
### HELPFUL RESOURCES
# https://github.com/mattpodolak/pmaw
# https://github.com/pushshift/api