In [1]:
import math
import os
import string
import sys
import time

import pandas as pd
import praw
import requests

# Reddit API credentials are read from the environment rather than being stored in the notebook.
# Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running; a KeyError here means one is missing.
# Any credentials that were previously committed in this file should be treated as leaked and revoked.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='nsscraper',
)
# Name of the subreddit that the rest of the notebook scrapes.
subreddit_name = 'NationalServiceSG'

def submissions_pushshift_praw(subreddit, start=None, end=None, limit=100, extra_query=""):
    """
    Return a list of PRAW submission objects posted to a subreddit during a particular period.
    This function serves as a replacement for the now deprecated PRAW `submissions()` method.

    Submission IDs are looked up through the Pushshift search API; each ID is then turned into
    a PRAW submission object with the module-level `reddit` client.

    :param subreddit: A subreddit name to fetch submissions from.
    :param start: A Unix time integer. Posts fetched will be AFTER this time. (default: None)
    :param end: A Unix time integer. Posts fetched will be BEFORE this time. (default: None)
    :param limit: There needs to be a defined limit of results (default: 100), or Pushshift will return only 25.
    :param extra_query: A query string is optional. If an extra_query string is not supplied,
                        the function will just grab everything from the defined time period. (default: empty string)
    :return: A list of PRAW submission objects, in the order Pushshift returned them
             (the request asks for `sort_type=score`, `sort=asc`).
    :raises requests.HTTPError: If Pushshift responds with an error status code.

    NOTE(review): only a single request is made; the service appears to cap the number of
    results per request regardless of `limit` -- confirm, and paginate if more are needed.

    For more information on PRAW, see: https://github.com/praw-dev/praw
    For more information on Pushshift, see: https://github.com/pushshift/api
    """
    matching_praw_submissions = []

    # Default time values if none are defined (credit to u/bboe's PRAW `submissions()` for this section)
    # NOTE(review): 28800 s (8 h) is added to both bounds, so the effective window begins
    # 8 hours after `start`. Kept unchanged from the original -- confirm this is intended.
    utc_offset = 28800
    now = int(time.time())
    start = max(int(start) + utc_offset if start else 0, 0)
    end = min(int(end) if end else now, now) + utc_offset

    # Let requests build the query string so that every value (notably extra_query) is URL-encoded.
    search_params = {
        'subreddit': subreddit,
        'after': start,
        'before': end,
        'sort_type': 'score',
        'sort': 'asc',
        'limit': limit,
        'q': extra_query,
    }

    # Get the data from Pushshift as JSON. The timeout stops the notebook hanging forever on a
    # stalled connection, and raise_for_status surfaces HTTP errors instead of a confusing
    # JSON/KeyError further down.
    retrieved_data = requests.get(
        'https://api.pushshift.io/reddit/submission/search/',
        params=search_params,
        timeout=30,
    )
    retrieved_data.raise_for_status()
    returned_submissions = retrieved_data.json()['data']

    # Convert each returned record to a PRAW submission object. Progress is reported as
    # "items done out of total", so the total is never zero inside the loop (the previous
    # `len - 1` total divided by zero when exactly one submission was returned).
    total = len(returned_submissions)
    for done, submission in enumerate(returned_submissions, start=1):
        matching_praw_submissions.append(reddit.submission(id=submission['id']))
        progress(done, total, status='Collecting posts')

    # Return all PRAW submissions that were obtained.
    return matching_praw_submissions

# Progress bar
def progress(count, total, status=''):
    """
    Write a single-line text progress bar to stdout.

    The line ends with a carriage return (no newline), so successive calls overwrite
    each other in the console.

    :param count: Number of items processed so far.
    :param total: Total number of items. A value of zero or less is treated as
                  "nothing left to do" and drawn as a full bar instead of dividing by zero.
    :param status: Optional text shown after the percentage.
    """
    bar_len = 60
    if total > 0:
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
    else:
        # Callers may pass total == 0 (e.g. `len(items) - 1` with a single item).
        filled_len = bar_len
        percents = 100.0
    # Keep the bar exactly bar_len characters wide even if count falls outside [0, total].
    filled_len = max(0, min(bar_len, filled_len))
    bar = '=' * filled_len + '-' * (bar_len - filled_len)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()  # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)

Version 7.1.4 of praw is outdated. Version 7.2.0 was released Wednesday February 24, 2021.


In [10]:
# Pull every submission made since Mon Jan 01 2018 00:00:00 GMT+0800 (Singapore Standard Time).
# (This call took the place of: sub = reddit.subreddit(subreddit_name).new(limit=100))
extracted_posts = submissions_pushshift_praw(subreddit_name, start=1514736000, limit=50000)

post_columns = ['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'flair_id', 'flair', 'created']

post_rows = []
for submission in extracted_posts:
    # A submission without flair attributes gets the literal string "None" in both flair columns.
    try:
        flair_id, flair = submission.link_flair_template_id, submission.link_flair_text
    except AttributeError:
        flair_id, flair = "None", "None"

    post_rows.append([
        submission.title,
        submission.score,
        submission.id,
        submission.subreddit,
        submission.url,
        submission.num_comments,
        submission.selftext,
        flair_id,
        flair,
        submission.created,
    ])

# One row per submission; `selftext` is stored under the 'body' column.
posts = pd.DataFrame(post_rows, columns=post_columns)



In [11]:
# Display the scraped submissions DataFrame built in the previous cell.
posts

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,flair_id,flair,created
0,Thigh muscle pains during NS,0,awrlp6,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,0,[deleted],,,1.551625e+09
1,SCS questions,1,b1f376,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,1,[deleted],,,1.552686e+09
2,MP vocation,2,b1cw7s,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,2,Heard that there is a new MP commander or sth ...,,,1.552672e+09
3,Research on the State of Sport in Singapore (y...,0,bbvkuh,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,2,[removed],,,1.554984e+09
4,"New to NS, Down PES?",0,bfct03,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,6,[deleted],8f1b799a-10be-11e9-a22a-0ebe2cfd4fcc,Question,1.555800e+09
...,...,...,...,...,...,...,...,...,...,...
95,Helpppp,4,dczwil,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,15,[deleted],8f1b799a-10be-11e9-a22a-0ebe2cfd4fcc,Question,1.570178e+09
96,Chances of getting into Commandos/Guards as a PR?,0,gzqle4,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,10,Quick background: I'm a pre-enlistee (will ent...,8f1b799a-10be-11e9-a22a-0ebe2cfd4fcc,Question,1.591749e+09
97,Questions about OOC without down pes,2,h89mjt,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,0,"As the title says, I have OOC’d from SCS pro t...",8f1b799a-10be-11e9-a22a-0ebe2cfd4fcc,Question,1.592091e+09
98,Issue regarding lightduty,0,hum3t2,NationalServiceSG,https://www.reddit.com/r/NationalServiceSG/com...,6,Hi can anyone give me an answer regarding ligh...,8f1b799a-10be-11e9-a22a-0ebe2cfd4fcc,Question,1.595284e+09


In [12]:
# Get all comments per post
comment_rows = []
post_ids = posts['id']
total_posts = len(post_ids)
for done, iden in enumerate(post_ids, start=1):
    post = reddit.submission(id=iden)
    # NOTE(review): limit=0 presumably discards the "load more comments" placeholders
    # without fetching them, so very deep/long threads may be incomplete -- confirm.
    post.comments.replace_more(limit=0)
    for c in post.comments.list():
        # `iden` links each comment back to the submission it belongs to (column 'op_id').
        comment_rows.append([c.score, c.id, c.subreddit, c.depth, c.body, c.created, iden])
    # Report "posts done out of total"; the previous `len(posts.index) - 1` total
    # divided by zero when `posts` contained exactly one row.
    progress(done, total_posts, status='Collecting comments')

all_comments = pd.DataFrame(comment_rows, columns=['score', 'id', 'subreddit', 'depth', 'body', 'created', 'op_id'])



In [13]:
# Display the collected comments DataFrame built in the previous cell.
all_comments

Unnamed: 0,score,id,subreddit,depth,body,created,op_id
0,2,eimu7o2,NationalServiceSG,0,"No, you're a rec on the 10th of March so you'r...",1.552730e+09,b1f376
1,2,eimu9mu,NationalServiceSG,0,"Yep, its stay in and maybe a bit more shiong w...",1.552730e+09,b1cw7s
2,-2,eikslm2,NationalServiceSG,0,Go to unit already then ask. Not all informati...,1.552672e+09,b1cw7s
3,1,ekmgiso,NationalServiceSG,0,Have you tried posting this to r/Singapore? \n...,1.555013e+09,bbvkuh
4,1,ekp8ufe,NationalServiceSG,1,"Yup, I tried but it was immediately taken down...",1.555090e+09,bbvkuh
...,...,...,...,...,...,...,...
589,1,fyr6g6e,NationalServiceSG,3,Got memo from the hospital doc mostly just sta...,1.595358e+09,hum3t2
590,1,fyr82z6,NationalServiceSG,4,what the MO did there is understandable if tha...,1.595360e+09,hum3t2
591,1,fyu8sfh,NationalServiceSG,5,I hope spf mo is same as saf mo regarding saf...,1.595420e+09,hum3t2
592,6,fyrsxnf,NationalServiceSG,0,Elaborate pls. I don't even know what the vide...,1.595373e+09,hv6asg


In [15]:
# Preprocess data
# Bodies that consist solely of one of these placeholder markers are blanked out
# in both the submissions table and the comments table.
placeholder_bodies = ["[removed]", "[deleted]"]
for frame in (posts, all_comments):
    is_placeholder = frame["body"].isin(placeholder_bodies)
    frame.loc[is_placeholder, "body"] = ""

# Persist each cleaned table as its own CSV (row index omitted).
for frame, csv_path in (
    (posts, '../ScrapedOutput/nationalservicesg_posts.csv'),
    (all_comments, '../ScrapedOutput/nationalservicesg_comments.csv'),
):
    frame.to_csv(csv_path, index=False)

In [16]:
# Combine the two datasets into one
# Titles that do not already end in a punctuation character get a full stop appended,
# so the title and the body read as separate sentences once they are joined.
last_char = posts["title"].str.strip().str[-1]
needs_full_stop = ~last_char.isin(list(string.punctuation))
posts.loc[needs_full_stop, "title"] = posts.loc[needs_full_stop, "title"] + "."
posts["content"] = posts["title"] + " " + posts["body"]

# Reduce both tables to a single 'content' column so they can be stacked.
cleaned_posts = posts.filter(items=["content"])
cleaned_comments = all_comments.rename(columns={"body": "content"}).filter(items=["content"])

# pd.concat is used because DataFrame.append was deprecated and later removed from pandas.
textdata = pd.concat([cleaned_posts, cleaned_comments], ignore_index=True)
# Saved through the same relative output folder as the other CSVs instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
textdata.to_csv('../ScrapedOutput/nationalservicesg_combineddata.csv', index=False)