# Initial Data Collection

## Import necessary libraries and initialize Reddit object with Reddit developer credentials

In [None]:
import time
from pathlib import Path

import pandas as pd
import praw

# Replace the placeholder values below with your own Reddit developer credentials
credentials = {
    "client_id": "my-client-id",
    "client_secret": "my-client-secret",
    "password": "my-password",
    "user_agent": "my-user-agent",
    "username": "my-username",
}
reddit = praw.Reddit(**credentials)

## Set search criteria

In [None]:
# Every sort order and time window offered by Reddit search
sort = ["relevance", "hot", "top", "new", "comments"]
past = ["all", "day", "hour", "month", "week", "year"]

# Name of the subreddit to search
sub = "sgexams"

# Cap on the number of results per search; PRAW allows up to 1000
limit = 1_000

# Unix-time bounds on post creation; posts outside [after, before] are skipped
after = 1_546_272_000  # Jan 1 2019, 00:00 at UTC+8
before = 10**18  # so large that it effectively sets no upper bound

# Put your search keywords in this list
word_set = []

## Retrieve unique IDs of posts

In [None]:
# Collect the IDs of all posts that match any keyword in `word_set`.
# Each keyword is searched under every sort order / time filter combination,
# and only posts created within [after, before] are kept. Using a set
# removes the duplicates returned by overlapping queries.
MAX_SEARCH_ATTEMPTS = 3  # tries per (keyword, sort, time filter) query
SEARCH_RETRY_WAIT = 60  # seconds to pause before retrying a failed query

ids = set()
subreddit = reddit.subreddit(sub)
for word in word_set:
    cur = len(ids)
    print(word)
    for s in sort:
        for p in past:
            for attempt in range(1, MAX_SEARCH_ATTEMPTS + 1):
                try:
                    results = subreddit.search(
                        word, sort=s, limit=limit, time_filter=p
                    )
                    for post in results:
                        if after <= post.created_utc <= before:
                            ids.add(post.id)
                    break
                except Exception as exc:
                    # `Exception` (not a bare except) so Ctrl-C still stops the
                    # cell; the error is shown because it may not be a rate limit.
                    if attempt == MAX_SEARCH_ATTEMPTS:
                        print(
                            f"Giving up on {word!r} "
                            f"(sort={s}, time_filter={p}): {exc!r}"
                        )
                    else:
                        # Retry the same query after the pause instead of
                        # silently dropping it; re-adding IDs to the set is harmless.
                        print(
                            f"Search error ({exc!r}), "
                            f"waiting {SEARCH_RETRY_WAIT} seconds"
                        )
                        time.sleep(SEARCH_RETRY_WAIT)
    print(f"New IDs found: {len(ids)-cur}")

## Use unique IDs to retrieve post/comment attributes

In [None]:
# Freeze the collected IDs into an indexable sequence and start with empty
# result tables that the retrieval cell appends to.
id_arr = [*ids]
post_df, comment_df = pd.DataFrame(), pd.DataFrame()

### Break the ID array into subsets of 1000 each to avoid exceeding the rate limit
#### Run the cell below repeatedly until an IndexError is raised

In [None]:
# Fetch the next batch of posts (and all of their comments) from `id_arr`.
# The batch starts at len(post_df), i.e. right after the posts already stored,
# so re-running this cell resumes where the previous run stopped.
# An IndexError is raised once no unprocessed IDs remain.
BATCH_SIZE = 1000  # posts processed per run of this cell
MAX_ATTEMPTS = 3  # tries per retried step before giving up


def _author_name(item):
    """Return the username of ``item``'s author, or None if it has no author."""
    return item.author.name if item.author else None


def _retry(action, *args, message, wait_seconds, **kwargs):
    """Call ``action(*args, **kwargs)``, retrying after a pause when it raises.

    Before each pause, ``message`` and the caught exception are printed.
    Once MAX_ATTEMPTS calls have failed, the last exception is re-raised.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return action(*args, **kwargs)
        except Exception as exc:
            if attempt == MAX_ATTEMPTS:
                raise
            print(f"{message} ({exc!r})")
            time.sleep(wait_seconds)


start = len(post_df)
batch_ids = id_arr[start:start + BATCH_SIZE]
if not batch_ids:
    # Keep the documented stop signal, but only after every batch
    # (including a final partial one) has been stored.
    raise IndexError("No unprocessed post IDs remain")

post_data = []
comment_data = []

try:
    for post_id in batch_ids:
        post = reddit.submission(post_id)

        # Retry instead of falling through with a stale or undefined `name`.
        name = _retry(
            _author_name, post,
            message="Replace name error, waiting 10 seconds",
            wait_seconds=10,
        )
        post_row = {
            'id': post.name, 'title': post.title,
            'body': post.selftext, 'flair': post.link_flair_text,
            'created_utc': post.created_utc,
            'author': name,
            'score': post.score, 'upvote_ratio': post.upvote_ratio,
            'num_comments': post.num_comments
        }

        try:
            _retry(
                post.comments.replace_more, limit=None,
                message="Replace more error, waiting 60 seconds",
                wait_seconds=60,
            )
        except Exception as exc:
            # Best effort: keep the comments loaded so far, but say so.
            print(
                f"Could not fully expand comments for {post.name} ({exc!r}); "
                "keeping the comments loaded so far"
            )

        comment_rows = []
        for comment in post.comments.list():
            # Unexpanded placeholders carry no comment data.
            if isinstance(comment, praw.models.reddit.more.MoreComments):
                continue
            comment_rows.append({
                'id': comment.name, 'parent_id': comment.parent_id,
                'body': comment.body, 'link_id': comment.link_id,
                'created_utc': comment.created_utc,
                'author': _author_name(comment),
                'score': comment.score, "is_op": comment.is_submitter
            })

        # Commit a post together with its comments so that an interruption
        # never leaves a stored post without its comments.
        post_data.append(post_row)
        comment_data.extend(comment_rows)
finally:
    # Always merge what was collected, even if the batch stopped early,
    # so completed posts are not lost and the next run resumes after them.
    post_df1 = pd.DataFrame(post_data)
    comment_df1 = pd.DataFrame(comment_data)

    post_df = pd.concat([post_df, post_df1])
    comment_df = pd.concat([comment_df, comment_df1])

    print("Posts: ", post_df.shape)
    print("Comments: ", comment_df.shape)

## Save posts and comments datasets as .csv

In [None]:
# Write both datasets to the Datasets/ folder, creating it first so the
# export does not fail when the folder is missing.
output_dir = Path("Datasets")
output_dir.mkdir(parents=True, exist_ok=True)

post_df.to_csv(output_dir / "sgexams_posts.csv")
comment_df.to_csv(output_dir / "sgexams_comments.csv")