In [None]:
import requests
import pandas as pd
import time
import random

In [None]:
# JSON listing endpoint for the r/boardgames subreddit; reused by every request below.
url = 'https://www.reddit.com/r/boardgames.json'

In [None]:
# First attempt: request the listing with the library's default user agent.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
res = requests.get(url, timeout=10)

In [None]:
# HTTP status code of the request sent with the default user agent (200 means success).
res.status_code

When you open https://www.reddit.com/r/boardgames.json in a browser, Reddit can see from the request's user agent that, for example, a Chrome browser on a Mac is trying to access the address. Python, however, sends its own default user agent. Since there are so many scripts out there that are already 'hitting' Reddit's API with that default, Reddit is basically shutting out all Python scripts that use it.

We will change our request a little bit to make it not use the default user agent. 

In [None]:
# Retry with a custom User-agent header so the request is not sent with the
# library default. The timeout keeps a stalled connection from hanging the kernel.
res = requests.get(url, headers={'User-agent': 'Pony Inc 1.0'}, timeout=10)

In [None]:
# HTTP status code of the request sent with the custom user agent (200 means success).
res.status_code

In [None]:
# Decode the JSON response body into Python objects for exploration.
reddit_dict = res.json()

In [None]:
# Dump the entire decoded payload; the output is long and only useful for a first look.
print(reddit_dict)

In [None]:
# Top-level keys of the payload.
reddit_dict.keys()

In [None]:
# The 'kind' field of the top-level object.
reddit_dict['kind']

In [None]:
# The 'data' field holds the actual listing content explored in the cells below.
reddit_dict['data']

In [None]:
# Keys available inside the listing's 'data' object.
reddit_dict['data'].keys()

The most important keys are `children` and `after`.

In [None]:
# 'children' is the sequence of post entries on this page.
reddit_dict['data']['children']

In [None]:
# Number of post entries returned on this page.
len(reddit_dict['data']['children'])

In [None]:
# The first entry on the page.
reddit_dict['data']['children'][0]

In [None]:
# Keys of a single entry.
reddit_dict['data']['children'][0].keys()

In [None]:
# The 'kind' field of the first entry.
reddit_dict['data']['children'][0]['kind']

In [None]:
# The 'data' field of the first entry holds that post's attributes.
reddit_dict['data']['children'][0]['data']

In [None]:
# Subreddit the first post belongs to.
reddit_dict['data']['children'][0]['data']['subreddit']

The cell directly above gives you the class label, aka your target.

In [None]:
# Title of the first post.
reddit_dict['data']['children'][0]['data']['title']

That is the title of the first post.

In [None]:
# 'selftext' of the first post (presumably the post's body text; confirm against the output).
reddit_dict['data']['children'][0]['data']['selftext']

We want to get all these posts into a Pandas DataFrame and thereafter we can save it to a CSV.

In [None]:
# Keep only the inner 'data' payload of each entry on this page.
posts = [
    child['data']
    for child in reddit_dict['data']['children']
]

In [None]:
# Build a DataFrame with one row per collected post and display it.
pd.DataFrame(posts)

In [None]:
# Save the first page of posts. index=False matches the other CSV exports in this
# notebook and avoids a spurious unnamed index column when the file is read back.
pd.DataFrame(posts).to_csv('posts.csv', index=False)

In [None]:
# Pagination token returned with this page.
reddit_dict['data']['after']

The value of `after` is the `name` of the last post on this page.

In [None]:
# The 'name' column of the collected posts, for comparison with the 'after' token.
pd.DataFrame(posts)['name']

In [None]:
# Show the 'after' token again to compare it with the 'name' values.
reddit_dict['data']['after']

Appending `after` to the original URL as a query parameter gives the new URL that returns the next 25 posts.

In [None]:
# Compose the next-page URL by passing the token as the 'after' query parameter.
url + '?after=' + reddit_dict['data']['after']

## Looping through the posts, 25 posts at a time

In [None]:
# Collect several pages of posts by following the listing's `after` token.
posts = []
after = None
n_pages = 4  # number of pages to request (25 posts per page)

for page in range(n_pages):
    # The first request carries no token; later requests resume after the
    # last post already seen.
    if after is None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    # The timeout keeps a stalled connection from hanging the kernel indefinitely.
    res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'}, timeout=10)

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']

    # A missing token means the listing is exhausted. Continuing would request
    # the first page again and append duplicate posts. There is also nothing to
    # wait for once the final planned page has been fetched.
    if after is None or page == n_pages - 1:
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,60)
    print(sleep_duration)
    time.sleep(sleep_duration)

In [None]:
# Collect several pages of posts and checkpoint them to CSV after every page,
# so an interrupted run still leaves everything fetched so far on disk.
posts = []
after = None
n_pages = 4  # number of pages to request (25 posts per page)

for page in range(n_pages):
    # The first request carries no token; later requests resume after the
    # last post already seen.
    if after is None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    # The timeout keeps a stalled connection from hanging the kernel indefinitely.
    res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'}, timeout=10)

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']

    # `posts` already holds every page fetched so far, so rewriting the file
    # from it is a complete checkpoint; nothing needs to be read back and merged.
    pd.DataFrame(posts).to_csv('boardgames.csv', index = False)

    # A missing token means the listing is exhausted. Continuing would request
    # the first page again and append duplicate posts. There is also nothing to
    # wait for once the final planned page has been fetched.
    if after is None or page == n_pages - 1:
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)

In [None]:
# Total number of posts collected by the loop above.
len(posts)

In [None]:
# Write every collected post to boardgames.csv, leaving out the DataFrame index.
pd.DataFrame(posts).to_csv('boardgames.csv', index = False)