### A Python notebook for downloading per-subreddit aggregate submission counts for each day in a time range using the Pushshift Reddit API

In [85]:
import requests
import json
import pandas as pd

In [86]:
#http://api.pushshift.io/reddit/submission/search/?subreddit=gardening&aggs=created_utc&after=2020-05-01&before=2020-05-10&limit=0
def construct_url(subreddit, from_date, to_date):
    '''
    Constructs a Pushshift submission-search url that asks for the
    created_utc aggregation of one subreddit
    
    Args: subreddit - name of the subreddit we want the aggregates for
          from_date - sent as the `after` query parameter (e.g. '2020-05-01')
          to_date - sent as the `before` query parameter (e.g. '2020-05-10')
    Returns: the constructed url as a string, with query parameters in the order
             subreddit, aggs, after, before, limit
    '''
    # Local import: the notebook's import cell does not pull in urllib
    from urllib.parse import urlencode

    # urlencode percent-escapes each value, so characters such as '&', '#'
    # or spaces in an argument cannot break or truncate the query string.
    # Dict insertion order is preserved, keeping the original parameter order.
    query = urlencode({
        'subreddit': subreddit,
        'aggs': 'created_utc',
        'after': from_date,
        'before': to_date,
        'limit': 0,
    })
    return 'http://api.pushshift.io/reddit/submission/search/?' + query
     
    

In [106]:
def generate_csv(response, subreddit_name, output_dir=r'C:\Data'):
    '''
    Writes the created_utc aggregate data found in an API response to
    <output_dir>/<subreddit_name>.csv with the columns key, doc_count
    
    Args: response - response object for an aggregate url; it must provide
                     raise_for_status() and json()
          subreddit_name - name of the subreddit, used as the CSV file name
          output_dir - directory the CSV file is written to; it is created
                       when it does not exist yet
    
    Returns: a message string naming the subreddit and the output directory
    
    Raises: whatever response.raise_for_status() raises for a failed request;
            KeyError when the JSON has no ['aggs']['created_utc'] entry
    '''
    # Local import: the notebook's import cell does not pull in os
    import os

    # Stop on an error status instead of trying to parse an error body as data
    response.raise_for_status()

    # Get the aggregation buckets out of the JSON payload
    buckets = response.json()['aggs']['created_utc']

    # columns= fixes the column order and keeps both columns present
    # even when the bucket list is empty
    df = pd.DataFrame(buckets, columns=['key', 'doc_count'])

    # converting epoch time (seconds) to real time for data analysis
    df['key'] = pd.to_datetime(df['key'], unit='s')

    # writing the data to a CSV file
    os.makedirs(output_dir, exist_ok=True)
    file_name = '{subredditName}.csv'.format(subredditName=subreddit_name)
    df.to_csv(os.path.join(output_dir, file_name), index=False)

    # os.path.join(dir, '') appends the trailing path separator, so the
    # message ends with the directory followed by the platform separator
    return "CSV file has been downloaded for r/{subredditName} at {outputDir}".format(
        subredditName=subreddit_name, outputDir=os.path.join(output_dir, ''))

In [109]:
# Driver cell: download the aggregate counts of every listed subreddit
# for the given date range and write one CSV file per subreddit.

# NOTE: base_url is not referenced in this cell; construct_url builds the full url itself
base_url = 'https://api.pushshift.io/reddit/submission/search/?subreddit='
subreddits = ['gardening', 'recipes', 'embroidery', 'sewing', 'cooking', 'fitness', 'running', 'suggestmeabook', 'hiking', 'camping', 'nakedadventures', 'backpacking', 'baking', 'grilling',
             'stargazing', 'meditation', 'calligraphy', 'yoga', 'knitting', 'crocheting', 'bodyweightfitness', 'dataisbeautiful']

from_date = '2020-01-01'
to_date = '2020-06-10'

# Iterate the list defined above (the name `subreddits1` was never defined in this cell)
for sub in subreddits:
    url = construct_url(sub, from_date, to_date)

    try:
        # A timeout keeps one stalled request from hanging the whole run
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into HTTPError so the handler below can report them
        r.raise_for_status()
        print(generate_csv(r, sub))
    except requests.exceptions.HTTPError as errh:
        print ("Http Error:",errh)
    except requests.exceptions.ConnectionError as errc:
        print ("Error Connecting:",errc)
    except requests.exceptions.Timeout as errt:
        print ("Timeout Error:",errt)
    except requests.exceptions.RequestException as err:
        print ("OOps: Something Else",err)
    except (KeyError, ValueError) as errd:
        # Payload was not the expected aggregate JSON; report it and move on to the next subreddit
        print ("Unexpected response for r/{}:".format(sub), repr(errd))
