## Scraper for the r/Jokes subreddit; the scraped jokes will be used to train an NLP model

In [12]:
from collections import defaultdict
import datetime
from datetime import datetime
import pandas as pd
import praw
import time

# Test request of jokes subreddit, hot posts, limited
import r_auth

# Build the authenticated PRAW client; every credential is read from the
# local r_auth module so that no secret is hard-coded in this script.
reddit = praw.Reddit(
    client_id=r_auth.client_id,
    client_secret=r_auth.client_secret,
    user_agent=r_auth.user_agent,
    username=r_auth.username,
    password=r_auth.password,
)



# Lazy listing of the current "hot" posts on r/Jokes, capped at 100 items
submissions = reddit.subreddit('Jokes').hot(limit=100)


# Length of the scraping window, in seconds.
# NOTE(review): 2_628_288 s is roughly one average month, but the value is
# multiplied by 100, so despite its name this spans about 100 months --
# confirm which window is intended.
seconds_one_month = 2_628_288 * 100

# "Now" as whole epoch seconds derived from the local-time struct
TODAY = datetime.today()
timestamp = time.mktime(TODAY.timetuple())

# Oldest creation time that would fall inside the window.
# NOTE(review): nothing in the visible code compares posts against this
# cutoff -- confirm whether a date filter was meant to be applied.
one_month_timeframe = timestamp - seconds_one_month


# Accumulate one list per output column; defaultdict(list) creates each
# column the first time a value is appended to it.
joke_dict = defaultdict(list)

for joke in submissions:
    # Only keep jokes that scored above 200
    if joke.score <= 200:
        continue

    joke_dict['id'].append(joke.id)
    # created_utc is the documented UTC epoch field of a submission, which is
    # what utcfromtimestamp expects; the undocumented `created` field is not
    # guaranteed to be UTC.
    joke_dict['date'].append(
        datetime.utcfromtimestamp(joke.created_utc).strftime('%Y-%m-%d')
    )
    joke_dict['title'].append(joke.title)
    # Strip backslashes from the post body
    joke_dict['body'].append(joke.selftext.replace('\\', ''))
    joke_dict['score'].append(joke.score)

# Convert the collected columns to a DataFrame for feeding to Dynamo.
# The column list is pinned so the frame keeps the same schema even when no
# submission passed the score filter (an empty dict would give no columns).
jokes_scrape = pd.DataFrame(
    joke_dict,
    columns=['id', 'date', 'title', 'body', 'score'],
)



# DYNAMO SECTION

# Create records of jokes in AWS
import boto3

# Set AWS resource type to dynamo
dynamodb = boto3.resource('dynamodb')

# Set AWS dynamo table; the name must match the table shown in the Dynamo dash
table = dynamodb.Table('jokes_table')


# Write one item per scraped joke. batch_writer buffers the puts and sends
# them to DynamoDB in batches, flushing whatever is left when the block exits.
# (iterrows may be too slow for larger datasets, but fine for 100-at-a-time
# records.)
with table.batch_writer() as batch:
    for index, row in jokes_scrape.iterrows():
        batch.put_item(
            Item={
                'id': row['id'],
                'date': row['date'],
                'title': row['title'],
                'body': row['body'],
                # boto3's serializer does not accept numpy integer types;
                # cast explicitly rather than relying on how pandas happens
                # to box the value for this row.
                'score': int(row['score']),
            }
        )

Unnamed: 0,id,date,title,body,score
0,i3mmsn,2020-08-05,r/jokes has a discord and you need to join!,Over 17k members! Come see reposts in real ti...,976
1,ics15s,2020-08-20,Reposts...,"r/Jokes has a search feature, input the title ...",2621
2,ju1u41,2020-11-14,My wife told me to take the spider out instead...,"Went out, had a few drinks. Nice guy. He's a w...",25504
3,ju6m9v,2020-11-15,My wife asked me which of her friends I would ...,Apparently I’m not supposed to pick two of them.,496
4,jtrw7y,2020-11-14,It's a 5 minute walk from my house to the pub....,The difference is staggering.,35323
5,ju304q,2020-11-14,Not saying my Ex was fat,But it took a year for my memory foam mattress...,431
6,ju0pop,2020-11-14,My uncle is like a good love story,Very touching,419
7,jtsxii,2020-11-14,What is the most expensive video streaming ser...,College.,3141
8,jtobw6,2020-11-14,3 nuns die and go to Heaven,"At the Pearly Gates, St. Peter tells them that...",14144
9,jtzz4r,2020-11-14,Two men at an airport,"First man says, ""I can't find my wife."" \n\nSe...",493
