## Scraper for the r/Jokes subreddit, collecting jokes that will be used to train an NLP model

In [6]:
from collections import defaultdict
import datetime
from datetime import datetime
import pandas as pd
import praw
import time

In [30]:
# Scrape top-scoring posts from r/Jokes into the `jokes_scrape` DataFrame.
# The query uses the subreddit's *top* listing (not "hot"), capped at
# TOP_POST_LIMIT submissions.
import r_auth  # project-local module that supplies the Reddit API credentials

# Tunable parameters for the scrape
SUBREDDIT_NAME = 'Jokes'
TOP_POST_LIMIT = 100          # maximum number of submissions requested
MIN_SCORE = 2_000             # posts must score strictly above this to be kept
SECONDS_PER_MONTH = 2_628_288 # ~30.42 days
LOOKBACK_MONTHS = 100         # window is 100 months, not a single month

# Column order of the resulting DataFrame
JOKE_FIELDS = ['id', 'date', 'title', 'body', 'score']

reddit = praw.Reddit(client_id = r_auth.client_id,
                     client_secret = r_auth.client_secret,
                     user_agent = r_auth.user_agent,
                     username = r_auth.username,
                     password = r_auth.password)

# Lazy listing of the subreddit's top submissions
submissions = reddit.subreddit(SUBREDDIT_NAME).top(limit = TOP_POST_LIMIT)

# Oldest creation time (Unix epoch seconds) a post may have and still be kept
cutoff_timestamp = time.time() - SECONDS_PER_MONTH * LOOKBACK_MONTHS

# Column-oriented accumulator: one list per field, filled in lockstep
joke_dict = defaultdict(list)

for joke in submissions:
    # `created_utc` is the documented PRAW creation timestamp, so both the
    # cutoff comparison and the UTC date string are derived from it.
    if joke.created_utc > cutoff_timestamp and joke.score > MIN_SCORE:
        joke_dict['id'].append(joke.id)
        joke_dict['date'].append(time.strftime('%Y-%m-%d', time.gmtime(joke.created_utc)))
        joke_dict['title'].append(joke.title)
        # Strip backslashes from the post body
        joke_dict['body'].append((joke.selftext).replace('\\', ''))
        joke_dict['score'].append(joke.score)

# Passing `columns` keeps the schema stable even when no post passed the filter
jokes_scrape = pd.DataFrame(dict(joke_dict), columns = JOKE_FIELDS)

jokes_scrape.head()

Unnamed: 0,id,date,title,body,score
0,f6lii3,2020-02-20,Sad News: The founder of /r/jokes has passed away,"RIP Larry Tesler, the UI designer that created...",142734
1,7ekt23,2017-11-22,Calm down about the Net Neutrality thing...,Paying additional money to access certain site...,136357
2,6lfqep,2017-07-06,V,V\n\n*Edit: seems like the ctrl key on my keyb...,106418
3,coj45m,2019-08-10,If your surprised that Jeffrey Epstein commite...,Imagine how surprised he must have been.\n\nEd...,103656
4,9mf1cz,2018-10-08,A new Navy recruit has his first day on the su...,"He speaks with the officer, who assigns him hi...",98264
...,...,...,...,...,...
95,iwh8dh,2020-09-21,CAN ADMINS OF THIS SUBREDDIT REDDIT DO A BETTE...,"WE HAVE A NEW MEMBER, A WOMAN. SHE’S BEEN PRIV...",52787
96,5pds29,2017-01-22,Looks like Trump is keeping up Michelle's idea...,One day in office and he has thousands of peop...,52761
97,9asy4v,2018-08-28,If Queen Elizabeth accidentally farts during d...,Noble gases should have no reaction.,52760
98,jqtfx4,2020-11-09,*Nsfw* The military is cutting staff and decid...,"All of them are old, grizzled men who had seen...",52752


### Test script for creating a table in a DynamoDB instance

In [5]:
# NOTE: this entire cell is commented-out scaffolding; nothing here executes.
# It sketches creating a sample 'users' table (HASH key 'username', RANGE key
# 'last_name', both string-typed), which is not the 'jokes_table' that the
# upload cell further down writes to.
# NOTE(review): `dynamodb` is not assigned in this cell -- confirm it is
# defined before re-enabling any of this.

# # Create the DynamoDB table.
# table = dynamodb.create_table(
#     TableName='users',
#     KeySchema=[
#         {
#             'AttributeName': 'username',
#             'KeyType': 'HASH'
#         },
#         {
#             'AttributeName': 'last_name',
#             'KeyType': 'RANGE'
#         }
#     ],
#     AttributeDefinitions=[
#         {
#             'AttributeName': 'username',
#             'AttributeType': 'S'
#         },
#         {
#             'AttributeName': 'last_name',
#             'AttributeType': 'S'
#         },
#     ],
#     ProvisionedThroughput={
#         'ReadCapacityUnits': 5,
#         'WriteCapacityUnits': 5
#     }
# )

# # Wait until the table exists.
# table.meta.client.get_waiter('table_exists').wait(TableName='users')

# # Print out some data about the table.
# print(table.item_count)
# Expected to print 0 for a table that was just created and holds no items

0


### Script to put the scraped jokes into the DynamoDB table

In [32]:
# Upload every scraped joke in `jokes_scrape` to DynamoDB as one item per row
import boto3

# Fields copied from each DataFrame row into the stored item
item_fields = ('id', 'date', 'title', 'body', 'score')

# DynamoDB resource handle, then the target table looked up by its name
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('jokes_table')

# Row-wise iteration is slow for big frames but acceptable for ~100 records;
# all puts go through the table's batch writer context manager.
with table.batch_writer() as batch:
    for _, joke_row in jokes_scrape.iterrows():
        batch.put_item(
            Item = {field: joke_row[field] for field in item_fields}
        )