# Project 3: Web APIs & NLP
## Matt Reed
### Data Collection

In [2]:
import pandas as pd
import requests
import time

#### Testing out accessing data through the API

In [3]:
# Submission-search endpoint that every request in this notebook is sent to
url = 'https://api.pushshift.io/reddit/search/submission'

In [44]:
# Query parameters for a small test request against one subreddit
params = {
    'subreddit': 'talesfromtechsupport',
    'size': 10,
    # 'before' is intentionally left out of this first test request
    'score': '>600',  # keep only submissions whose score exceeds 600
    'fields': ['author', 'title', 'selftext', 'created_utc', 'score', 'subreddit'],
    'over_18': False,
    'is_video': False,
    'locked': False,
    'stickied': False,
}

In [45]:
# The two subreddits collected later: index 0 is tech support, index 1 is retail
subreddits = ['talesfromtechsupport', 'talesfromretail']

In [46]:
# Send the test GET request; `params` is encoded into the query string
res = requests.get(url, params)

In [47]:
# HTTP status code of the test request
res.status_code

200

In [48]:
# Decode the response body from JSON
data = res.json()

In [49]:
# The submissions sit under the top-level 'data' key of the payload
posts = data['data']

In [50]:
# Number of submissions returned, to compare against the requested 'size'
len(posts)

10

In [51]:
# One row per submission, one column per returned field
df = pd.DataFrame(posts)

In [52]:
# Inspect the scores to check them against the '>600' filter
df['score']

0    1183
1    1634
2     870
3     935
4     682
5     944
6     904
7     696
8    2323
9     608
Name: score, dtype: int64

In [53]:
# Columns that came back, to compare with the requested 'fields'
df.columns

Index(['author', 'created_utc', 'score', 'selftext', 'subreddit', 'title'], dtype='object')

In [54]:
# Disabled check for submissions whose author shows as '[deleted]':
# df[df['author'] == '[deleted]']
# List the distinct titles in the test batch
df['title'].unique()

array(['"Where is the router?"', 'With users like this...',
       'Math...what a concept',
       "I'll have you know I've been working on computers for 10 years ....",
       'We need you to tell us our current password. No not reset tell us. We must have that exact password.',
       'User needs help not having to do her job?',
       "You have cancer and almost passed out waiting? Then you shouldn't be here at all.",
       'The joys of the bowling desk', 'Worst user ever...',
       'Try troubleshooting a little further before you call us next time'],
      dtype=object)

In [55]:
# Row count of the test frame
len(df)

10

In [56]:
# Current Unix time in whole seconds, used as the 'before' cutoff in the next request
before = int(time.time())

In [57]:
# Same test query as before, now with an explicit 'before' timestamp cutoff
params = {
    'subreddit': 'talesfromtechsupport',
    'size': 10,
    'before': before,  # Unix timestamp computed in the previous cell
    'score': '>600',
    'fields': ['author', 'title', 'selftext', 'created_utc', 'score', 'subreddit'],
    'over_18': False,
    'is_video': False,
    'locked': False,
    'stickied': False,
}

# send the GET request with the parameters above
res = requests.get(url, params)

In [58]:
# Inspect the raw decoded payload of the request made with 'before'
res.json()

{'data': [{'author': 'SonDontPlay',
   'created_utc': 1638946522,
   'score': 1183,
   'selftext': 'A General Officer calls in for urgent tech support, he has a conference call with other senior leaders in 45 minutes and he\'s unable to connect to the internet. Initial attempts are made at diagnosing the problem remotely, however he insists a technician be sent over right away. Since he\'s got stars he gets what he wants a technician is dispatched.\n\nTech arrives at residence, immediately notices there is no connection to the computer. Tech asks the General where the router is, General is a bit confused, Tech explains the router is the the box that connects to the internet. General says "Ah its in the living room"\n\nTech and General go into living room, the router is not where its supposed to be. Tech asks the General where is the router. The General is unsure, says maybe his wife knows. He has to call the wife as she\'s not present. \n\nGeneral is calling the wife, wife answers "Hon

In [59]:
# decode the response body from JSON
data = res.json()

# build a DataFrame from the list stored under the 'data' key
df = pd.DataFrame(data['data'])

In [62]:
# Structure from API lesson

def get_subreddit_data(url, subreddit, upvote_thresh, min_n_data=200, n_size=50,
                       max_retries=5, pause=3):
    """Page backwards through a subreddit's submissions and collect them.

    Each request asks for up to ``n_size`` submissions of ``subreddit`` that
    were created before the current ``before`` timestamp and whose score
    exceeds ``upvote_thresh``. After every page ``before`` is moved to the
    oldest ``created_utc`` on that page, so the next request continues
    further into the past. Rows whose author is ``'[deleted]'`` are dropped.

    Parameters
    ----------
    url : str
        Submission-search endpoint to query.
    subreddit : str
        Value sent as the ``subreddit`` query parameter.
    upvote_thresh : int
        Sent as ``score='>{upvote_thresh}'``.
    min_n_data : int, default 200
        Paging stops once at least this many rows have been kept.
    n_size : int, default 50
        Value sent as the ``size`` query parameter (rows per request).
    max_retries : int, default 5
        Attempts made for a single page before collection is abandoned.
    pause : float, default 3
        Seconds slept between pages and between failed attempts.

    Returns
    -------
    pandas.DataFrame
        Columns ``author``, ``title``, ``selftext``, ``subreddit`` and
        ``created_utc``. It can hold fewer than ``min_n_data`` rows when the
        API returns an empty page or a page keeps failing; a message is
        printed in both cases and the rows gathered so far are returned.
    """
    columns = ['author', 'title', 'selftext', 'subreddit', 'created_utc']

    # Everything except 'before' is identical for every page.
    params = {
        'subreddit': subreddit,
        'size': n_size,
        'score': f'>{upvote_thresh}',
        'fields': ['author', 'title', 'selftext', 'created_utc', 'score', 'subreddit'],
        'over_18': False,
        'is_video': False,
        'locked': False,
        'stickied': False,
    }

    # Pages are kept in a list and concatenated once at the end instead of
    # re-copying a growing DataFrame on every iteration.
    pages = []
    n_collected = 0

    # start from the current time and walk backwards
    before = int(time.time())
    counter = 0

    while n_collected < min_n_data:
        print(subreddit, counter, n_collected)

        params['before'] = before
        posts = _fetch_posts(url, params, max_retries, pause)

        if posts is None:
            print(f'Giving up on {subreddit} after {max_retries} failed attempts.')
            break
        if not posts:
            # An empty page means there is nothing older left to fetch.
            print(f'No more posts available for {subreddit}.')
            break

        # reindex tolerates a field missing from the payload (filled with NaN)
        # where plain column selection would raise KeyError.
        page = pd.DataFrame(posts).reindex(columns=columns)

        # Take the cutoff from the unfiltered page so that a page ending in
        # (or consisting only of) deleted posts still advances the cursor.
        oldest = page['created_utc'].min()
        if pd.isna(oldest) or oldest >= before:
            # Without a strictly older timestamp the next request would
            # repeat this one forever.
            print(f'Cannot page further back for {subreddit}.')
            break
        before = int(oldest)

        # remove any deleted posts
        page = page[page['author'] != '[deleted]']

        pages.append(page)
        n_collected += len(page)

        print(f'{n_collected} records collected')
        counter += 1
        time.sleep(pause)

    if not pages:
        return pd.DataFrame(columns=columns)
    return pd.concat(pages, ignore_index=True)


def _fetch_posts(url, params, max_retries, pause):
    """Return the list under the response's 'data' key, or None if every attempt fails."""
    for attempt in range(1, max_retries + 1):
        try:
            # The timeout keeps a stalled connection from hanging the cell.
            res = requests.get(url, params=params, timeout=30)
            # Treat HTTP error statuses as failed attempts instead of
            # trying to parse an error page as JSON.
            res.raise_for_status()
            return res.json()['data']
        except (requests.RequestException, ValueError, KeyError):
            # Only request/decoding problems are retried; KeyboardInterrupt
            # and programming errors propagate to the caller.
            print(f'Request attempt {attempt} failed.')
            if attempt < max_retries:
                time.sleep(pause)
    return None

In [64]:
# Collect tech-support posts scoring above 600, 100 per request, with a target of 1000 rows
dfs = get_subreddit_data(url, subreddits[0], upvote_thresh=600, min_n_data=1000, n_size=100)

talesfromtechsupport 0 0
99 records collected
talesfromtechsupport 1 99
199 records collected
talesfromtechsupport 2 199
299 records collected
talesfromtechsupport 3 299
399 records collected
talesfromtechsupport 4 399
495 records collected
talesfromtechsupport 5 495
593 records collected
talesfromtechsupport 6 593
693 records collected
talesfromtechsupport 7 693
793 records collected
talesfromtechsupport 8 793
893 records collected
talesfromtechsupport 9 893
993 records collected
talesfromtechsupport 10 993
1093 records collected


In [71]:
# Keep the tech-support result under its own name, as a copy independent of `dfs`
df_tfts = dfs.copy()

In [76]:
# (rows, columns) of the tech-support frame
df_tfts.shape

(1093, 5)

In [72]:
# Preview the first rows of the tech-support frame
df_tfts.head()

Unnamed: 0,author,title,selftext,subreddit,created_utc
0,SonDontPlay,"""Where is the router?""",A General Officer calls in for urgent tech sup...,talesfromtechsupport,1638946522
1,talbourne,With users like this...,I'll be staying at this company for a long tim...,talesfromtechsupport,1638904166
2,edhands,Math...what a concept,"Back in 2009, our company purchased a horribly...",talesfromtechsupport,1629488107
3,12altoids34,I'll have you know I've been working on comput...,Thats how the call started. \n\nThen she went ...,talesfromtechsupport,1623803260
4,Gingrpenguin,We need you to tell us our current password. N...,For backstory i worked for a small tech compan...,talesfromtechsupport,1623769022


In [73]:
# Collect retail posts with the same settings used for the tech-support run
df_tfr = get_subreddit_data(url, subreddits[1], upvote_thresh=600, min_n_data=1000, n_size=100)

talesfromretail 0 0
99 records collected
talesfromretail 1 99
199 records collected
talesfromretail 2 199
299 records collected
talesfromretail 3 299
399 records collected
talesfromretail 4 399
499 records collected
talesfromretail 5 499
598 records collected
talesfromretail 6 598
698 records collected
talesfromretail 7 698
798 records collected
talesfromretail 8 798
897 records collected
talesfromretail 9 897
997 records collected
talesfromretail 10 997
1097 records collected


In [75]:
# (rows, columns) of the retail frame
df_tfr.shape

(1097, 5)

In [74]:
# Preview the first rows of the retail frame
df_tfr.head()

Unnamed: 0,author,title,selftext,subreddit,created_utc
0,MattyDGames,Your know card doesn’t work? Let’s grab all th...,"Ok ok, today I had a customer come in with a c...",TalesFromRetail,1638923563
1,MissMissieFatCat,Woman proves herself wrong and storms out of s...,I work in a family owned pet food/supply store...,TalesFromRetail,1623770945
2,Knever,Coworkers try to steal $200 worth of groceries...,"I worked at a grocery store with a guy who, on...",TalesFromRetail,1623707257
3,Mr_jon3s,Moves parking cone gets mad that I yelled at h...,Having some work done on the parking lot so we...,TalesFromRetail,1623594705
4,pessimist_kitty,Getting real tired of people thinking we're ma...,This just happened this morning and I had to s...,TalesFromRetail,1614891256


In [77]:
# Write the tech-support posts to ../data; the Unix-timestamp prefix gives each run
# its own file name, and index=False leaves the row index out of the CSV
df_tfts.to_csv(f'../data/{int(time.time())}_TalesFromTechSupport.csv', index=False)

In [78]:
# Write the retail posts to ../data; the Unix-timestamp prefix gives each run
# its own file name, and index=False leaves the row index out of the CSV
df_tfr.to_csv(f'../data/{int(time.time())}_TalesFromRetail.csv', index=False)