# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Reddit-Crawl-Script" data-toc-modified-id="Reddit-Crawl-Script-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Reddit Crawl Script</a></div><div class="lev1 toc-item"><a href="#test-=-rd.subreddit('Lubbock')" data-toc-modified-id="test-=-rd.subreddit('Lubbock')-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>test = rd.subreddit('Lubbock')</a></div>

# Reddit Crawl Script

Script to crawl through Reddit to figure out the crossover of users in different subreddits.

One assumption/approximation for figuring out which subreddits a user is subscribed to is to take the top subreddits the user has posted to and assume the user is subscribed to them.

In [1]:
import praw
import reddit_creds as rc # Python file that contains my sensitive credentials 
                          # and other info on some of the objects in PRAW.
import sqlite3 as s3
import pickle
import pprint as pr
import timeit as t 
import time
import datetime as dt

The methods and attributes of the main PRAW objects can be looked up as dictionaries in `reddit_creds`. The names below assume the module is imported without an alias; this notebook imports it as `rc`, so use e.g. `rc.prawSubreddit` here.

- Reddit `reddit_creds.prawReddit`
- Subreddit `reddit_creds.prawSubreddit`
- Submission `reddit_creds.prawSubmission`
- Redditor `reddit_creds.prawRedditor`

In [None]:
## Uncomment any of the following to look at the attributes and methods available.
## Only the last bare expression of a cell is displayed, so keep one line active at a time.
# rc.prawReddit
rc.prawSubreddit
# rc.prawSubmission
# rc.prawRedditor

In [2]:
# File name of the SQLite database that stores the crawl results.
redd_DB = 'redditCrawlDB.db'

# User-agent string sent with every request made through `rd`.
uag = 'reddit_crawl_script:v1.0.0 (/u/Nazi_Ganesh)'

# Reddit Instance: every credential is read from the local `reddit_creds`
# module (imported as `rc`), so none of them appear in the notebook.
rd = praw.Reddit(
    user_agent=uag,
    client_id=rc.cid,
    client_secret=rc.sec,
    username=rc.usr,
    password=rc.psw,
    check_for_updates=True,
)

In [None]:
# SQLite3 Connection Initialization
conn = s3.connect(redd_DB)
c = conn.cursor()

# Name of table in the database: redditCrawlDB_1_0 (12/26/16)
#
# One row per crawled user:
#   userID     - Reddit user id (string)
#   actCreated - account creation date, formatted as 'YYYY-MM-DD'
#   cKarma     - comment karma
#   lKarma     - link karma
#   lSubm      - comma-separated subreddit ids of the user's top submissions
#   sSubm      - the same ids with duplicates removed
#
# The string columns are declared TEXT on purpose. SQLite derives a column's
# affinity from the declared type name, and neither "TXT" nor "NUM" matches a
# text rule, so both end up with NUMERIC affinity. A user id that looks like a
# number (e.g. "1e5" or "100000") would then be stored as a number and two
# different ids could collide on the primary key.
# CREATE TABLE IF NOT EXISTS leaves an already existing table untouched, so a
# database file created with the old declaration keeps its old schema.
c.execute("CREATE TABLE IF NOT EXISTS "
          "redditCrawlDB_1_0(userID TEXT PRIMARY KEY UNIQUE, "
          "actCreated TEXT NOT NULL, "
          "cKarma INT NOT NULL, "
          "lKarma INT NOT NULL, "
          "lSubm TEXT NOT NULL, "
          "sSubm TEXT NOT NULL)")

Pseudo-code:

Take in a submission
- Find `author` of post
    - Check to see if user is already in `database`.
    - If NOT in `database`
        - Get userinfo
            - `User ID`, `Account Created`, `Comment Karma`, & `Link Karma`.
        - Create pre-defined (1000) `list` called `lSubm`
        - Go to `submitted` and sort as "top".
            - Get top 5000 submissions.
                - For each `submission`
                    - Extract `subreddit_id` and append to `lSubm`.
                - `lSubm` is converted into a `set` and stored as `sSubm`.
        - Return `usrID`, `cUTC`, `cKar`, `lKar`, `lSubm`, & `sSubm`.
    - Else 
        - Return `False`
        

Functions

In [3]:
# Module-level connection and cursor: db_usrChk() below runs its query
# through the global cursor `c`, so this must exist before it is called.
conn = s3.connect(redd_DB)
c = conn.cursor()

def db_usrChk(usr_ID):
    """Report whether `usr_ID` is still missing from the crawl table.

    Runs a parameterized lookup on `redditCrawlDB_1_0` through the
    module-level cursor `c`.

    Returns True when no row carries this userID (the user still has to be
    crawled) and False when the user is already stored.
    """
    c.execute("SELECT userID FROM redditCrawlDB_1_0 WHERE userID=?", (usr_ID,))
    matches = c.fetchall()
    return not matches
    
def toStr(list_or_set_in):
    """Serialize an iterable of strings into one comma-separated string.

    Elements appear in the iteration order of the input, so the result for a
    set comes in no particular order.
    """
    separator = ','
    return separator.join(list_or_set_in)
    
def toLst(str_in):
    """Split a comma-separated string (as produced by `toStr`) into a list.

    An empty string yields an empty list, so `toLst(toStr([])) == []`.
    A bare `''.split(',')` would return `['']`, i.e. one bogus empty entry.
    """
    if str_in == '':
        return []
    return str_in.split(',')

def toSet(str_in):
    """Split a comma-separated string (as produced by `toStr`) into a set.

    An empty string yields an empty set. A bare `set(''.split(','))` would
    return `{''}`, i.e. a set holding one bogus empty entry.
    """
    if str_in == '':
        return set()
    return set(str_in.split(','))

def subm_CHK(submission_in, subm_lim = 5000):
    """Build the database row for the author of a submission.

    Parameters
    ----------
    submission_in
        Object with an `author` attribute (a PRAW submission in this notebook).
    subm_lim : int, optional
        Maximum number of the author's top submissions to request.

    Returns
    -------
    tuple or False
        `(usrID, cUTC, cKar, lKar, lSubm, sSubm)` when the author is not yet
        in the database:
          - usrID: the redditor's id
          - cUTC:  account creation date as 'YYYY-MM-DD', in UTC
          - cKar / lKar: comment and link karma
          - lSubm: comma-separated subreddit ids, one per fetched submission
          - sSubm: the same ids, de-duplicated and sorted
        `False` when the author is already stored or the submission has no
        author.
    """
    author = submission_in.author
    if author is None:
        # No author to look up (e.g. the account no longer exists).
        # str(None) would otherwise query a user literally named "None".
        return False

    redd_usr = rd.redditor(str(author))
    authorID = redd_usr.id

    if not db_usrChk(authorID):
        # Author already in the database: nothing to add.
        return False

    # Grabbing User Info
    usrID = authorID
    # `created_utc` is a UTC epoch timestamp; convert it with an explicit UTC
    # timezone so the stored date does not depend on the machine's local zone.
    cUTC = dt.datetime.fromtimestamp(
        redd_usr.created_utc, dt.timezone.utc
    ).strftime('%Y-%m-%d')
    cKar = redd_usr.comment_karma
    lKar = redd_usr.link_karma

    # Grabbing the subreddit id of up to `subm_lim` top submissions by the user.
    # NOTE(review): Reddit listings are commonly capped at about 1000 items, so
    # a larger limit may not return more - confirm against the API docs.
    lSubm = [subm.subreddit_id for subm in redd_usr.submissions.top(limit=subm_lim)]

    # Sort the unique ids so the stored string is the same on every run.
    sSubm = toStr(sorted(set(lSubm)))
    lSubm = toStr(lSubm)

    return usrID, cUTC, cKar, lKar, lSubm, sSubm
    

Crawl the hot submissions of every popular subreddit and store each new author in the database.

In [None]:
# Wall-clock reference for the whole cell.
start = time.time()

# Fresh connection for this run. Rebinding the global `c` also changes the
# cursor that db_usrChk() queries through.
conn = s3.connect(redd_DB)
c = conn.cursor()

# Timing log per subreddit; elapsed seconds are measured from `st_subr`.
st_subr = time.time()
tm_subr = []

for subr_pop in rd.subreddits.popular():
    sr_subms = subr_pop.hot(limit=100)

    # Timing log per submission; elapsed seconds are measured from `st_subm`.
    st_subm = time.time()
    tm_subm = []

    for sr_subm in sr_subms:
        # subm_CHK() returns the 6-field row for an unseen author, else False.
        db_entry = subm_CHK(sr_subm, 10000)
        if db_entry:
            c.execute(
                "INSERT INTO redditCrawlDB_1_0 "
                "(userID, actCreated, cKarma, lKarma, lSubm, sSubm) "
                "VALUES(?,?,?,?,?,?)",
                tuple(db_entry[:6]),
            )
            # Commit after every inserted row.
            conn.commit()

        en_subm = time.time() - st_subm
        tm_subm.append([en_subm, sr_subm.title, sr_subm.id, sr_subm.score])

    en_subr = time.time() - st_subr
    tm_subr.append(
        [en_subr, subr_pop.title, 'ov18: ' + str(subr_pop.over18), tm_subm]
    )

# Total runtime in seconds (shown as the cell output).
time.time() - start

In [4]:
#tm_12271058_late = tm_subr
#tm_data = [tm_1227347, tm_1227511, tm_1227626, tm_1227749_late ,tm_1227926_late]
#pickle_out = open("tm_data.pickle","wb")
#pickle.dump(tm_data, pickle_out)
#pickle_out.close()

# Reload the timing logs saved by the (commented-out) dump above.
# The `with` block closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code while loading, so only open
# pickle files that this notebook wrote itself.
with open("tm_data.pickle", "rb") as pick_in:
    test = pickle.load(pick_in)
test

[[[45.54911255836487,
   'Ask Reddit...',
   'ov18: False',
   [[1.4090323448181152,
     '[Megathread] What would you like to ask Reddit about the holidays? megathread',
     '5k8ywe',
     255],
    [1.7360289096832275,
     'People who decided to stop using straws, what was the final straw?',
     '5kg950',
     20544],
    [2.0810108184814453, 'Furries of reddit, why?', '5keuzq', 11552],
    [2.401029586791992,
     'What was the most successful internet troll of all time?',
     '5kfyjx',
     2455],
    [2.716529130935669,
     'What is socially acceptable in America which is not anywhere else?',
     '5kdvdi',
     9673],
    [3.0242486000061035,
     "People who've deleted Facebook, what was the final straw?",
     '5kdfsf',
     19556],
    [3.357882499694824,
     'Parents of kids that did not succeed as adults (jail, drugs, etc): how do you feel about a decision and experience of being a parent?',
     '5kdwhr',
     3101],
    [3.6845951080322266,
     "You've engineered a 

In [None]:
# One-off environment setup: installs the jupyter_contrib_nbextensions package
# (from the tarball of the repository's master branch) and the nbextensions
# configurator, then registers both for the current user.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may
# not be the kernel's environment, and the master tarball is not a pinned
# version - consider moving this out of the analysis notebook.
!pip install https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tarball/master
!pip install jupyter_nbextensions_configurator
!jupyter contrib nbextension install --user
!jupyter nbextensions_configurator enable --user

In [None]:
# NOTE(review): scratch expression that does not touch the crawl; safe to delete.
5+5

In [None]:
# Scratch cell: pretty-print the first six fields of `db_entry` and time it.
# `db_entry` is not assigned here; it holds whatever the crawl loop assigned
# last. subm_CHK() can return False, in which case the indexing below raises
# TypeError.
start = time.time()
#print(len(tuple(ii for ii in rd.subreddits.popular())))
pr.pprint(tuple(db_entry[ii] for ii in range(6)))
time.time() - start

In [None]:
 # Scratch copy of the INSERT used in the crawl loop. It relies on the global
 # cursor `c` and on `db_entry` from earlier cells, and it does not commit.
 c.execute(
                  "INSERT INTO redditCrawlDB_1_0 (userID, actCreated, cKarma, lKarma, lSubm, sSubm) VALUES(?,?,?,?,?,?)", 
                  tuple(db_entry[ii] for ii in range(6))
                 )

In [None]:
# Smoke test: run subm_CHK() on the first hot submission of the first popular
# subreddit. The built-in next() works on any iterator, whereas a `.next()`
# method is a Python 2 idiom that iterators are not required to provide.
first_subr = next(iter(rd.subreddits.popular()))
first_subm = next(iter(first_subr.hot(limit=5)))

# subm_CHK() returns either the 6-field row or False (author already stored),
# so display the result as-is instead of indexing into it.
subm_CHK(first_subm)


In [None]:
# Look at the basic profile fields of one redditor, one value per line.
redd_usr = rd.redditor('Nazi_Ganesh')
for field in ('id', 'name', 'created_utc', 'comment_karma', 'link_karma'):
    pr.pprint(getattr(redd_usr, field))

# Names of every attribute currently set on the redditor object.
print(list(vars(redd_usr)))

In [None]:
# Scratch example: keep records as a list of dicts and test whether any record
# has a given name.
redd_usr = []

dict1 = {'Name': 'Suren', 'Age': 26, 'Work': 'Student', 'Salary': 22000}
dict2 = {'Name': 'Shiva', 'Age': 32, 'Work': 'Plumber', 'Salary': 45000}

redd_usr.append(dict1)
redd_usr.append(dict2)

pr.pprint(redd_usr)

# A list cannot be indexed by a string key (redd_usr['Name'] raises TypeError),
# so compare the 'Name' field record by record.
has_suren = any(rec['Name'] == 'Suren' for rec in redd_usr)
has_suren

In [None]:
def crt(x=5):
    """Return `(x + 3, x - 3)` when x is greater than 5, otherwise `False`."""
    if not (x > 5):
        return False
    return x + 3, x - 3

for ii in range(10):
    # Call crt() once and reuse the result for both the truth test and the
    # printout; `xx` holds the returned value, not the function object.
    xx = crt(ii)
    if xx:
        pr.pprint(xx)
    else:
        pr.pprint('Not True')

In [None]:
# Scratch draft of a CREATE TABLE statement with the table name filled in via
# str.format; the resulting string is only displayed, never executed here.
# NOTE(review): the table name and column types differ from the table this
# notebook actually creates (redditCrawlDB_1_0, with lSubm/sSubm declared TXT).
tbl = 'redditCrawlDB'
"CREATE TABLE IF NOT EXISTS {}(userID TXT, actCreated INT, cKarma INT, lKarma INT, lSubm INT, sSubm INT)".format(tbl)

In [None]:
# Scratch cell: print title, score, id and author of up to 2000 hot
# submissions, dump the attributes of the last one, and show elapsed seconds.
# NOTE(review): `sub_test` is not defined anywhere in the visible notebook -
# presumably a subreddit object from a deleted cell; confirm before running.
st = time.time()
for subm in sub_test.hot(limit=2000):
    pr.pprint(subm.title)
    pr.pprint(subm.score)
    pr.pprint(subm.id)
    pr.pprint(subm.author)

# `subm` is the last submission left over from the loop above.
pr.pprint(vars(subm))
time.time() - st

In [None]:
poo = foo = 0
# Augmented assignment is a statement and cannot appear inside a list display
# (it is a SyntaxError), so increment first and build the list afterwards.
foo += 1
[foo, poo, 'a']


In [None]:
def po(x=5):
    """Return `(x + 10, x + 5)` when x is greater than 5, otherwise `False`."""
    if not (x > 5):
        return False
    return x + 10, x + 5
    
# Keep the result of po(10), then branch on whether po(10) is truthy.
y = po(10)
if not po(10):
    print('Else')
else:
    print('If')
    print(*y[:2])

# test = rd.subreddit('Lubbock')
test.new
vars(test)

$\frac{\Delta \vec{p}}{\Delta t} = \vec{F}$