# Notebook for selecting all comments ever in a particular set of subreddits

In [1]:
from pyspark.sql import SQLContext
import pandas as pd
from pyspark.sql.functions import *
import pyspark.sql

In [2]:
sqlC = SQLContext(sc)

In [3]:
reddit = sqlC.read.parquet("all_comments.parquet")

In [4]:
reddit.printSchema()

root
 |-- approved_at_utc: string (nullable = true)
 |-- approved_by: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- banned_at_utc: string (nullable = true)
 |-- banned_by: string (nullable = true)
 |-- body: string (nullable = true)
 |-- body_html: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- collapsed_reason: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: bool

## Select just the columns we want to work with
- specifically, exclude columns that have sub-columns to them because they're a pain to work with if we don't need their data.

In [5]:
reddit = reddit[['approved_at_utc',  'approved_by', 'archived', 'author', 
                 'author_cakeday', 'author_flair_css_class', 'author_flair_text',
                 'banned_at_utc', 'banned_by',  'body',  'can_gild',
                 'can_mod_post', 'collapsed',  'collapsed_reason', 'controversiality',
                 'created', 'created_utc', 'distinguished', 'downs',  'edited',
                 'gilded',  'id', 'is_submitter', 'likes', 'link_id', 'name', 
                 'num_reports', 'parent_id', 'removal_reason', 'replies', 
                 'report_reasons', 'retrieved_on', 'saved', 'score', 'score_hidden',
                 'stickied', 'subreddit', 'subreddit_id', 'ups']]

## Replace all the newlines in the comment text with spaces.
- saves a huge amount of headache with file formats and quoting
- yes, some information is lost this way. 

In [8]:
reddit = reddit.withColumn('body', regexp_replace('body', '\n', ' '))

## get a count of how many posts are in each subreddit
- this can help us decide which ones to download

In [None]:
tmp = reddit.groupby('subreddit').count()
tmp.show(10)

## List subreddits of interest, see their post count
- case sensitive!

In [13]:
li = ['politics', 
      'Republican', 'republicans', 'CollegeRepublicans', 'askaconservative', 
      'askarepublican',
      'Democrat', 'democrats', 'CollegeDemocrats', 'AskDemocrats',
      'GreenParty',
      'Libertarian', 'AskLibertarians',
      'Liberal', 'AskALiberal', 
      'progressive',
      'socialism', 'AskASocialist',
      'demsocialist',
      'communism', 'FULLCOMMUNISM', 'Communist', 'communists',
      'votethirdparty',
      'NeutralPolitics'
     ]

tmp.filter(tmp.subreddit.isin(li)).sort('count', ascending=False).show()

+----------------+--------+
|       subreddit|   count|
+----------------+--------+
|        politics|61748124|
|     Libertarian| 3990229|
|       socialism| 1110657|
|   FULLCOMMUNISM|  494840|
| NeutralPolitics|  342213|
|      Republican|  333246|
|     progressive|  247961|
|       democrats|  172526|
|         Liberal|  152893|
|       communism|  123248|
|askaconservative|   80139|
|     AskALiberal|   50169|
|      GreenParty|   15215|
| AskLibertarians|   12489|
|     republicans|   11949|
|    demsocialist|    5428|
|    AskDemocrats|    4608|
|      communists|    1800|
|        Democrat|    1462|
|  votethirdparty|     408|
+----------------+--------+
only showing top 20 rows



## Select and save the data from each subrddit separately
- First, filter (select where) the data so that we only have the subreddits we care about. Tell pyspark to cache that result for future reuse. This makes the rest of the searches after it faster.
    - pyspark actually uses lazy evaluation, so this doesn't run until the first call to `short_reddit.filter()`. That means the first loop is slow, but the loops after it go faster. 
- Save everything to a tsv file, one per subreddit
    - note the headers are off by default
    - note this actually saves to a folder full of tsv files (one for each partition). That's a spark thing good for spark but bad for other programs. We'll have to merge them outside this script. (There are ways to merge them here, but if the file is bigger than the JVM memory it throws errors. Easier to fix outside where that doesn't happen.)

In [14]:
print("making short...")
short_reddit = reddit.filter(reddit.subreddit.isin(li))
short_reddit.cache()

for l in li:
    print 'making', l
    result = short_reddit.filter(short_reddit.subreddit == l)
    result.write.mode('overwrite').csv(l+'.tsv', sep='\t', header=True)
print('done!')

making short...
making politics
making Republican
making republicans
making CollegeRepublicans
making askaconservative
making askarepublican
making Democrat
making democrats
making CollegeDemocrats
making AskDemocrats
making GreenParty
making Libertarian
making AskLibertarians
making Liberal
making AskALiberal
making progressive
making socialism
making AskASocialist
making demsocialist
making communism
making FULLCOMMUNISM
making Communist
making communists
making votethirdparty
making NeutralPolitics
done!


## Same as above, but with a different list of subreddits

In [15]:
li = ['cats', 'Meow_irl', 'CatsonGlass', 'CatLoaf', 'Kittens', 'CatGifs', 'StartledCats',
      'StuffOnCats', 'CatsStandingUp',
      'dogs', 'dogpictures', 'dogswithjobs', 'Dogtraining', 'DOG', 'puppies', 
      'dogswearinghats'
     ]

tmp.filter(tmp.subreddit.isin(li)).sort('count', ascending=False).show()

+---------------+-------+
|      subreddit|  count|
+---------------+-------+
|           dogs|1504097|
|           cats|1479661|
|    Dogtraining| 319811|
| CatsStandingUp| 299892|
|    dogpictures| 187653|
|   StartledCats|  90970|
|        CatGifs|  23750|
|        puppies|  21052|
|dogswearinghats|   7765|
|    StuffOnCats|   5432|
|            DOG|   3092|
|   dogswithjobs|   1046|
+---------------+-------+



In [16]:
print("making short...")
short_reddit = reddit.filter(reddit.subreddit.isin(li))
short_reddit.cache()

for l in li:
    print 'making', l
    result = short_reddit.filter(short_reddit.subreddit == l)
    result.write.mode('overwrite').csv(l+'.tsv', sep='\t', header=True)
    #df = result.toPandas()
    #df.to_csv(l+'.tsv', sep='\t', index=False)
print('done!')

making short...
making cats
making Meow_irl
making CatsonGlass
making CatLoaf
making Kittens
making CatGifs
making StartledCats
making StuffOnCats
making CatsStandingUp
making dogs
making dogpictures
making dogswithjobs
making Dogtraining
making DOG
making puppies
making dogswearinghats
done!
