# Some simple data exploration for reddit data on pyspark

In [1]:
#from pyspark.sql import SQLContext
import pandas as pd
from pyspark.sql.functions import to_timestamp, from_unixtime
from pyspark.sql.types import IntegerType
import pyspark.sql
from pyspark import SparkConf, SparkContext

## This cell works around `spark.driver.maxResultSize` errors if they come up.

In [2]:
# Swap the kernel's initial SparkContext for one configured with
# spark.driver.maxResultSize set to 5g.
# The initial context has to be stopped before a new one can be started.
sc.stop()
# Build the configuration and set the value we want in a single expression.
conf = SparkConf().set('spark.driver.maxResultSize', '5g')
# Start a fresh context that uses this configuration.
sc = SparkContext(conf=conf)
# Peek at the new context (the notebook displays it as the cell result).
sc

## Get a SQL context for working with data from the generic Spark context

In [3]:
# The SQLContext import at the top of the notebook is commented out, so import
# it here; otherwise this cell raises NameError on any kernel that does not
# already have SQLContext in its namespace.
from pyspark.sql import SQLContext

# Wrap the SparkContext `sc` in a SQLContext for DataFrame / SQL work.
sqlC = SQLContext(sc)

## read our comments data
- in this case already a parquet file in my working directory on HDFS

In [4]:
# Load the comments from the Parquet file "all_comments.parquet" (a relative
# path) into a DataFrame bound to `reddit`.
reddit = sqlC.read.parquet("all_comments.parquet")

## Basic exploration commands below
- similar to pandas

In [5]:
# Print each column's name, type and nullability as a tree.
reddit.printSchema()

root
 |-- approved_at_utc: string (nullable = true)
 |-- approved_by: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- banned_at_utc: string (nullable = true)
 |-- banned_by: string (nullable = true)
 |-- body: string (nullable = true)
 |-- body_html: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- collapsed_reason: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: bool

## `show()` and `head()` are similar, but not identical

In [None]:
# show(5) prints the first 5 rows as a text table.
reddit.show(5)

In [7]:
# head(5) returns the first 5 rows as a list of Row objects instead of
# printing a table.
reddit.head(5)

[Row(approved_at_utc=None, approved_by=None, archived=None, author=u'LordTulio', author_cakeday=None, author_flair_css_class=u'darkbluebg yellow', author_flair_text=u'Child of Apollo', banned_at_utc=None, banned_by=None, body=u'"It\'s been a month and a half since I got here."', body_html=None, can_gild=None, can_mod_post=None, collapsed=None, collapsed_reason=None, controversiality=0, created=None, created_utc=1470136108, distinguished=None, downs=None, edited=u'false', gilded=0, id=u'd60v3ej', is_submitter=None, likes=None, link_id=u't3_4vp5tt', mod_reports=None, name=None, num_reports=None, parent_id=u't1_d60uz66', removal_reason=None, replies=None, report_reasons=None, retrieved_on=1473152164, saved=None, score=2, score_hidden=None, stickied=False, subreddit=u'CampHalfBloodRP', subreddit_id=u't5_34k5q', ups=2, user_reports=None),
 Row(approved_at_utc=None, approved_by=None, archived=None, author=u'Addicted2Craic', author_cakeday=None, author_flair_css_class=None, author_flair_text=

In [8]:
# Total number of rows (one per comment) in the DataFrame.
reddit.count()

3579502044

In [9]:
# How many different subreddits appear in the comments?
(
    reddit
    .select('subreddit')
    .distinct()
    .count()
)

507893

In [10]:
# Look at three of the distinct subreddit names, returned as Row objects.
(
    reddit
    .select('subreddit')
    .distinct()
    .head(3)
)

[Row(subreddit=u'anime'),
 Row(subreddit=u'mistyfront'),
 Row(subreddit=u'ukraina')]

## modifying or creating columns
- spark dataframes are immutable
- the way around this is to use the `withColumn()` function

- Still having trouble getting the date conversion to work without errors, so for now just casting `created_utc` to integers and working with time as Unix epoch seconds

In [6]:
# withColumn() returns a new DataFrame, so rebind `reddit` to the version
# whose created_utc column has been cast from string to integer.
reddit = reddit.withColumn(
    "created_utc",
    reddit.created_utc.cast(IntegerType())
)

In [14]:
# Keep only comments created after 1483228800, i.e. 2017-01-01 00:00:00 UTC
# as a Unix epoch. The comparison is strict, so that exact second is excluded.
last_year = reddit.where(reddit["created_utc"] > 1483228800)
# Count how many comments remain after the filter.
last_year.count()

716091653

## Group by to count comments per subreddit

In [15]:
# One row per subreddit, with a `count` column holding its number of comments
# in the filtered data.
tmp = last_year.groupBy('subreddit').count()

## `sort()` to show the most popular subreddits

In [16]:
# Order by the per-subreddit count, largest first, and print the top 40 rows.
# tmp['count'] is used because tmp.count is the DataFrame method, not the column.
tmp.orderBy(tmp['count'].desc()).show(40)

+--------------------+--------+
|           subreddit|   count|
+--------------------+--------+
|           AskReddit|46370353|
|            politics|16209533|
|          The_Donald|11044882|
|                 nba| 9054604|
|           worldnews| 7704001|
|                news| 6921776|
|RocketLeagueExchange| 6232113|
|                 nfl| 5398420|
|     leagueoflegends| 5392100|
|                pics| 5391692|
|              soccer| 5293575|
|               funny| 5166878|
|       SquaredCircle| 5102253|
|              videos| 4688483|
|              gaming| 4338802|
|       todayilearned| 4310130|
|           Overwatch| 4232777|
|              hockey| 4097310|
|              movies| 3673653|
|        pcmasterrace| 3369668|
|                gifs| 3135764|
|      NintendoSwitch| 2999216|
|      Showerthoughts| 2852968|
|     GlobalOffensive| 2799473|
|               DotA2| 2763965|
|               anime| 2697897|
|       relationships| 2671069|
|                 MMA| 2580852|
|      D