In [1]:
# Installing Required Libraries
# =============================
# googletrans backs the translation UDF and praw backs the Reddit comment
# scraper used in later cells.
# NOTE(review): neither install is version-pinned, so a fresh cluster may pull
# a release whose API differs from what the cells below expect; consider
# pinning known-good versions.

!pip install googletrans
!pip install praw

In [2]:
# Importing Necessary Libraries
# =============================

import pyspark
from pyspark.sql.functions import col
from pyspark.sql import functions as F, SparkSession
from pyspark.sql.types import FloatType
from googletrans import Translator

In [3]:
# Starting a Spark Session
# ========================
# getOrCreate() hands back the session that is already running, if any, and
# only builds a new one otherwise. Later cells run SQL through `spark`.

spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Reading Training and Test Data from Tables into DataFrames
# ==========================================================
# Prior to running this code cell, the training data must be uploaded onto
# DataBricks as a table named "training_data" and the OOT test data must be
# uploaded as a table named "final_test_data".
#
# The tables are read through the `spark` session created in the previous
# cell rather than the legacy `sqlContext` global, which this notebook never
# defines itself.

# Training Data
df = spark.table("training_data")
df.printSchema()

# Test Data
df_T = spark.table("final_test_data")
df_T.printSchema()

In [5]:
# Keeping Only the Useful Columns in the DataFrames
# =================================================

# Columns retained for feature engineering. The same list (and therefore the
# same column order) is applied to both dataframes, which is what allows them
# to be combined with a positional union afterwards.
useful_cols = [
    "author",
    "author_cakeday",
    "created_utc",
    "brand_safe",
    "can_gild",
    "domain",
    "permalink",
    "is_crosspostable",
    "no_follow",
    "num_comments",
    "over_18",
    "subreddit_id",
    "whitelist_status",
    "suggested_sort",
    "title",
    "score",
]

# Keeping only the useful columns on the training dataframe
df = df.select(useful_cols)
df.printSchema()

# Keeping only the useful columns on the test dataframe.
# This has to select from df_T: selecting from df here would silently replace
# the test set with a copy of the training set.
df_T = df_T.select(useful_cols)
df_T.printSchema()

In [6]:
# Combining Training and Test DataFrames into One DataFrame for Processing
# ========================================================================

# Adding a column to each dataframe as an identifier (test or train data)
# prior to combining
df = df.withColumn("test", F.lit(False))
df_T = df_T.withColumn("test", F.lit(True))

# Combining the test and train dataframes. union() matches columns by
# position, so both frames must share the same column order.
df = df.union(df_T)

# Drop rows whose data columns are all null. The "test" flag is left out of
# the check because it is never null; with it included no row could ever
# qualify and the drop would do nothing.
df = df.na.drop(how="all", subset=[c for c in df.columns if c != "test"])

In [7]:
# Extracting Date Features from the created_utc Date Column and Comments Url from permalink Column
# ================================================================================================

# Importing libraries
from pyspark.sql.types import DateType, StringType

# Cast created_utc as a timestamp.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable; the view keeps the same name ("dfTbl").
df.createOrReplaceTempView("dfTbl")
df_0 = spark.sql("""
                     SELECT *, cast(created_utc as Timestamp) as create_date 
                     FROM dfTbl d
                """)

# Create new columns with day of the month and hour of the day, plus the full
# URL of the comments page (site prefix + permalink).
# NOTE(review): dayofmonth/hour are evaluated in the Spark session time zone;
# confirm it is UTC if these are meant to be UTC-based features.
df_0 = (
    df_0
    .withColumn("created_utc_day", F.dayofmonth(col("create_date")))
    .withColumn("created_utc_hr", F.hour(col("create_date")))
    .withColumn("commentsUrl", F.concat(F.lit("https://www.reddit.com"), col("permalink")))
)

# Select the most useful columns (created_utc and permalink are superseded by
# the derived columns above)
clean_df0 = df_0.select(
    "author",
    "author_cakeday",
    "create_date",
    "created_utc_day",
    "created_utc_hr",
    "brand_safe",
    "can_gild",
    "domain",
    "is_crosspostable",
    "no_follow",
    "num_comments",
    "over_18",
    "subreddit_id",
    "whitelist_status",
    "suggested_sort",
    "title",
    "score",
    "commentsUrl",
    "test",
)

In [8]:
# Web Scraping for Comments on Reddit
# ===================================

# Importing libraries
import os

import praw
from praw.models import MoreComments

# Creating a praw.Reddit object with client credentials.
# Secrets are read from the environment instead of being hardcoded in the
# notebook: set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and REDDIT_PASSWORD
# before running this cell (a missing variable raises KeyError).
# NOTE(review): the values that were previously committed in this cell should
# be treated as leaked and rotated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    password=os.environ["REDDIT_PASSWORD"],
    user_agent='testscript by /u/nileetho',
    username='nileetho',
)

# Creating UDF for scraping comments
from pyspark.sql.types import StringType
def extractComments(num_comments, link):
  """Return the comment bodies of a Reddit submission as one string.

  Args:
    num_comments: Comment count recorded for the submission. When it is
      None or not positive, no request is made.
    link: URL of the submission, passed to ``reddit.submission(url=...)``.

  Returns:
    The bodies of the entries yielded by ``submission.comments``, each
    preceded by a single space and concatenated in iteration order.
    "[deleted]" and "[removed]" placeholders are skipped, as are entries
    that have no ``body`` attribute. An empty string is returned when
    there is nothing to fetch; if scraping fails part-way, whatever was
    collected up to that point is returned.
  """
  # Initialised up front so that every path below has a value to return.
  commentbody = ""
  # A null count arrives as None, which cannot be compared with 0.
  if num_comments is None or not num_comments > 0:
    return commentbody
  try:
    submission = reddit.submission(url = link)
    for comment in submission.comments:
      # Some entries (e.g. praw's MoreComments) carry no body; skip them
      # rather than aborting the rest of the thread.
      body = getattr(comment, "body", None)
      if body is None:
        continue
      body = str(body)
      if body in ("[deleted]", "[removed]"):
        continue
      commentbody = commentbody + " " + body
  except Exception:
    # Best effort: one unreachable submission must not fail the whole job.
    print("Error scraping comment.")
  return commentbody
extractComments_udf = F.udf(extractComments, returnType=StringType())

# Attach the scraped text as a new "comments" column; the UDF receives each
# row's comment count and comments-page URL.
clean_df1 = clean_df0.withColumn(
    "comments",
    extractComments_udf(F.col("num_comments"), F.col("commentsUrl")),
)
clean_df1.printSchema()

In [9]:
# Cleaning Data, Translating Text and Applying Log Transforms on Skewed Numerical Data
# ====================================================================================

# Replace nulls with a default value: False for the cakeday flag and an
# explicit placeholder category for the two string columns
clean_df1 = (
    clean_df1
    .fillna(False, subset=["author_cakeday"])
    .fillna("no_ads", subset=["whitelist_status"])
    .fillna("other", subset=["suggested_sort"])
)

# Add a log(1 + x) version of each skewed numeric column as "log_<name>"
numFields = ["num_comments", "score"]
for field in numFields:
  clean_df1 = clean_df1.withColumn("log_" + field, F.log(col(field) + 1))
  
# Text cleaning
def clean_text(c):
  """Normalise a text column.

  Args:
    c: Column expression holding the raw text.

  Returns:
    A column expression that lower-cases the text, strips a leading "rt "
    marker, removes http/https URLs and finally drops every character that
    is not an ASCII letter, a digit or whitespace.
  """
  # Raw strings keep the regex backslashes literal. In a normal string the
  # sequences backslash-colon and backslash-S are invalid Python escapes and
  # trigger a warning; the patterns handed to Spark are unchanged.
  c = F.lower(c)
  c = F.regexp_replace(c, r"^rt ", "")
  c = F.regexp_replace(c, r"(https?\://)\S+", "")
  # NOTE(review): this also removes every non-ASCII letter, so text written in
  # other scripts is emptied before it reaches the translation step; confirm
  # that cleaning before translating is intended.
  c = F.regexp_replace(c, r"[^a-zA-Z0-9\s]", "")
  return c

# Text translation
def translate(c):
  """Translate text to English with googletrans.

  Args:
    c: Text to translate; may be None or blank.

  Returns:
    The ``text`` of the translation result, or ``c`` unchanged when it is
    None or contains only whitespace.
  """
  # Nothing to translate: skip the request instead of handing the translator
  # a null or blank value. Nulls are filled in by the caller afterwards.
  if c is None or not c.strip():
    return c
  translator = Translator()
  en = translator.translate(c, dest='en')
  return en.text
# `udf` is never imported as a bare name in this notebook, so the UDF is
# registered through the `F` alias, the same way as extractComments_udf.
translate_udf = F.udf(translate, StringType())

# Add new columns with cleaned and translated text fields.
# For each text field this creates "<field>regex" (cleaned text) and
# "<field>Clean" (translation of the cleaned text); null values in the
# "<field>Clean" column are replaced with an empty string.
textFields = ["title", "comments"]
for field in textFields:
  clean_df1 = clean_df1.withColumn(field+"regex", clean_text(col(field)))
  clean_df1 = clean_df1.withColumn(field+"Clean", translate_udf(col(field+"regex")))
  clean_df1 = clean_df1.na.fill("",[field+"Clean"])

# Final projection: keep the modelling columns (raw and log-transformed
# numerics, the cleaned text fields and the train/test flag) and leave out
# the intermediate ones.
clean_df2 = clean_df1.select([
    "author",
    "author_cakeday",
    "created_utc_day",
    "created_utc_hr",
    "brand_safe",
    "can_gild",
    "domain",
    "is_crosspostable",
    "no_follow",
    "num_comments",
    "log_num_comments",
    "over_18",
    "subreddit_id",
    "commentsUrl",
    "whitelist_status",
    "suggested_sort",
    "titleClean",
    "commentsClean",
    "score",
    "log_score",
    "test",
])

In [10]:
# Writing the data into a parquet file for ease of use in future
# ==============================================================
# Any previous output at this path is overwritten. The "header" option was
# dropped because it is a CSV setting: parquet stores the schema in the file
# itself, so there is no header row to toggle.

(
    clean_df2.write
    .mode("overwrite")
    .parquet("dbfs:/FileStore/df/train_OOT_data.parquet")
)