In [1]:
# Configuration for sparknlp - third party library
# # # Use DBR 6.5 ML and spark 2.4.5
# # # Install this Maven library on the cluster: com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.4
# # # Install the spark-nlp Python package from PyPI
# =============================

In [2]:
# Importing Necessary Libraries
# =============================

import pyspark
from pyspark.sql.functions import col
from pyspark.sql import functions as F, SparkSession
from pyspark.sql.types import FloatType


import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf


import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.pretrained import PretrainedPipeline

In [3]:
# Loading DataFrame from Pre-saved Parquet File
# =============================================

# Importing libraries
from pyspark.sql.types import *

# Schema of the parquet file, one entry per column: (name, Spark type, nullable)
schema_fields = [
    ("author", StringType(), True),
    ("author_cakeday", BooleanType(), False),
    ("created_utc_day", IntegerType(), True),
    ("created_utc_hr", IntegerType(), True),
    ("brand_safe", BooleanType(), True),
    ("can_gild", BooleanType(), True),
    ("domain", StringType(), True),
    ("is_crosspostable", BooleanType(), True),
    ("no_follow", BooleanType(), True),
    ("num_comments", LongType(), True),
    ("log_num_comments", DoubleType(), True),
    ("over_18", BooleanType(), True),
    ("subreddit_id", StringType(), True),
    ("commentsUrl", StringType(), True),
    ("whitelist_status", StringType(), False),
    ("suggested_sort", StringType(), False),
    ("titleClean", StringType(), True),
    ("commentsClean", StringType(), False),
    ("score", LongType(), True),
    ("log_score", DoubleType(), True),
    ("test", BooleanType(), False),
]

# Assemble the StructType in the same column order as the table above
schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in schema_fields
])

# Reading the file into a dataframe.
# Two parquet snapshots exist:
# - training data plus the initial test set
# - training data plus the OOT test set

# Filepath for training data and initial test set
# filePath = "dbfs:/FileStore/tables/df/train_test_data.parquet"

# Filepath for training data and final OOT test set
filePathOOT = "dbfs:/FileStore/tables/df/train_OOT_data.parquet"

# Load the chosen snapshot, drop every row containing a null in any column,
# then drop rows with an empty title (the title feeds the word embeddings).
clean_df = (
    spark.read.format("parquet")
    .option("header", "true")
    .schema(schema)
    .load(filePathOOT)
    .na.drop(how="any")
    .where(F.length("titleClean") != 0)
)

# Rows flagged test == False make up the training dataframe
train_df = clean_df.where(col("test") == False)

In [4]:
# Undersampling the majority data based on score 
# because a vast majority of the training data has a score of 0
# =============================================================

# Defining a function to undersample
def removeMajority(df, targetcol="scoreOutlier", targetval=0.0, removal=0.9):
  """Undersample the training rows whose `targetcol` equals `targetval`.

  Args:
    df: dataframe with a boolean "test" column and the column `targetcol`.
    targetcol: name of the column that identifies the majority group.
    targetval: value of `targetcol` that marks a majority row.
    removal: fraction of majority training rows to discard; `1-removal`
      is the sampling fraction passed to `sampleBy`.

  Returns:
    A dataframe made of the test rows, followed by the non-majority training
    rows, followed by the sampled majority training rows.
  """
  is_training_row = col("test") == False
  majority_rows = df.where((col(targetcol) == targetval) & is_training_row)
  minority_rows = df.where((col(targetcol) != targetval) & is_training_row)

  # Fixed seed keeps the sample reproducible between runs
  kept_majority = majority_rows.sampleBy(col=targetcol, fractions={targetval: 1-removal}, seed=69)

  test_rows = df.where(col("test") == True)
  return test_rows.union(minority_rows.union(kept_majority))

# Undersample the score == 0 rows of the training dataframe, discarding 75% of them
train_df2 = removeMajority(df=train_df, targetcol="score", targetval=0, removal=0.75)

# Row counts before and after undersampling
rows_before = train_df.agg(F.count("test")).first()[0]
rows_after = train_df2.agg(F.count("test")).first()[0]
print("No. of training datapoints before undersampling = {}".format(rows_before))
print("No. of training datapoints after undersampling = {}".format(rows_after))

# Skewness of log_score before and after undersampling
score_skew_before = train_df.agg(F.skewness("log_score")).first()[0]
score_skew_after = train_df2.agg(F.skewness("log_score")).first()[0]
print("Skewness of log_score data before undersampling = {}".format(score_skew_before))
print("Skewness of log_score data after undersampling = {}".format(score_skew_after))

# Skewness of log_num_comments before and after undersampling
comments_skew_before = train_df.agg(F.skewness("log_num_comments")).first()[0]
comments_skew_after = train_df2.agg(F.skewness("log_num_comments")).first()[0]
print("Skewness of log_num_comments data before undersampling = {}".format(comments_skew_before))
print("Skewness of log_num_comments data after undersampling = {}".format(comments_skew_after))

In [5]:
# Removing Outliers Using Z-Score
# We are using the log_num_comments and log_score features to determine outliers
# Datapoints in which both these feature values exceed mean + 3*std_dev are discarded
# =====================================================================================================

# Computing quartiles and z-scores for the 2 chosen numerical data fields - log_score and log_num_comments
from pyspark.sql.types import BooleanType
# Quantiles (0th and 75th percentile) over the training rows; the final 0.0 is
# the relativeError argument of approxQuantile.
training_rows = train_df2.where(col("test") == False)
quantile_probs = [0.00, 0.75]
qtlScore = training_rows.approxQuantile("log_score", quantile_probs, 0.0)
qtlComments = training_rows.approxQuantile("log_num_comments", quantile_probs, 0.0)

# Standard deviation and mean of each feature, used by the z-score outlier test
score_stats = train_df2.select(F.stddev("log_score"), F.mean("log_score")).first()
stdevScore, meanScore = score_stats[0], score_stats[1]
comments_stats = train_df2.select(F.stddev("log_num_comments"), F.mean("log_num_comments")).first()
stdevComments, meanComments = comments_stats[0], comments_stats[1]

# Defining an outlier detection function based on z-score
def outlier(comments, score):
  """Z-score outlier test on a (log_num_comments, log_score) pair.

  Args:
    comments: the row's log_num_comments value.
    score: the row's log_score value.

  Returns:
    True only when `score` exceeds meanScore + 3*stdevScore AND `comments`
    exceeds meanComments + 3*stdevComments; False otherwise.
  """
  # Guard clause: a score inside the threshold can never be an outlier
  if not (score > meanScore+(3*stdevScore)):
    return False
  return comments > meanComments+(3*stdevComments)

# Spark UDF wrapper producing a boolean column
outlier_udf = udf(outlier, BooleanType())

# Defining an outlier detection function based on IQR
def outlierIQR(comments, score):
  """Quantile-based outlier test on a (log_num_comments, log_score) pair.

  Args:
    comments: the row's log_num_comments value.
    score: the row's log_score value.

  Returns:
    True only when `score` exceeds qtlScore[1] AND `comments` exceeds
    qtlComments[1] (the 0.75 quantiles computed above); False otherwise.
  """
  # Guard clause: a score at or below its upper quantile is never an outlier
  if not (score > qtlScore[1]):
    return False
  return comments > qtlComments[1]

# Spark UDF wrapper producing a boolean column
outlierIQR_udf = udf(outlierIQR, BooleanType())

# Flag outliers with both detectors; only the z-score result is used to filter rows
outlier_inputs = (col("log_num_comments"), col("log_score"))
df3 = train_df2.withColumn("Outlier", outlier_udf(*outlier_inputs))
df3IQR = train_df2.withColumn("Outlier", outlierIQR_udf(*outlier_inputs))

# Keep only the rows the z-score test did not flag
df4Zscore = df3.where(col("Outlier") == False)

# Row counts before and after removing the outliers
print("Choosing Z-score to detect and eliminate outliers...")
rows_with_outliers = df3.agg(F.count("test")).first()[0]
print("No. of training datapoints before dropping outliers = {}".format(rows_with_outliers))
rows_without_outliers = df4Zscore.agg(F.count("test")).first()[0]
print("No. of training datapoints after dropping outliers = {}".format(rows_without_outliers))

# Skewness of both features before and after removing the outliers
score_skew_with = df3.agg(F.skewness("log_score")).first()[0]
print("Skewness of log_score data before dropping outliers = {}".format(score_skew_with))
score_skew_without = df4Zscore.agg(F.skewness("log_score")).first()[0]
print("Skewness of log_score data after dropping outliers = {}".format(score_skew_without))
comments_skew_with = df3.agg(F.skewness("log_num_comments")).first()[0]
print("Skewness of log_num_comments data before dropping outliers = {}".format(comments_skew_with))
comments_skew_without = df4Zscore.agg(F.skewness("log_num_comments")).first()[0]
print("Skewness of log_num_comments data after dropping outliers = {}".format(comments_skew_without))

In [6]:
# Splitting Training DataFrame into Training and Validation Sets
# ==============================================================

# Share of each group that goes to training; the remainder goes to validation
trainSplitRatio = 0.7
split_weights = [trainSplitRatio, 1-trainSplitRatio]

# Split the majority (score == 0) and minority (score != 0) rows separately,
# each with its own fixed seed
zero_score_rows = df4Zscore.where(col("score") == 0)
nonzero_score_rows = df4Zscore.where(col("score") != 0)
train_set0, val_set0 = zero_score_rows.randomSplit(split_weights, seed=420)
train_set, val_set = nonzero_score_rows.randomSplit(split_weights, seed=125)

# Recombine the two groups into the final training and validation datasets
train_dataset = train_set.union(train_set0).cache()
val_dataset = val_set.union(val_set0).cache()

# The test data comes straight from the cleaned dataframe (test == True rows)
test_dataset = clean_df.where(col("test") == True)

In [7]:
# Dropping rows with [deleted] authors from the training dataset.
# NOTE(review): only train_dataset is filtered here, and the filtered frame (df4)
# is not referenced by any later cell visible in this notebook. Confirm whether
# the validation/test sets and the model inputs were meant to be filtered too.
df4 = train_dataset.where(col("author")!="[deleted]")

# The message names the training dataset because that is the only frame filtered above
print("Rows with [deleted] authors removed from training dataset...\n")
print("Training datapoints before dropping [deleted] authors = {}".format(train_dataset.agg(F.count("author")).first()[0]))
print("Training datapoints after dropping [deleted] authors = {}\n".format(df4.agg(F.count("author")).first()[0]))

In [8]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.functions import udf

In [9]:
# Stage that reads the cleaned title text and writes it to the "document" column
document_assembler = (
    DocumentAssembler()
    .setInputCol("titleClean")
    .setOutputCol("document")
)

In [10]:
# Stage 1: titleClean -> document
document_assembler = (
    DocumentAssembler()
    .setInputCol("titleClean")
    .setOutputCol("document")
)

# Stage 2: document -> token
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: pretrained 'bert_base_cased' English embeddings for each token
word_embeddings = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Full pipeline, stages applied in the order listed
pipeline_stages = [document_assembler, tokenizer, word_embeddings]
bert_pipeline = Pipeline().setStages(pipeline_stages)

# Averaging Embeddings across the whole Document
def avg_vectors(bert_vectors):
  """Average per-token embeddings into a single document-level vector.

  Args:
    bert_vectors: sequence of annotation records; each record exposes its
      embedding (a sequence of numbers) under the "embeddings" key. Every
      embedding is expected to be as long as the first one.

  Returns:
    A list of floats holding the component-wise mean over all records, or
    None when `bert_vectors` is empty or None (there is nothing to average).
  """
  # No tokens means no average; returning None avoids an IndexError on [0]
  if not bert_vectors:
    return None

  dimension = len(bert_vectors[0]["embeddings"])
  totals = [0.0] * dimension
  for annotation in bert_vectors:
    for index, value in enumerate(annotation["embeddings"]):
      totals[index] += value

  # Divide every component by the number of vectors summed, not by the
  # embedding dimension, and not just the last component
  vector_count = len(bert_vectors)
  return [total / vector_count for total in totals]

# UDF wrapper around avg_vectors; the result column is an array of doubles
avg_vectors_udf = F.udf(f=avg_vectors, returnType=T.ArrayType(T.DoubleType()))

def dense_vector(vec):
  """Return `vec` converted to a dense vector via `Vectors.dense`."""
  return Vectors.dense(vec)

# UDF wrapper around dense_vector; the result column has the ML vector type (VectorUDT)
dense_vector_udf = F.udf(f=dense_vector, returnType=VectorUDT())

In [11]:
# Fit the embedding pipeline on the training dataset
pipelineModel = bert_pipeline.fit(train_dataset)

# Train Transformations.
# The result is stored under its own name rather than overwriting train_dataset.
# That keeps this cell idempotent and leaves train_dataset free of the
# "document"/"token"/"embeddings"/"doc_vector" columns, so later cells that fit or
# apply the pipeline on train_dataset do not receive already-embedded data.
train_embedded = pipelineModel.transform(train_dataset)
train_embedded = train_embedded.withColumn("doc_vector", avg_vectors_udf(F.col("embeddings")))

In [12]:
def _add_title_features(model, frame):
  """Apply `model` to `frame`, then add the "doc_vector" and "features" columns."""
  embedded = model.transform(frame)
  averaged = embedded.withColumn("doc_vector", avg_vectors_udf(F.col("embeddings")))
  return averaged.withColumn("features", dense_vector_udf(F.col("doc_vector")))

# Fit the embedding pipeline on the training dataset
pipelineModel = bert_pipeline.fit(train_dataset)

# Train Transformations
train_dataset = _add_title_features(pipelineModel, train_dataset)
train_set = train_dataset.select("features", "log_score")

# Val Transformations
val_set = _add_title_features(pipelineModel, val_dataset).select("titleClean", "features", "log_score")

# Test Transformations
test_set = _add_title_features(pipelineModel, test_dataset).select("titleClean", "features", "log_score")

In [13]:
# Defining Performance Measurement Functions
# ==========================================

# Importing libraries
from pyspark.ml.evaluation import RegressionEvaluator

# Root mean square error
def rmse(df, predCol="predictedScore", actCol="actualScore"):
  """Root mean square error overall and split by zero / non-zero actual value.

  Args:
    df: dataframe holding the prediction and actual columns.
    predCol: name of the prediction column.
    actCol: name of the actual (label) column.

  Returns:
    [rmse over all rows, rmse over rows with actCol == 0,
     rmse over rows with actCol != 0]
  """
  evaluator = RegressionEvaluator(labelCol=actCol, predictionCol=predCol, metricName="rmse")
  subsets = (df, df.where(col(actCol) == 0), df.where(col(actCol) != 0))
  return [evaluator.evaluate(subset) for subset in subsets]

# Symmetric mean absolute percentage error
def smape(df, predCol="predictedScore", actCol="actualScore"):
  """Symmetric mean absolute percentage error, overall and by zero / non-zero actual.

  Each row contributes 100 * |pred - actual| / (0.5 * (|actual| + |pred|)).
  When prediction and actual are both 0 the term is defined as 0 (a perfect
  prediction). Left as 0/0, Spark's default (non-ANSI) arithmetic yields null
  and the mean skips the row, so exact hits would never count.

  Args:
    df: dataframe holding the prediction and actual columns.
    predCol: name of the prediction column.
    actCol: name of the actual (label) column.

  Returns:
    [smape over all rows, smape over rows with actCol == 0,
     smape over rows with actCol != 0]; an entry is None when its subset
    has no rows.
  """
  abs_error = F.abs(col(predCol) - col(actCol))
  half_magnitude = 0.5*(F.abs(col(actCol))+F.abs(col(predCol)))

  # Guard the 0/0 case explicitly; every other row keeps the original formula
  sm_term = F.when(half_magnitude == 0, F.lit(0.0)).otherwise(100*(abs_error/half_magnitude))
  scored = df.withColumn("sm", sm_term)

  def _mean_sm(frame):
    # Mean of the per-row terms for one subset
    return frame.agg(F.mean("sm")).first()[0]

  smape_total = _mean_sm(scored)
  smape_zero = _mean_sm(scored.where(col(actCol)==0))
  smape_nonzero = _mean_sm(scored.where(col(actCol)!=0))
  return [smape_total, smape_zero, smape_nonzero]

# Mean absolute error
def mae(df, predCol="predictedScore", actCol="actualScore"):
  """Mean absolute error overall and split by zero / non-zero actual value.

  Args:
    df: dataframe holding the prediction and actual columns.
    predCol: name of the prediction column.
    actCol: name of the actual (label) column.

  Returns:
    [mae over all rows, mae over rows with actCol == 0,
     mae over rows with actCol != 0]
  """
  evaluator = RegressionEvaluator(labelCol=actCol, predictionCol=predCol, metricName="mae")
  subsets = (df, df.where(col(actCol) == 0), df.where(col(actCol) != 0))
  return [evaluator.evaluate(subset) for subset in subsets]

# Ready-made evaluators: R2 on the log-scale columns, MAE on the score-scale columns
evaluatorR2 = RegressionEvaluator(
    metricName="r2",
    labelCol="log_score",
    predictionCol="prediction",
)
evaluatorMAE = RegressionEvaluator(
    metricName="mae",
    labelCol="actualScore",
    predictionCol="predictedScore",
)

In [14]:
# Training Models for Manual Hyperparameter Search
# Due to Cluster Limitations [initializing results list]
# ======================================================

# Two independent, initially empty accumulators: one for the hyperparameter
# search results and one for the test-set results
results_rf, test_results = [], []

In [15]:
# Training RF Model for Manual Hyperparameter Search
# Due to Cluster Limitations [training the model]
# ==================================================

# Importing libraries
from pyspark.ml.regression import RandomForestRegressor

# Hyperparameters varied by hand between runs
num_trees = 40
max_depth = 5

print("Training Random Forest Regressor Model...")
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="log_score",
    numTrees=num_trees,
    maxDepth=max_depth,
    minInstancesPerNode=5,
    minInfoGain=0.00,
    featureSubsetStrategy="auto",
)

model_rf = rf.fit(train_set)
print("Model Trained")

# Transform validation data: exp(x) - 1 rounded to 0 decimals, applied to the
# model prediction and to log_score
predictions = model_rf.transform(val_set)
output = (
    predictions
    .withColumn("predictedScore", F.round(F.exp(col("prediction"))-1, 0))
    .withColumn("actualScore", F.round(F.exp(col("log_score"))-1, 0))
)
print("Validation Data Transformed")

# Transform training data the same way
predictions_tr = model_rf.transform(train_set)
output_tr = (
    predictions_tr
    .withColumn("predictedScore", F.round(F.exp(col("prediction"))-1, 0))
    .withColumn("actualScore", F.round(F.exp(col("log_score"))-1, 0))
)
print("Training Data Transformed")

# Printing errors (MAE between predictedScore and actualScore)
mae_train = evaluatorMAE.evaluate(output_tr)
print("Train MAE = {}".format(mae_train))
mae_val = evaluatorMAE.evaluate(output)
print("Validation MAE = {}".format(mae_val))

# Record this run as (numTrees, maxDepth, train MAE, validation MAE)
run_summary = (num_trees, max_depth, mae_train, mae_val)
results_rf.append(run_summary)

In [16]:
# Apply Tuned RF Model to Test Data
# =================================

# Transform test data: exp(x) - 1 rounded to 0 decimals, applied to the model
# prediction and to log_score
predictions_tst = model_rf.transform(test_set)
output_tst = (
    predictions_tst
    .withColumn("predictedScore", F.round(F.exp(col("prediction"))-1, 0))
    .withColumn("actualScore", F.round(F.exp(col("log_score"))-1, 0))
)
print("Test Data Transformed")

# mae() returns [overall, zero-actual, non-zero-actual]; store all three, print the list
mae_test = mae(output_tst)
mae_all, mae_zero, mae_nonzero = mae_test[0], mae_test[1], mae_test[2]
test_results.append(("Random Forest", mae_all, mae_zero, mae_nonzero))
print("Test MAE = {}".format(mae_test))