# Part 2: Datasets/DataFrames

In [1]:
import os
import re

from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import ChiSqSelector, RegexTokenizer, StringIndexer, IDF, StopWordsRemover, \
    Normalizer, CountVectorizer
from pyspark.sql import SparkSession

In [2]:
# Create or retrieve a SparkSession
spark = SparkSession.builder.appName("ChiSquaredPipeline").getOrCreate()
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
23/05/27 17:21:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/27 17:21:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/27 17:21:30 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/05/27 17:21:30 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/05/27 17:21:30 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/05/27 17:21:3

In [3]:
# Extract the 8-digit matriculation number with preceding e from the absolute path of the current working directory
matriculation_number = re.search(r"/e\d{8}/", os.getcwd()).group(0)[1:-1]

# Put the stopwords file in the HDFS home directory for the current user (e.g. /user/e12345678/stopwords.txt)
# Only do this if it does not already exist in the HDFS home directory
if os.system("hdfs dfs -test -e /user/%s/stopwords.txt" % matriculation_number):
    os.system("hdfs dfs -put stopwords.txt /user/%s/stopwords.txt" % matriculation_number)

# Load the review file as an RDD
# review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviewscombined.json"
review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"

# Load the stopword file as a set
stopwords_path = "hdfs:///user/%s/stopwords.txt" % matriculation_number
stopwords = set(sc.textFile(stopwords_path).collect())

                                                                                

In [4]:
# Read the review file into a DataFrame
df = spark.read.json(review_path)

                                                                                

In [5]:
# Tokenize the review text into words using a regular expression pattern
tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-zA-Z<>^|]+", gaps=True,
                           toLowercase=True)

In [6]:
# Remove stopwords from the tokenized words list using the stopword set
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=list(stopwords))

In [7]:
# Convert the category column to a numeric type
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex", stringOrderType="alphabetAsc")

In [8]:
# Compute the term frequency vector for each document (review)
# tf = HashingTF(inputCol="filtered", outputCol="rawFeatures")
tf = CountVectorizer(inputCol="filtered", outputCol="rawFeatures")

In [9]:
# Compute the inverse document frequency vector for each document (review)
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [10]:
# Select the top 2000 features based on the chi-squared test for feature independence
css = ChiSqSelector(featuresCol="features", outputCol="selectedFeatures", labelCol="categoryIndex", numTopFeatures=2000)

In [11]:
# Create pipeline combining all steps
pipeline = Pipeline(stages=[tokenizer, remover, indexer, tf, idf, css])

In [12]:
# Fit the pipeline to the DataFrame
pipeline_model = pipeline.fit(df)

23/05/27 17:22:14 WARN DAGScheduler: Broadcasting large task binary with size 1059.7 KiB
23/05/27 17:22:21 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/05/27 17:22:21 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/05/27 17:22:28 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
                                                                                

In [13]:
# Get the vocabulary and selected features
vocab = pipeline_model.stages[3].vocabulary
selected_features = pipeline_model.stages[5].selectedFeatures

In [14]:
# Save the names of the selected features to a file sorted alphabetically in ascending order (space separated)
with open("selected_features.txt", "w") as f:
    f.write(" ".join(sorted([vocab[i] for i in selected_features])))

# Part 3: Text Classification

In [15]:
# Split the data into training, test, and validation sets
training_data, test_data, validation_data = df.randomSplit([0.7, 0.2, 0.1], seed=69)

In [16]:
# Normalize each Vector using $L^2$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)

In [17]:
# Train an SVM classifier on the training data using the pipeline and the selected features as input and the category as output column
svm = LinearSVC(featuresCol="normFeatures", labelCol="categoryIndex", maxIter=10)
ovr = OneVsRest(classifier=svm, featuresCol="normFeatures", labelCol="categoryIndex")

In [18]:
# Create a pipeline combining all steps
pipeline = Pipeline(stages=[tokenizer, remover, indexer, tf, idf, normalizer, css, ovr])

In [19]:
# Fit the pipeline to the training data
pipeline_model = pipeline.fit(training_data)

23/05/27 17:23:03 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
23/05/27 17:23:03 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
23/05/27 17:23:08 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
23/05/27 17:23:28 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
23/05/27 17:23:33 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
23/05/27 17:23:37 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/27 17:23:37 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/05/27 17:23:37 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/05/27 17:23:37 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
23/05/27 17:23:37 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
23/05/27 17:23:38 WARN DAGS

In [20]:
# Make predictions on the test data
predictions = pipeline_model.transform(test_data)

In [21]:
# Compute the F1 score for the test data
evaluator = MulticlassClassificationEvaluator(labelCol="categoryIndex", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(predictions)
f1_score

23/05/27 17:28:05 WARN DAGScheduler: Broadcasting large task binary with size 15.7 MiB
                                                                                

0.5812300788452468

In [22]:
f1_scores = {}
# retrieve the number of classes
num_classes = predictions.select("category").distinct().count()
for i in range(num_classes):
    class_predictions = predictions.filter(predictions["categoryIndex"] == i)
    f1_score = evaluator.evaluate(class_predictions)
    f1_scores[i] = f1_score
f1_scores

23/05/27 17:28:23 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:26 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:29 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:32 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:36 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:41 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:45 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:48 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:52 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:55 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:28:59 WARN DAGScheduler: Broadcasting large task binary with size 15.8 MiB
23/05/27 17:29:02 WARN DAGScheduler: Broadc

{0: 0.6991473812423873,
 1: 0.5351351351351351,
 2: 0.5259259259259259,
 3: 0.7884057971014493,
 4: 0.8756476683937823,
 5: 0.8242811501597443,
 6: 0.693953488372093,
 7: 0.8230088495575222,
 8: 0.17679558011049726,
 9: 0.7998469192499044,
 10: 0.6486486486486487,
 11: 0.6089887640449438,
 12: 0.7460674157303369,
 13: 0.1459227467811159,
 14: 0.7418232428670841,
 15: 0.3779527559055118,
 16: 0.6015037593984962,
 17: 0.5101214574898786,
 18: 0.7017543859649124,
 19: 0.5242937853107345,
 20: 0.5182341650671786,
 21: 0.5062836624775584}

In [23]:
spark.stop()