# Part 2: Datasets/DataFrames

In [1]:
import pandas as pd
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import ChiSqSelector, RegexTokenizer, StringIndexer, IDF, StopWordsRemover, \
    Normalizer, CountVectorizer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, TrainValidationSplitModel
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession

In [2]:
# Create a SparkConf with the following settings:
# - spark.driver.memory: 4g
# - spark.executor.memory: 7392m
# - spark.parallelism: 4
conf = SparkConf() \
    .set("spark.driver.memory", "4g") \
    .set("spark.executor.memory", "7392m") \
    .set("spark.parallelism", "4")

# Create a SparkSession with the name "ChiSquaredPipeline" using the SparkConf above
spark = SparkSession.builder \
    .appName("ChiSquaredPipeline") \
    .config(conf=conf) \
    .getOrCreate()

# Retrieve the SparkContext from the SparkSession
sc = spark.sparkContext

# Set the log level to WARN
sc.setLogLevel("ERROR")

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
23/05/31 23:30:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/31 23:30:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/31 23:30:28 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/05/31 23:30:28 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/05/31 23:30:28 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/05/31 23:30:2

In [3]:
# Read the review file into a DataFrame
review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
df = spark.read.json(review_path)

                                                                                

In [4]:
# Load the stopword file as a set from the local file system
stopwords = set(open("stopwords.txt").read().splitlines())

In [5]:
# Tokenize the review text into words using a regular expression pattern
tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-zA-Z<>^|]+", gaps=True,
                           toLowercase=True)

In [6]:
# Remove stopwords from the tokenized words list using the stopword set
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=list(stopwords))

In [7]:
# Convert the category column to a numeric type using the StringIndexer with alphabetically ascending order to allow for easy mapping to the category names later
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex", stringOrderType="alphabetAsc")

In [8]:
# Compute the term frequency vector for each document (review)
# tf = HashingTF(inputCol="filtered", outputCol="rawFeatures")
tf = CountVectorizer(inputCol="filtered", outputCol="rawFeatures")

In [9]:
# Compute the inverse document frequency vector for each document (review)
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [10]:
# Select the top 2000 features based on the chi-squared test for feature independence
css = ChiSqSelector(featuresCol="features", outputCol="selectedFeatures", labelCol="categoryIndex", numTopFeatures=2000)

In [11]:
# Create pipeline combining all steps
pipeline = Pipeline(stages=[tokenizer, remover, indexer, tf, idf, css])

In [None]:
# Fit the pipeline to the DataFrame
model = pipeline.fit(df)

In [None]:
# Get the vocabulary and selected features
vocab = model.stages[3].vocabulary
selected_features = model.stages[5].selectedFeatures

In [None]:
# Save the names of the selected features to a file sorted alphabetically in ascending order (space separated)
with open("output_ds.txt", "w") as f:
    f.write(" ".join(sorted([vocab[i] for i in selected_features])))

# Part 3: Text Classification

In [12]:
# Split the data into 80% training and 20% test data using a seed of 42
training_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [13]:
# Normalize each Vector using $L^2$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)

In [14]:
# Create an SVM classifier using the normalized features and the category index
svm = LinearSVC(featuresCol="normFeatures", labelCol="categoryIndex")

In [15]:
# Create a one-vs-rest classifier using the SVM classifier
ovr = OneVsRest(classifier=svm, featuresCol="normFeatures", labelCol="categoryIndex", parallelism=4)

In [16]:
# Create a pipeline combining all steps
pipeline = Pipeline(stages=[tokenizer, remover, indexer, tf, idf, normalizer, css, ovr])

In [17]:
# Create an evaluator using the F1-score metric
evaluator = MulticlassClassificationEvaluator(labelCol="categoryIndex", predictionCol="prediction", metricName="f1")

In [18]:
# Create a parameter grid for the SVM classifier with the following parameters:
# - numTopFeatures: 50, 2000
# - regParam: 0.1, 0.01, 0.001
# - standardization: True, False
# - maxIter: 10, 100
param_grid = ParamGridBuilder() \
    .addGrid(css.numTopFeatures, [50, 2000]) \
    .addGrid(svm.regParam, [0.1, 0.01, 0.001]) \
    .addGrid(svm.standardization, [True, False]) \
    .addGrid(svm.maxIter, [10, 100]) \
    .build()

In [19]:
# Create a train-validation-split using the pipeline, parameter grid, and evaluator
tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, trainRatio=0.8,
                           seed=42, parallelism=8, collectSubModels=True)

In [None]:
# Fit the train-validation-split to the training data
tvs_model = tvs.fit(training_data)

In [None]:
# Write the entire fitted tvs_model to disk
tvs_model.write().overwrite().save("tvs_model")

In [20]:
# Load the fitted tvs_model from disk
tvs_model = TrainValidationSplitModel.load("tvs_model")

                                                                                

In [21]:
# Make predictions on the test data
predictions = tvs_model.transform(test_data)

In [22]:
# Compute the micro-averaged F1-score for the predictions
evaluator.evaluate(predictions)

                                                                                

0.6045748716264853

In [23]:
# Retrieve the validation metrics for each model trained in the train-validation-split
results = tvs_model.validationMetrics

In [24]:
# Create a dictionary with the parameter values and validation metrics for each model trained in the train-validation-split
data = {}

# Iterate over the parameter grid's parameter values and validation metrics
for i, params in enumerate(param_grid):
    # Iterate over the parameters in the parameter grid
    for param_key, param_value in params.items():
        # Add the parameter value to the dictionary
        data.setdefault(param_key.name, []).append(param_value)
    # Add the validation metric to the dictionary
    data.setdefault("Evaluation Metric", []).append(results[i])

# Set the display options for Pandas
pd.set_option('display.float_format', '{:.16g}'.format)

# Create a DataFrame from the dictionary
pd.DataFrame(data)

Unnamed: 0,numTopFeatures,regParam,standardization,maxIter,Evaluation Metric
0,50,0.1,True,10,0.5819229493156207
1,50,0.1,True,100,0.5986381683598597
2,50,0.1,False,10,0.2246326892778781
3,50,0.1,False,100,0.5315731057698442
4,50,0.01,True,10,0.5820901461374169
5,50,0.01,True,100,0.5847428310202759
6,50,0.01,False,10,0.2251149643245378
7,50,0.01,False,100,0.5321925656014915
8,50,0.001,True,10,0.5788854651035343
9,50,0.001,True,100,0.5600604477127051


In [25]:
# Create a list of category names sorted alphabetically in ascending order
category_names = sorted([row["category"] for row in df.select("category").distinct().collect()])

                                                                                

In [26]:
# Get the number of categories
num_classes = len(category_names)

In [27]:
# Calculate multiclass metrics for the test data
metrics = MulticlassMetrics(predictions.select("prediction", "categoryIndex").rdd)

                                                                                

In [28]:
# Select the confusion matrix from the metrics object
confusion_matrix = metrics.confusionMatrix()

# Convert the confusion matrix to a Pandas DataFrame for better visualization mapping the category indices to the category names
pd.DataFrame(confusion_matrix.toArray(), index=category_names, columns=category_names)

                                                                                

Unnamed: 0,Apps_for_Android,Automotive,Baby,Beauty,Book,CDs_and_Vinyl,Cell_Phones_and_Accessorie,Clothing_Shoes_and_Jewelry,Digital_Music,Electronic,...,Home_and_Kitche,Kindle_Store,Movies_and_TV,Musical_Instrument,Office_Product,Patio_Lawn_and_Garde,Pet_Supplie,Sports_and_Outdoor,Tools_and_Home_Improvement,Toys_and_Game
Apps_for_Android,303,2,2,1,111,9,9,5,1,50,...,6,12,15,0,3,4,3,7,6,13
Automotive,4,104,1,2,16,0,10,21,0,41,...,17,1,7,0,1,6,3,16,16,1
Baby,3,6,78,3,10,1,2,9,0,15,...,20,0,1,0,0,2,3,9,6,17
Beauty,2,5,1,231,30,2,14,23,2,18,...,12,4,4,1,1,1,3,11,3,2
Book,39,6,2,6,3917,42,7,26,7,37,...,23,121,101,4,6,8,6,27,6,34
CDs_and_Vinyl,6,1,2,2,94,504,3,11,27,10,...,4,4,46,2,0,2,0,1,0,2
Cell_Phones_and_Accessorie,13,8,1,11,36,1,368,52,1,149,...,6,1,4,1,7,2,2,16,9,7
Clothing_Shoes_and_Jewelry,6,7,8,14,46,4,21,849,5,32,...,15,2,6,2,3,1,3,57,5,13
Digital_Music,5,1,0,0,30,105,2,0,11,6,...,3,3,8,0,1,0,0,0,2,0
Electronic,20,22,6,7,75,9,125,43,2,1110,...,24,5,19,3,12,3,3,25,29,8


In [29]:
# Collect the F1-score for each category into a dataframe
pd.DataFrame([(metrics.fMeasure(float(i))) for i in range(num_classes)], index=category_names, columns=["f1_score"])

Unnamed: 0,f1_score
Apps_for_Android,0.584942084942085
Automotive,0.4185110663983904
Baby,0.456140350877193
Beauty,0.6055045871559633
Book,0.7875741429576757
CDs_and_Vinyl,0.6857142857142857
Cell_Phones_and_Accessorie,0.5356622998544396
Clothing_Shoes_and_Jewelry,0.6832997987927565
Digital_Music,0.0901639344262295
Electronic,0.6624888093106536


In [30]:
# Print the macro-average f1-score using the test data
print("Macro-Average F1 score: %f" % (sum((metrics.fMeasure(float(i))) for i in range(num_classes)) / num_classes))

Macro-Average F1 score: 0.495644


In [31]:
spark.stop()

# Archive

In [None]:
# Create a cross-validator using the pipeline, parameter grid, and evaluator
# cv = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

# Fit the cross-validator to the training data
# cv_model = cv.fit(training_data)

# Make predictions on the test data
# predictions = cv_model.transform(test_data)

# Compute the micro-averaged F1-score for the predictions
# evaluator.evaluate(predictions)