# Part 2: Datasets/DataFrames

In [None]:
import pandas as pd
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import ChiSqSelector, RegexTokenizer, StringIndexer, IDF, StopWordsRemover, \
    Normalizer, CountVectorizer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, TrainValidationSplitModel
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) the SparkSession for the "ChiSquaredPipeline" application
spark = (
    SparkSession.builder
    .appName("ChiSquaredPipeline")
    .getOrCreate()
)

# Keep a handle on the SparkContext that backs the session
sc = spark.sparkContext

# Set the log level to ERROR
sc.setLogLevel("ERROR")

In [None]:
# Read the review file into a DataFrame.
# Exactly one review_path assignment should be active; the commented-out lines
# are alternative dataset locations that can be swapped in.
# review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
review_path = "hdfs:///user/e11809642/reviews/reduced_devset.json"
# review_path = "hdfs:///user/e11809642/reviews/tiny_devset.json"
df = spark.read.json(review_path)

In [None]:
# Load the stopword file as a set (one stopword per line) from the local file
# system. The context manager closes the file handle as soon as the contents
# have been read instead of leaving it open until garbage collection.
with open("stopwords.txt") as stopword_file:
    stopwords = set(stopword_file.read().splitlines())

In [None]:
# Lowercase the review text and split it into words. With gaps=True the pattern
# describes the separators between tokens: every run of characters other than
# ASCII letters, '<', '>', '^' and '|'.
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="words",
    pattern="[^a-zA-Z<>^|]+",
    gaps=True,
    toLowercase=True,
)

In [None]:
# Drop every token that appears in the stopword set loaded above
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=list(stopwords),
)

In [None]:
# Map the category column to a numeric index. Indices are assigned in
# alphabetically ascending order so that they can easily be mapped back to the
# category names later.
indexer = StringIndexer(
    inputCol="category",
    outputCol="categoryIndex",
    stringOrderType="alphabetAsc",
)

In [None]:
# Term-frequency vector per document (review), built from the filtered tokens
# Alternative kept for reference:
# tf = HashingTF(inputCol="filtered", outputCol="rawFeatures")
tf = CountVectorizer(
    inputCol="filtered",
    outputCol="rawFeatures",
)

In [None]:
# Rescale the raw term frequencies by their inverse document frequency
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [None]:
# Keep the 2000 features ranked highest by the chi-squared test against the
# category index
css = ChiSqSelector(
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="categoryIndex",
    numTopFeatures=2000,
)

In [None]:
# Create a pipeline combining all feature-extraction and selection steps
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        indexer,
        tf,
        idf,
        css,
    ]
)

In [None]:
# Fit the pipeline to the DataFrame
# model = pipeline.fit(df)

In [None]:
# Get the vocabulary and selected features
# vocab = model.stages[3].vocabulary
# selected_features = model.stages[5].selectedFeatures

In [None]:
# Save the names of the selected features to a file sorted alphabetically in ascending order (space separated)
# with open("output_ds.txt", "w") as f:
    # f.write(" ".join(sorted([vocab[i] for i in selected_features])))

# Part 3: Text Classification

In [None]:
# Randomly split the reviews 80/20 into training and test data (fixed seed 42
# so the split is reproducible)
training_data, test_data = df.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [None]:
# Scale each selected-feature vector to unit $L^2$ norm
normalizer = Normalizer(
    inputCol="selectedFeatures",
    outputCol="normFeatures",
    p=2.0,
)

In [None]:
# Linear SVM trained on the normalized features with the category index as label
svm = LinearSVC(
    featuresCol="normFeatures",
    labelCol="categoryIndex",
)

In [None]:
# Wrap the SVM in a one-vs-rest classifier so it can handle more than two
# categories
ovr = OneVsRest(
    classifier=svm,
    featuresCol="normFeatures",
    labelCol="categoryIndex",
    parallelism=4,
)

In [None]:
# Full classification pipeline: feature extraction and selection, followed by
# normalization and the one-vs-rest SVM
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        indexer,
        tf,
        idf,
        css,
        normalizer,
        ovr,
    ]
)

In [None]:
# Evaluator that scores predictions against the category index using the
# "f1" metric
evaluator = MulticlassClassificationEvaluator(
    labelCol="categoryIndex",
    predictionCol="prediction",
    metricName="f1",
)

In [None]:
# Parameter grid explored during model selection:
# - numTopFeatures (chi-squared selector): 50, 2000
# - regParam (SVM): 0.1, 0.01, 0.001
# - standardization (SVM): True, False
# - maxIter (SVM): 10, 100
param_grid = (
    ParamGridBuilder()
    .addGrid(css.numTopFeatures, [50, 2000])
    .addGrid(svm.regParam, [0.1, 0.01, 0.001])
    .addGrid(svm.standardization, [True, False])
    .addGrid(svm.maxIter, [10, 100])
    .build()
)

In [None]:
# Train-validation split over the pipeline and parameter grid, scored by the
# evaluator; trainRatio=0.8 uses 80% of the input for training and the
# remainder for validation
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    trainRatio=0.8,
    seed=42,
    parallelism=4,
    collectSubModels=True,
)

In [None]:
# Fit the train-validation-split to the training data; this trains the pipeline
# for the parameter combinations in param_grid and yields the fitted model
tvs_model = tvs.fit(training_data)

In [None]:
# Persist the entire fitted tvs_model under the path "tvs_model", overwriting
# anything previously saved at that path
tvs_model.write().overwrite().save("tvs_model")

In [None]:
# Load the fitted tvs_model back from the path it was saved to, so the
# following cells can run without refitting
tvs_model = TrainValidationSplitModel.load("tvs_model")

In [None]:
# Make predictions on the held-out test data with the fitted model
predictions = tvs_model.transform(test_data)

In [None]:
# Compute the F1-score for the predictions on the test data.
# NOTE: the evaluator's "f1" metric is Spark's weighted F1 (per-class F1
# averaged by class frequency), not a micro-average.
evaluator.evaluate(predictions)

In [None]:
# Retrieve the validation metrics for the models trained in the
# train-validation-split (paired by index with param_grid further below)
results = tvs_model.validationMetrics

In [None]:
# Build a column-oriented dictionary: one list per tuned parameter (keyed by
# the parameter name) plus one list holding the matching validation metric
data = {}

# Walk the parameter maps and their validation metrics in lockstep
for param_map, metric in zip(param_grid, results):
    # Record the value of every parameter in this combination
    for param, value in param_map.items():
        data.setdefault(param.name, []).append(value)
    # Record the validation metric achieved by this combination
    data.setdefault("Evaluation Metric", []).append(metric)

# Show floats with up to 16 significant digits in Pandas output
pd.set_option("display.float_format", "{:.16g}".format)

# Render the collected results as a DataFrame
pd.DataFrame(data)

In [None]:
# Collect the distinct category values and sort them alphabetically in
# ascending order
distinct_category_rows = df.select("category").distinct().collect()
category_names = sorted(row["category"] for row in distinct_category_rows)

In [None]:
# Get the number of categories (length of the sorted category name list)
num_classes = len(category_names)

In [None]:
# Build multiclass metrics for the test data from (prediction, label) pairs
prediction_and_label = predictions.select("prediction", "categoryIndex").rdd
metrics = MulticlassMetrics(prediction_and_label)

In [None]:
# Fetch the confusion matrix from the metrics object
confusion_matrix = metrics.confusionMatrix()

# Show it as a Pandas DataFrame whose rows and columns are labelled with the
# category names instead of the numeric category indices
pd.DataFrame(
    confusion_matrix.toArray(),
    index=category_names,
    columns=category_names,
)

In [None]:
# Gather the per-category F1-score (one row per category name) into a DataFrame
per_class_f1 = [metrics.fMeasure(float(label)) for label in range(num_classes)]
pd.DataFrame(per_class_f1, index=category_names, columns=["f1_score"])

In [None]:
# Print the macro-average F1-score on the test data: the unweighted mean of the
# per-category F1-scores
f1_total = sum(metrics.fMeasure(float(label)) for label in range(num_classes))
print("Macro-Average F1 score: %f" % (f1_total / num_classes))

In [None]:
# Stop the SparkSession now that all results have been computed
spark.stop()

# Archive

In [None]:
# Create a cross-validator using the pipeline, parameter grid, and evaluator
# cv = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

# Fit the cross-validator to the training data
# cv_model = cv.fit(training_data)

# Make predictions on the test data
# predictions = cv_model.transform(test_data)

# Compute the weighted F1-score for the predictions
# evaluator.evaluate(predictions)