# Part 2: Datasets/DataFrames

In [None]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import ChiSqSelector, RegexTokenizer, StringIndexer, IDF, StopWordsRemover, \
    Normalizer, CountVectorizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession

In [None]:
# Obtain the SparkSession for this application, creating it if none exists yet
spark = (
    SparkSession.builder
    .appName("ChiSquaredPipeline")
    .getOrCreate()
)

# The session exposes its underlying SparkContext
sc = spark.sparkContext

In [None]:
# Load the JSON review file from HDFS into a DataFrame.
# Alternative input (presumably the complete dataset; swap in when needed):
# review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviewscombined.json"
review_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
df = spark.read.json(path=review_path)

In [None]:
# Load the stopword file (one word per line) from the local file system into a set.
# The context manager guarantees the file handle is closed once the content is read.
with open("stopwords.txt") as stopword_file:
    stopwords = set(stopword_file.read().splitlines())

In [None]:
# Split the review text into lower-cased tokens. With gaps=True the pattern describes
# the separators: any run of characters other than ASCII letters and < > ^ |
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="words",
    pattern="[^a-zA-Z<>^|]+",
    gaps=True,
    toLowercase=True,
)

In [None]:
# Drop every token that appears in the loaded stopword set
# (the set is converted to a list for the stopWords argument)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=list(stopwords),
)

In [None]:
# Encode the category string as a numeric label. Indices are assigned in ascending
# alphabetical order so that they can be mapped back to category names later by
# simply sorting the names.
indexer = StringIndexer(
    inputCol="category",
    outputCol="categoryIndex",
    stringOrderType="alphabetAsc",
)

In [None]:
# Build the term-frequency vector of each review from its filtered tokens.
# CountVectorizer is used because its fitted model exposes the vocabulary that is
# needed later to translate feature indices back into terms.
# Alternative that was considered:
# tf = HashingTF(inputCol="filtered", outputCol="rawFeatures")
tf = CountVectorizer(
    inputCol="filtered",
    outputCol="rawFeatures",
)

In [None]:
# Re-weight the raw term frequencies by inverse document frequency (TF-IDF)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [None]:
# Keep the 2000 features that the chi-squared test ranks highest with respect to
# the category label
css = ChiSqSelector(
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="categoryIndex",
    numTopFeatures=2000,
)

In [None]:
# Chain the preprocessing and feature-selection stages into a single pipeline.
# The stage order matters: later cells address the fitted stages by position.
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        indexer,
        tf,
        idf,
        css,
    ]
)

In [None]:
# Fit every stage of the feature-selection pipeline on the complete review DataFrame
pipeline_model = pipeline.fit(dataset=df)

In [None]:
# Read the results out of the fitted pipeline by stage position:
# index 3 is the fitted CountVectorizer (`tf`), index 5 the fitted ChiSqSelector (`css`)
fitted_stages = pipeline_model.stages
vocab = fitted_stages[3].vocabulary
selected_features = fitted_stages[5].selectedFeatures

In [None]:
# Translate the selected feature indices into their terms and write them to a local
# file as one space-separated line, sorted alphabetically in ascending order
with open("selected_features.txt", "w") as output_file:
    selected_terms = sorted(vocab[index] for index in selected_features)
    output_file.write(" ".join(selected_terms))

# Part 3: Text Classification

In [None]:
# Randomly split the reviews into a training portion (weight 0.8) and a held-out
# test portion (weight 0.2); the fixed seed keeps the split reproducible
training_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Collect the distinct category values to the driver and sort them alphabetically,
# matching the ascending alphabetical order used by the StringIndexer
category_rows = df.select("category").distinct().collect()
category_names = sorted(row["category"] for row in category_rows)

# Number of distinct categories
num_classes = len(category_names)

In [None]:
# Normalize each Vector using $L^2$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)

In [None]:
# Create an SVM classifier using the normalized features and the category index
svm = LinearSVC(featuresCol="normFeatures", labelCol="categoryIndex")

In [None]:
# Create a one-vs-rest classifier using the SVM classifier
ovr = OneVsRest(classifier=svm, featuresCol="normFeatures", labelCol="categoryIndex")

In [None]:
# Create a pipeline combining all steps
pipeline = Pipeline(stages=[tokenizer, remover, indexer, tf, idf, normalizer, css, ovr])

In [None]:
# Evaluator that scores predictions against the category index with the "f1" metric
evaluator = MulticlassClassificationEvaluator(
    labelCol="categoryIndex",
    predictionCol="prediction",
    metricName="f1",
)

In [None]:
# Train the complete classification pipeline on the training portion only
pipeline_model = pipeline.fit(dataset=training_data)

In [None]:
# Apply the fitted pipeline to the held-out test portion to obtain predictions
predictions = pipeline_model.transform(dataset=test_data)

In [None]:
# Compute the F1 score on the test data.
# The evaluator's "f1" metric is the weighted F-measure (per-class F1 averaged with
# class-frequency weights), not a micro-average, so the output is labelled accordingly.
f1_score = evaluator.evaluate(predictions)
print("Weighted F1 score: %f" % f1_score)

In [None]:
# Build the RDD-based multiclass metrics from (prediction, label) pairs of the
# test-set predictions
prediction_and_label = predictions.select("prediction", "categoryIndex")
metrics = MulticlassMetrics(prediction_and_label.rdd)

In [None]:
# Select the confusion matrix from the metrics object
confusion_matrix = metrics.confusionMatrix()

# Convert the confusion matrix to a Pandas DataFrame for better visualization, mapping the
# category indices to the category names. A dedicated name is used so that `df` keeps
# referring to the Spark DataFrame of reviews and earlier cells remain re-runnable.
confusion_df = pd.DataFrame(confusion_matrix.toArray(), index=category_names, columns=category_names)
confusion_df

In [None]:
# Print the F1 score of every category and accumulate the scores for the macro average.
# The position of a name in the alphabetically sorted category list is its label index.
f1_total = 0
for class_index, category in enumerate(category_names):
    class_f1 = metrics.fMeasure(float(class_index))
    f1_total += class_f1
    print("F1 score for %s: %f" % (category, class_f1))
# Macro average: unweighted mean of the per-category F1 scores on the test data
print("Macro-Average F1 score: %f" % (f1_total / num_classes))

In [None]:
# Hyper-parameter grid explored during model selection (2 * 3 * 2 * 2 = 24 combinations):
# - ChiSqSelector numTopFeatures: 50, 2000
# - LinearSVC regParam: 0.1, 0.01, 0.001
# - LinearSVC standardization: True, False
# - LinearSVC maxIter: 10, 100
paramGrid = (
    ParamGridBuilder()
    .addGrid(css.numTopFeatures, [50, 2000])
    .addGrid(svm.regParam, [0.1, 0.01, 0.001])
    .addGrid(svm.standardization, [True, False])
    .addGrid(svm.maxIter, [10, 100])
    .build()
)

In [None]:
# 3-fold cross-validation over the parameter grid, scoring each candidate pipeline
# with the evaluator
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)

In [None]:
# Run the cross-validated grid search on the training portion
cv_model = cv.fit(dataset=training_data)

In [None]:
# Check the parameters that were selected by the cross-validator for the best model.
# `paramGrid` is a list of param maps (dicts of Param -> value) and `avgMetrics` holds
# the fold-averaged metric of each map in the same order, so the winning map is the one
# with the best average metric. Iterating the grid itself yields dicts, not Params.
average_metrics = cv_model.avgMetrics
pick_best = max if evaluator.isLargerBetter() else min
best_index = pick_best(range(len(paramGrid)), key=lambda index: average_metrics[index])
for param, value in paramGrid[best_index].items():
    print(param.name, ":", value)

In [None]:
# Predict the held-out test portion with the cross-validated model
predictions = cv_model.transform(dataset=test_data)

In [None]:
# Score the cross-validated model's test predictions with the evaluator's F1 metric
# and display the value as the cell output
f1_score = evaluator.evaluate(dataset=predictions)
f1_score

In [None]:
# Single train/validation split as an alternative to cross-validation:
# trainRatio=0.8 of the data passed to fit() is used for training each candidate,
# the rest for validating it
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    trainRatio=0.8,
)

In [None]:
# Run the train/validation grid search on the training portion
tvs_model = tvs.fit(dataset=training_data)

In [None]:
# Predict the held-out test portion with the train/validation-split model
predictions = tvs_model.transform(dataset=test_data)

In [None]:
# Score the train/validation-split model's test predictions with the evaluator's
# F1 metric and display the value as the cell output
f1_score = evaluator.evaluate(dataset=predictions)
f1_score

In [None]:
# Stop the SparkSession now that all work is done
spark.stop()