# Part 2

In [1]:
# Start (or reuse) the Spark session used by the whole notebook.
from pyspark.sql import SparkSession
from pyspark.sql import types as T

spark = SparkSession.builder.appName("A2-Part2-Pipeline").getOrCreate()

# Input locations. DEBUG=True reads the small development set,
# DEBUG=False reads the combined review file.
DEBUG = True
RAW_PATH = (
    "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
    if DEBUG
    else "hdfs:///user/dic25_shared/amazon-reviews/full/reviewscombined.json"
)
stopwordsPath = "Exercise_1/stopwords.txt"

# Explicit schema of the review JSON, passed to the reader instead of letting
# Spark infer it (the original author's rationale: faster reading).
# Every field is declared nullable.
_REVIEW_FIELDS = [
    ("reviewerID",     T.StringType()),
    ("asin",           T.StringType()),
    ("reviewerName",   T.StringType()),
    ("helpful",        T.ArrayType(T.IntegerType())),
    ("reviewText",     T.StringType()),
    ("overall",        T.FloatType()),
    ("summary",        T.StringType()),
    ("unixReviewTime", T.LongType()),
    ("reviewTime",     T.StringType()),
    ("category",       T.StringType()),
]
review_schema = T.StructType(
    [T.StructField(field_name, field_type, True) for field_name, field_type in _REVIEW_FIELDS]
)

# Keep only the review text (renamed to `text`) and the category, and drop
# rows where either of the two is null.
raw_reviews = spark.read.schema(review_schema).json(RAW_PATH)
df = (
    raw_reviews
    .selectExpr("reviewText AS text", "category")
    .na.drop(subset=["text", "category"])
)

# Stop-word list: every line of the stop-word file, collected to the driver.
stopwords = spark.sparkContext.textFile(stopwordsPath).collect()


SLF4J: Class path contains multiple SLF4J bindings.


25/05/11 00:07:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Build the pipeline

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer,
    StopWordsRemover,
    CountVectorizer,
    IDF,
    ChiSqSelector,
    StringIndexer
)

# Stage 1 - tokenisation and lower-casing.
# With gaps=True the pattern describes the delimiters (the split points),
# not the tokens themselves.
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="tokens",
    toLowercase=True,
    gaps=True,
    pattern=r"""[ \t0-9()\[\]{}.!?,;:+=\-_"'`~#@&*%€$§\\/]+""",
)

# Stage 2 - drop every token that appears in the loaded stop-word list.
stopper = StopWordsRemover(
    inputCol="tokens",
    outputCol="clean_tokens",
    stopWords=stopwords,
)

# Stage 3 - term counts over a vocabulary of at most 50,000 terms;
# a term must occur in at least 2 documents to enter the vocabulary.
cv = CountVectorizer(
    inputCol="clean_tokens",
    outputCol="tf",
    vocabSize=50_000,
    minDF=2,
)

# Stage 4 - reweight the term counts by inverse document frequency.
idf = IDF(inputCol="tf", outputCol="tfidf")

# Stage 5 - map the category string to a numeric label column.
encoder = StringIndexer(inputCol="category", outputCol="label")

# Stage 6 - keep the 2000 TF-IDF features ranked highest by the chi² test
# against the label.
selector = ChiSqSelector(
    featuresCol="tfidf",
    labelCol="label",
    outputCol="chi2_features",
    numTopFeatures=2000,
)

# Assemble the stages in execution order.
feature_stages = [tokenizer, stopper, cv, idf, encoder, selector]
pipeline = Pipeline(stages=feature_stages)

Fit pipeline

In [3]:
# Fit the Part 2 feature pipeline and recover the selected terms.
spark.conf.set("spark.sql.shuffle.partitions", "128")

# Cache the input for the duration of the fit (original rationale: better for
# large datasets). The try/finally guarantees the cache is released even when
# the fit raises, so a failed run does not leave the DataFrame pinned.
df.persist()
try:
    model = pipeline.fit(df)
finally:
    df.unpersist()

# Stage 2 of the pipeline is the CountVectorizer: its vocabulary maps a
# feature index to the term it stands for.
vocab = model.stages[2].vocabulary
# The last stage is the ChiSqSelector: indices of the features it kept.
selected = model.stages[-1].selectedFeatures

# Translate the selected feature indices back into terms.
selected_terms = [vocab[i] for i in selected]

Output file

In [4]:
# Write the selected terms to output_ds.txt, one term per line
# (no newline after the last term).
import pathlib, os, codecs

out_file = pathlib.Path("output_ds.txt")
with out_file.open("w", encoding="utf-8") as handle:
    handle.write("\n".join(selected_terms))
print(f"Wrote {len(selected_terms)} terms to {out_file.resolve()}")

Wrote 2000 terms to /home/e12412694/DIC/Exercise_2/output_ds.txt


In [5]:

# ────────────────────────────────────────────────────────────────────────
# Optional: automatic comparison with Assignment 1
# ────────────────────────────────────────────────────────────────────────
# old_terms = pathlib.Path("assignment1_terms.txt").read_text(encoding="utf-8").splitlines()
# old_set, new_set = set(old_terms), set(selected_terms)
# print("\n🔹  Terms kept in *both* assignments:", len(old_set & new_set))
# print("🔸  Terms only in Assignment 1:",      len(old_set - new_set))
# print("🔸  Terms only in Spark pipeline:",    len(new_set - old_set))
# (You can also diff the two files directly with any text-diff tool.)


In [6]:
# Compare the terms selected by the Spark pipeline with the Assignment 1 output.

# Step 1: Read the Assignment 1 file; its last line is the merged vocabulary.
# An empty file yields an empty vocabulary instead of an IndexError.
lines = pathlib.Path("../Exercise_1/src/output_dev.txt").read_text(encoding="utf-8").splitlines()
merged_vocab_line = lines[-1] if lines else ""

# Step 2: Split the merged vocabulary into whitespace-separated terms
old_terms = merged_vocab_line.strip().split()
old_set = set(old_terms)

# Step 3: Load the terms selected by the Spark pipeline (output_ds.txt).
# Blank lines are skipped so that an empty string is never counted as a term.
new_terms = pathlib.Path("output_ds.txt").read_text(encoding="utf-8").splitlines()
new_set = {term.strip() for term in new_terms if term.strip()}

# Step 4: Compare the two sets
common_terms = old_set & new_set
only_in_old = old_set - new_set
only_in_new = new_set - old_set

# Step 5: Print results. The overlap is reported as 0% when no Spark terms
# were loaded, instead of failing with a ZeroDivisionError.
overlap_pct = len(common_terms) / len(new_set) * 100 if new_set else 0.0
print(f"Terms in BOTH assignments: {len(common_terms)}")
print(f"Terms ONLY in Assignment 1: {len(only_in_old)}")
print(f"Terms ONLY in Spark pipeline (Assignment 2): {len(only_in_new)}")
print(f"Overlap: {overlap_pct:.2f}% of Spark-selected terms are also in Assignment 1")

Terms in BOTH assignments: 2000
Terms ONLY in Assignment 1: 97017
Terms ONLY in Spark pipeline (Assignment 2): 0
Overlap: 100.00% of Spark-selected terms are also in Assignment 1


# Part 3
**Note:** work in progress — not working yet.

In [7]:
#df_3 = model.transform(df).select("label","chi2_features").toDF("label","chi2_features")
#display(df_3)

In [8]:
# Imports for Part 3 (classification, tuning and evaluation).
from pyspark.ml.feature import Normalizer
from pyspark.ml.classification import LinearSVC,  OneVsRest
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
spark.sparkContext.setLogLevel("ERROR")   # only log errors, so WARN messages no longer appear

In [12]:
# Scale each chi²-selected feature vector to unit L2 norm.
normalizer = Normalizer(p=2.0, inputCol="chi2_features", outputCol="norm_features")

# Binary linear SVM used as the base learner.
# maxIter=2 is a deliberately tiny value for quick test runs.
svm = LinearSVC(
    labelCol="label",
    featuresCol="norm_features",
    regParam=0.1,
    standardization=True,
    maxIter=2,
)

# One-vs-Rest wrapper that turns the binary SVM into a multi-class classifier.
ovr = OneVsRest(
    labelCol="label",
    featuresCol="norm_features",
    classifier=svm,
)

# Part 3 pipeline: the Part 2 feature stages followed by normalisation and
# the classifier. This rebinds `pipeline`.
feature_stages = [tokenizer, stopper, cv, idf, encoder, selector]
pipeline = Pipeline(stages=feature_stages + [normalizer, ovr])

# 70 % / 15 % / 15 % train / validation / test split with a fixed seed.
train, val, test = df.randomSplit([0.7, 0.15, 0.15], seed=42)

# F1 evaluator comparing the `prediction` column against `label`.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)

In [10]:
# Fit the complete Part 3 pipeline on the training split.
# NOTE: this rebinds `model`, which held the fitted Part 2 feature pipeline.
model = pipeline.fit(train)

# Score the held-out test split and report its F1 measure.
predictions = model.transform(test)
f1 = evaluator.evaluate(predictions)
print(f"Test F1 score: {f1:.4f}")

# Author's note: this cell took roughly 15 minutes to run.

Test F1 score: 0.5608


In [None]:
# Hyper-parameter grid for the binary SVM inside One-vs-Rest:
# 3 regParam values x 2 maxIter values x 2 standardization settings.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(svm.regParam, [0.01, 0.1, 1.0])
grid_builder = grid_builder.addGrid(svm.maxIter, [5, 10])
grid_builder = grid_builder.addGrid(svm.standardization, [True, False])
paramGrid = grid_builder.build()

# 3-fold cross-validation over the whole pipeline, two candidates at a time.
# NOTE(review): `cv` previously named the CountVectorizer stage. The pipeline
# object already holds that stage, but re-running the pipeline-definition
# cells after this one would pick up the CrossValidator instead.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=3,
    seed=42,
    parallelism=2,
)

# Run the search on the training split.
cv_model = cv.fit(train)

In [None]:
# Candidate parameter maps and their average cross-validation metric
# (the evaluator is configured for F1), in matching order.
param_maps = cv.getEstimatorParamMaps()
avg_metrics = cv_model.avgMetrics

# One line per candidate.
print("Cross-Validation Results:")
for i, (params, score) in enumerate(zip(param_maps, avg_metrics)):
    print(f"Model {i+1}: F1 = {score:.4f}, Params: {params}")

# Show the parameters of every stage of the winning pipeline.
best_model = cv_model.bestModel
print("Best Model Params:")
stages_with_params = [s for s in best_model.stages if hasattr(s, 'extractParamMap')]
for stage in stages_with_params:
    print(stage.__class__.__name__, stage.extractParamMap())

# Final check of the winning pipeline on the untouched test split.
predictions_test = best_model.transform(test)
f1_test = evaluator.evaluate(predictions_test)
print(f"Best Model F1 Score on Test Set: {f1_test:.4f}")

In [10]:
# Previous version of the Part 3 experiment, kept for reference.
# It tunes the classifier on a 1 % sample of the already transformed data.

# Fix 1: the two statements that started this cell were fused on one line,
#        which is a SyntaxError.
# Fix 2: the transformed/sampled frame is no longer stored back into `df`.
#        `df` must keep holding the raw (text, category) reviews, otherwise a
#        later `model.transform(df)` fails because the pipeline's output
#        columns already exist on its input.
transformedData = model.transform(df)
sampled_features = transformedData.sample(fraction=0.01, seed=4242)

# Keep only label + selected features and L2-normalise the feature vectors.
df2 = sampled_features.select("label", "chi2_features").toDF("label", "selected")
normalizer = Normalizer().setInputCol("selected").setOutputCol("normalized").setP(2.0)
df_norm = normalizer.transform(df2)
df3 = df_norm.select("label", "normalized").toDF("label", "normalized")
train, val, test = df3.randomSplit([0.7, 0.15, 0.15], seed=4242)

# Baseline: One-vs-Rest linear SVM with fixed parameters, fitted on the training split.
lsvc = LinearSVC(featuresCol="normalized", labelCol="label", maxIter=10)
ovr = OneVsRest(classifier=lsvc, featuresCol="normalized", labelCol="label")
ovr_model = ovr.fit(train)

# Parameter grid. Keys follow a "classifier__<param>" naming scheme; the part
# after "__" is looked up as a Param attribute on the LinearSVC instance.
param_grid_dict = {
    "classifier__regParam": [0.001, 0.01, 0.1],
    "classifier__standardization": [True, False],
    "classifier__maxIter": [10, 8]
}
param_grid_builder = ParamGridBuilder()
for param, values in param_grid_dict.items():
    param_grid_builder = param_grid_builder.addGrid(getattr(lsvc, param.split("__")[1]), values)

# Building the parameter grid using the added grids
param_grid = param_grid_builder.build()

# The F1 score is the harmonic mean of precision and recall
evaluator = MulticlassClassificationEvaluator(metricName="f1")
cv = CrossValidator(estimator=ovr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=2)

# Class distribution of the validation split, then a 2-fold search on that split.
# NOTE(review): the search is fitted on `val` while the baseline is fitted on
# `train` — confirm this is intended before comparing the two scores.
val.groupBy("label").count().show()
cv_model = cv.fit(val)
best_model = cv_model.bestModel

# Compare the baseline and the tuned model on the test split.
ovr_predictions_test = ovr_model.transform(test)
ovr_f1_score = evaluator.evaluate(ovr_predictions_test)
print(f"OVR F1 Score: {ovr_f1_score}")
best_model_predictions_test = best_model.transform(test)
best_model_f1_score = evaluator.evaluate(best_model_predictions_test)
print(f"Best Model F1 Score: {best_model_f1_score}")

KeyboardInterrupt: 

In [None]:
# ── 1.  One-off feature extraction  ─────────────────────────────────────────
spark.sparkContext.setLogLevel("ERROR")   # suppress WARN messages

# Run the already fitted pipeline model over the reviews once and keep only
# the chi²-selected features plus the label. The result is cached so the
# tuning below can reuse it instead of recomputing the text features.
# (Per the original note, `model` is expected to provide the fitted
# CountVectorizer + IDF + Chi² stages and to add tf, tfidf and chi2_features.)
transformed_reviews = model.transform(df)
features_df = transformed_reviews.select("chi2_features", "label").cache()
print(features_df.count(), "Dokumente nach Feature-Extraktion")

# ── 2.  Light pipeline for the classifier  ──────────────────────────────────
from pyspark.ml import Pipeline
from pyspark.ml.feature import Normalizer, StringIndexer
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index the label, L2-normalise the features, then One-vs-Rest linear SVM.
indexer = StringIndexer(inputCol="label", outputCol="label_idx")
norm = Normalizer(inputCol="chi2_features", outputCol="features", p=2.0)
svm_bin = LinearSVC(featuresCol="features", labelCol="label_idx", maxIter=10)
ovr = OneVsRest(classifier=svm_bin, featuresCol="features", labelCol="label_idx")

pipe3 = Pipeline(stages=[indexer, norm, ovr])

# Only maxIter is varied; regParam and standardization are pinned to a
# single value each, which gives two candidates.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(svm_bin.regParam, [0.01])
grid_builder = grid_builder.addGrid(svm_bin.standardization, [True])
grid_builder = grid_builder.addGrid(svm_bin.maxIter, [50, 200])
param_grid = grid_builder.build()

# F1 evaluator on the re-indexed label, used for model selection and for the
# final test score.
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label_idx", metricName="f1")

# Single 80/20 train/validation split instead of k-fold cross-validation.
tvs = TrainValidationSplit(
    estimator=pipe3,
    estimatorParamMaps=param_grid,
    evaluator=f1_evaluator,
    trainRatio=0.8,
    parallelism=4,
    seed=42,
)

# Hold out 20 % for testing, tune on the rest, then score the best model.
train_df, test_df = features_df.randomSplit([0.8, 0.2], seed=42)
tvs_model = tvs.fit(train_df)
best_model = tvs_model.bestModel
test_predictions = best_model.transform(test_df)
print("F1 on test:", f1_evaluator.evaluate(test_predictions))
