In [1]:
#open spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("A2-Part2-Pipeline").getOrCreate()

#load reviews and stopwords
DEBUG = True
if DEBUG:
    RAW_PATH = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
else:
    RAW_PATH = "hdfs:///user/dic25_shared/amazon-reviews/full/reviewscombined.json"
stopwordsPath = "Exercise_1/assets/stopwords.txt"
#define structure of json for faster reading
from pyspark.sql import types as T
review_schema = T.StructType([
     T.StructField("reviewerID",      T.StringType(),  True),
     T.StructField("asin",            T.StringType(),  True),
     T.StructField("reviewerName",    T.StringType(),  True),
     T.StructField("helpful",         T.ArrayType(T.IntegerType()), True),
     T.StructField("reviewText",      T.StringType(),  True),
     T.StructField("overall",         T.FloatType(),   True),
     T.StructField("summary",         T.StringType(),  True),
     T.StructField("unixReviewTime",  T.LongType(),    True),
     T.StructField("reviewTime",      T.StringType(),  True),
     T.StructField("category",        T.StringType(),  True),
 ])
#read and select category and review
df = (
    spark.read
         .schema(review_schema)
         .json(RAW_PATH)
         .selectExpr("reviewText AS text",
                     "category")
    .na.drop(subset=["text", "category"])
)

# reading the stopwords
stopwords = spark.sparkContext.textFile(stopwordsPath).collect()


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/05/10 04:31:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/10 04:31:04 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/10 04:31:06 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


                                                                                

Build the pipeline

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer,
    StopWordsRemover,
    CountVectorizer,
    IDF,
    ChiSqSelector,
    StringIndexer
)

# 1: Tokenisation and lower-casing via RegexTokenizer
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="tokens",
    pattern=r"""[ \t0-9()\[\]{}.!?,;:+=\-_"'`~#@&*%€$§\\/]+""",  # delimiters
    gaps=True,                # pattern defines the split points
    toLowercase=True,
)

# 2️: Stopword removal
stopper = StopWordsRemover(inputCol="tokens",stopWords = stopwords, outputCol="clean_tokens")

# 3️: Vercorizing
cv = CountVectorizer(
    inputCol="clean_tokens",
    outputCol="tf",
    minDF=2,
    vocabSize=50_000, 
)

# 4️: IDF weighting
idf = IDF(inputCol="tf", outputCol="tfidf")

# 5: encode the category column from string to int
encoder = StringIndexer(inputCol="category", outputCol="label")

# 5️: select top 2000 terms by chi²
selector = ChiSqSelector(
    numTopFeatures=2_000,
    featuresCol="tfidf",
    outputCol="chi2_features",
    labelCol="label",
)

# 6️ Pipeline assembly
pipeline = Pipeline(stages=[tokenizer, stopper, cv, idf, encoder, selector])

Fit pipeline

In [3]:
# Fit pipeline
spark.conf.set("spark.sql.shuffle.partitions", "128")
model = pipeline.fit(df)

# Extract vocabulary & selected indices
vocab = model.stages[2].vocabulary                        # get vocabulary from Vecorizer
selected = model.stages[-1].selectedFeatures             # index of term after selector

selected_terms = [vocab[i] for i in selected]            # map indices to terms

                                                                                

25/05/10 04:32:08 WARN DAGScheduler: Broadcasting large task binary with size 1238.7 KiB
25/05/10 04:32:08 WARN DAGScheduler: Broadcasting large task binary with size 1240.8 KiB


                                                                                

25/05/10 04:32:17 WARN DAGScheduler: Broadcasting large task binary with size 1242.8 KiB


                                                                                

Output file

In [4]:
# Saves top 2000 terms to output_ds.txt (one term per line)
import pathlib, os, codecs

out_file = pathlib.Path("output_ds.txt")
out_file.write_text("\n".join(selected_terms), encoding="utf-8")
print(f"  Wrote {len(selected_terms)} terms to {out_file.resolve()}")

  Wrote 2000 terms to /home/e12436447/Exercise_1/src/output_ds.txt


In [5]:

# ────────────────────────────────────────────────────────────────────────
# Optional: automatic comparison with Assignment 1
# ────────────────────────────────────────────────────────────────────────
# old_terms = pathlib.Path("assignment1_terms.txt").read_text(encoding="utf-8").splitlines()
# old_set, new_set = set(old_terms), set(selected_terms)
# print("\n🔹  Terms kept in *both* assignments:", len(old_set & new_set))
# print("🔸  Terms only in Assignment 1:",      len(old_set - new_set))
# print("🔸  Terms only in Spark pipeline:",    len(new_set - old_set))
# (You can also diff the two files directly with any text-diff tool.)


Part 3 // Not working yet

In [6]:
#df_3 = model.transform(df).select("label","chi2_features").toDF("label","chi2_features")


In [7]:
#display(df_3)

In [None]:
transformedData = model.transform(df)

In [13]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.classification import LinearSVC,  OneVsRest
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
spark.sparkContext.setLogLevel("ERROR")   # WARN-Meldungen verschwinden
df=transformedData
df=df.sample(fraction=0.01, seed=4242)
df2=df.select("label", "chi2_features").toDF("label", "selected")
normalizer = Normalizer().setInputCol("selected").setOutputCol("normalized").setP(2.0)
df_norm =normalizer.transform(df2)
df3=df_norm.select("label", "normalized").toDF("label", "normalized")
train,val, test = df3.randomSplit([0.7,0.15, 0.15], seed = 4242)
lsvc = LinearSVC(featuresCol="normalized", labelCol="label", maxIter=10)
ovr = OneVsRest(classifier=lsvc, featuresCol="normalized", labelCol="label")
ovr_model = ovr.fit(train)
param_grid_dict = {
    "classifier__regParam": [0.001, 0.01, 0.1],
    "classifier__standardization": [True, False],
    "classifier__maxIter": [10, 8]
}
param_grid_builder = ParamGridBuilder()
for param, values in param_grid_dict.items():   
    param_grid_builder = param_grid_builder.addGrid(getattr(lsvc, param.split("__")[1]), values)

# Building the parameter grid using the added grids
param_grid = param_grid_builder.build()
# The F1 score is the harmonic mean of precision and recall
evaluator=MulticlassClassificationEvaluator(metricName="f1")
cv=CrossValidator(estimator=ovr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=2)
val.groupBy("label").count().show()
cv_model=cv.fit(val)
best_model=cv_model.bestModel
ovr_predictions_test = ovr_model.transform(test)
ovr_f1_score = evaluator.evaluate(ovr_predictions_test)
print(f"OVR F1 Score: {ovr_f1_score}")
best_model_predictions_test = best_model.transform(test)
best_model_f1_score = evaluator.evaluate(best_model_predictions_test)
print(f"Best Model F1 Score: {best_model_f1_score}")

                                                                                

+-----+-----+
|label|count|
+-----+-----+
| 20.0|    2|
| 11.0|    5|
|  5.0|    7|
|  7.0|    8|
| 12.0|    3|
|  0.0|   41|
| 16.0|    1|
| 21.0|    1|
| 10.0|    2|
|  4.0|    6|
|  3.0|    3|
|  1.0|   14|
|  2.0|    7|
|  6.0|    7|
| 13.0|    5|
|  8.0|    7|
| 15.0|    2|
| 19.0|    5|
|  9.0|    6|
| 17.0|    2|
+-----+-----+



                                                                                

25/05/10 04:41:42 ERROR Instrumentation: java.lang.IllegalArgumentException: requirement failed: LinearSVC only supports binary classification. 1 classes detected in LinearSVC_d9890a24295e__labelCol
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.classification.LinearSVC.$anonfun$train$1(LinearSVC.scala:212)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LinearSVC.train(LinearSVC.scala:171)
	at org.apache.spark.ml.classification.LinearSVC.train(LinearSVC.scala:76)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at sun.reflect.GeneratedMethodAccessor85.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.

IllegalArgumentException: requirement failed: LinearSVC only supports binary classification. 1 classes detected in LinearSVC_d9890a24295e__labelCol

In [None]:
# ── 1.  Einmalige Feature-Extraktion  ───────────────────────────────────────
spark.sparkContext.setLogLevel("ERROR")   # WARN-Meldungen verschwinden
features_df = (
    model            # CountVectorizer + IDF + Chi² (fitted)
    .transform(df)         # ➜ neue Spalten: tf, tfidf, chi2_features
    .select("chi2_features", "label")        # + evtl. andere Meta-Spalten
    .cache()
)
print(features_df.count(), "Dokumente nach Feature-Extraktion")

# ── 2.  Light-Pipeline für den Klassifikator  ───────────────────────────────
from pyspark.ml import Pipeline
from pyspark.ml.feature import Normalizer, StringIndexer
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

indexer   = StringIndexer(inputCol="label",  outputCol="label_idx")
norm      = Normalizer(inputCol="chi2_features", outputCol="features", p=2.0)
svm_bin   = LinearSVC(featuresCol="features", labelCol="label_idx", maxIter=10)
ovr       = OneVsRest(classifier=svm_bin, featuresCol="features", labelCol="label_idx")

pipe3 = Pipeline(stages=[indexer, norm, ovr])

param_grid = (
    ParamGridBuilder()
      .addGrid(svm_bin.regParam,        [0.01])
      .addGrid(svm_bin.standardization, [True])
      .addGrid(svm_bin.maxIter,         [50, 200])
      .build()
)

tvs = TrainValidationSplit(
    estimator=pipe3,
    estimatorParamMaps=param_grid,
    evaluator=MulticlassClassificationEvaluator(labelCol="label_idx", metricName="f1"),
    trainRatio=0.8,
    seed=42,
    parallelism=4,
)

train_df, test_df = features_df.randomSplit([0.8, 0.2], seed=42)
best_model = tvs.fit(train_df).bestModel
print("F1 on test:",
      MulticlassClassificationEvaluator(labelCol="label_idx", metricName="f1")
      .evaluate(best_model.transform(test_df)))
