## Init

In [1]:
// --- IMPORTS ---
import org.apache.spark.sql.SparkSession
import scala.util.parsing.json.JSON
import scala.util.matching.Regex
import org.apache.spark.sql.types._
import scala.reflect.io.File

Intitializing Scala interpreter ...

Spark Web UI available at http://lbdmg01.datalab.novalocal:9999/proxy/application_1745308556449_5876
SparkContext available as 'sc' (version = 3.3.4, master = yarn, app id = application_1745308556449_5876)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import scala.util.parsing.json.JSON
import scala.util.matching.Regex
import org.apache.spark.sql.types._
import scala.reflect.io.File


In [2]:
// Location of the stop-word list; every line of the file is treated as one entry.
val stopwordPath: String = "Exercise_1/stopwords.txt"
// Regex character class matching runs of token delimiters: brackets, punctuation,
// assorted symbols and the digits 0-9.
val DELIMS: String = "[()\\[\\]{}.!?,;:+=\\-_\"'`~#@&*%€§\\\\/0-9]+"
// Number of best-scoring terms kept for each category.
val TOP_K: Int = 75

stopwordPath: String = Exercise_1/stopwords.txt
DELIMS: String = [()\[\]{}.!?,;:+=\-_"'`~#@&*%€§\\/0-9]+
TOP_K: Int = 75


# Ver1

In [5]:
// Dev-set alternative, kept for quick runs:
//val inputPath = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
// Review corpus read by the Ver1 pipeline.
val inputPath: String = "hdfs:///user/dic25_shared/amazon-reviews/full/reviewscombined.json"
// File the Ver1 results are written to.
val outputPath: String = "output_rdd_full.txt"

inputPath: String = hdfs:///user/dic25_shared/amazon-reviews/full/reviewscombined.json
outputPath: String = output_rdd_full.txt


In [None]:
// Reference point for the total runtime printed at the end of the cell.
var start0 = System.nanoTime()

// --- LOAD STOPWORDS ---
// `start` is reset before each stage and read by that stage's timing println.
var start = System.nanoTime()
// Each line of the stop-word file becomes one set entry; the set is broadcast to the executors.
val stopwords = sc.textFile(stopwordPath).collect().toSet
val stopwordsBroadcast = sc.broadcast(stopwords)

// --- PARSE FILE ---
// The JSON structure is declared up front instead of being inferred, which speeds up parsing.
val reviewSchema = new StructType()
  .add("reviewerID", StringType)             // e.g. "A2SUAM1J3GNN3B"
  .add("asin", StringType)                   // product ID
  .add("reviewerName", StringType)
  .add("helpful", ArrayType(IntegerType))    // [a,b]
  .add("reviewText", StringType)             // full body
  .add("overall", DoubleType)                // rating 1-5 (float in source)
  .add("summary", StringType)                // review title
  .add("unixReviewTime", LongType)
  .add("reviewTime", StringType)
  .add("category", StringType)               // our label

val reviews = spark.read
  .schema(reviewSchema)
  .option("mode","DROPMALFORMED")            // skip corrupt lines
  .json(inputPath)
  .where($"category".isNotNull)              // rows without a label are unusable
  .select("category","reviewText","summary")
  .cache()
// `summary` is carried along here but is not tokenized further down; if it should count,
// it has to be merged into the review text.
val parsed = reviews.as[(String,String,String)].rdd

println(f"Parsing Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- TOKENIZER ---
start = System.nanoTime()

// Compiled once: String.replaceAll and String.split(regex) would otherwise recompile
// their regular expression on every single call, i.e. once per review.
val delimPattern = java.util.regex.Pattern.compile(DELIMS)
val whitespacePattern = java.util.regex.Pattern.compile("\\s+")

/** Splits a text into its set of distinct, lower-cased terms.
  *
  * The text is lower-cased, every run of DELIMS characters is replaced by a single space and
  * the result is split on whitespace. Tokens of length <= 1 and tokens contained in
  * `stopwords` are dropped.
  *
  * @param text      raw text; `null` yields an empty set
  * @param stopwords words to discard (compared against the lower-cased tokens)
  * @return the distinct remaining tokens
  */
def tokenize(text: String, stopwords: Set[String]): Set[String] =
  Option(text).fold(Set.empty[String]) { raw =>
    val cleaned = delimPattern.matcher(raw.toLowerCase).replaceAll(" ")
    whitespacePattern
      .split(cleaned)
      .filter(token => token.length > 1 && !stopwords.contains(token))
      .toSet
  }

// Pair each review's category with the distinct tokens of its body; the summary is ignored.
val tokenized = parsed.map { record =>
  val (category, text, _) = record
  (category, tokenize(text, stopwordsBroadcast.value))
}

// `map` is a lazy transformation, so this timer only covers defining it.
println(f"Tokenizer Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- COUNTING ---
println("Counting")
start = System.nanoTime()
// Document frequency per (token, category), plus one ("!DOC_COUNT", category) record per review
// so that the same shuffle also yields the number of reviews in each category. The marker
// cannot collide with a real token because '!' is one of the DELIMS characters.
//
// The result is cached: the document counts, the token totals and the chi-square scores are
// all derived from it, and without caching every one of those actions would re-read and
// re-tokenize the complete input.
val tokenCatAndDocStats = tokenized.flatMap {
  case (cat, tokens) =>
    // `tokens` is already a Set, so each token is counted at most once per review.
    tokens.iterator.map(token => ((token, cat), 1)) ++
      Iterator.single((("!DOC_COUNT", cat), 1))
}.reduceByKey(_ + _).cache()
println(f"   tokenCatAndDocStats: ${(System.nanoTime() - start) / 1e9}%.3f sec")

start = System.nanoTime()
// Reviews per category, read from the "!DOC_COUNT" marker records and collected to the driver.
val docCounts = tokenCatAndDocStats
  .collect { case (("!DOC_COUNT", cat), count) => (cat, count) }
  .collectAsMap()
// Overall number of reviews, i.e. the sum over all categories.
val totalDocs = docCounts.valuesIterator.sum
val docCountsBroadcast = sc.broadcast(docCounts)
val totalDocsBroadcast = sc.broadcast(totalDocs)
println(f"   docCounts: ${(System.nanoTime() - start) / 1e9}%.3f sec")

start = System.nanoTime()
// All records except the per-category markers, i.e. the real (token, category) counts.
val tokenCatCounts = tokenCatAndDocStats
  .filter { case ((token, _), _) => token != "!DOC_COUNT" }
println(f"   tokenCatCounts: ${(System.nanoTime() - start) / 1e9}%.3f sec")

start = System.nanoTime()
// Document frequency of every token summed over all categories, collected to the driver
// and broadcast for the chi-square step.
val tokenTotals = tokenCatCounts
  .map(entry => (entry._1._1, entry._2))
  .reduceByKey(_ + _)
  .collectAsMap()
val tokenTotalsBroadcast = sc.broadcast(tokenTotals)
println(f"   tokenTotals: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- CHI-SQUARE CALCULATION ---
start = System.nanoTime()
// Total number of reviews over all categories.
val N = totalDocsBroadcast.value.toDouble
// chi^2(t, c) = N * (A*D - B*C)^2 / ((A+B) * (A+C) * (B+D) * (C+D)) with the contingency counts
//   A: reviews in c containing t         B: reviews outside c containing t
//   C: reviews in c NOT containing t     D: reviews outside c NOT containing t
val chi2Scores = tokenCatCounts.map {
  case ((token, cat), aCount) =>
    val A = aCount.toDouble
    val tokenDocs = tokenTotalsBroadcast.value.getOrElse(token, 0).toDouble // reviews containing t
    val catDocs = docCountsBroadcast.value.getOrElse(cat, 0).toDouble       // reviews in c
    val B = tokenDocs - A
    // docCounts holds ALL reviews of the category, so the ones containing the token have to be
    // subtracted to obtain C. Using the category total directly skews both C and D.
    val C = catDocs - A
    val D = N - A - B - C
    val denom = (A + B) * (C + D) * (A + C) * (B + D)
    val cross = A * D - B * C
    // A zero denominator means the token or the category covers none or all reviews.
    val chi2 = if (denom == 0) 0.0 else N * (cross * cross) / denom
    (cat, (token, chi2))
}
println(f"Chi-Square Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- PREPARE OUTPUT
start = System.nanoTime()
// Top TOP_K terms per category, ordered by descending chi-square score.
// Cached because two actions consume it (the vocabulary merge right below and the export);
// uncached, the second action would recompute the whole pipeline up to this point.
val topTokensPerCategory = chi2Scores
  .groupByKey()
  .mapValues(scored => scored.toSeq.sortBy(-_._2).take(TOP_K))
  .cache()

// Merged vocabulary: union of the selected terms of all categories, sorted on the driver.
val mergedVocab = topTokensPerCategory.flatMap(_._2.map(_._1)).distinct().collect().sorted

println(f"Prepare Output: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- EXPORT ---
start = System.nanoTime()
// One line per category: "<category> term:chi2 term:chi2 ..."
val output = topTokensPerCategory.map { entry =>
  val scoredTerms = entry._2.map(scored => s"${scored._1}:${scored._2}")
  s"<${entry._1}> ${scoredTerms.mkString(" ")}"
}

// Write the category lines first (creating or overwriting the file) ...
val file = File(outputPath)
val categoryLines = output.collect().mkString("\n")
file.writeAll(categoryLines)

// ... then add the space-separated, sorted vocabulary as the last line.
val vocabLine = "\n" + mergedVocab.mkString(" ")
file.appendAll(vocabLine)

println(f"Export Output: ${(System.nanoTime() - start) / 1e9}%.3f sec")

println(f"Total Runtime: ${(System.nanoTime() - start0) / 1e9}%.3f sec")

Parsing Time: 11.764 sec
Tokenizer Time: 0.764 sec
Counting
   tokenCatAndDocStats: 0.097 sec
   docCounts: 3886.985 sec
   tokenCatCounts: 0.011 sec


Timings on the dev set (`reviews_devset.json`):

Parsing Time: 0.965 sec
Tokenizer Time: 0.039 sec
Counting
   tokenCatAndDocStats: 0.067 sec
   docCounts: 10.896 sec
   tokenCatCounts: 0.010 sec
   tokenTotals: 2.542 sec
Chi-Square Time: 0.020 sec
Prepare Output: 1.966 sec
Export Output: 0.671 sec
Total Runtime: 17.180 sec

# Ver2

In [4]:
// Dev-set alternative, kept for quick runs:
//val inputPath = "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
// Review corpus read by the Ver2 pipeline.
val inputPath: String = "hdfs:///user/dic25_shared/amazon-reviews/full/reviewscombined.json"
// File the Ver2 results are written to.
val outputPath: String = "output_rdd2_full.txt"

inputPath: String = hdfs:///user/dic25_shared/amazon-reviews/full/reviewscombined.json
outputPath: String = output_rdd2_full.txt


In [6]:
// Reference point for the total runtime printed at the end of the cell.
var start0 = System.nanoTime()

// --- LOAD STOPWORDS ---
// `start` is reset before each stage and read by that stage's timing println.
var start = System.nanoTime()
// Each line of the stop-word file becomes one set entry; the set is broadcast to the executors.
val stopwords = sc.textFile(stopwordPath).collect().toSet
val stopwordsBroadcast = sc.broadcast(stopwords)

// --- PARSE FILE ---
// The JSON structure is declared up front instead of being inferred, which speeds up parsing.
val reviewSchema = StructType(Seq(
  StructField("reviewerID"    , StringType),     // e.g. "A2SUAM1J3GNN3B"
  StructField("asin"          , StringType),     // product ID
  StructField("reviewerName"  , StringType),
  StructField("helpful"       , ArrayType(IntegerType)), // [a,b]
  StructField("reviewText"    , StringType),     // full body
  StructField("overall"       , DoubleType),     // rating 1-5 (float in source)
  StructField("summary"       , StringType),     // review title
  StructField("unixReviewTime", LongType),
  StructField("reviewTime"    , StringType),
  StructField("category"      , StringType)      // our label
))
// Not cached: this DataFrame is scanned exactly once, because the aggregated
// tokenCatAndDocStats RDD built from it is itself cached. Caching here would only
// spend executor memory and time on a copy that is never read again.
val reviews = spark.read
  .schema(reviewSchema)
  .option("mode","DROPMALFORMED")       // skip corrupt lines
  .json(inputPath)
  .filter($"category".isNotNull)
  // .select($"category", concat_ws(" ", $"reviewText", $"summary").as("text")) // takes too long even on devset
  .select($"category", $"reviewText")  // only reviewText
val parsed = reviews.as[(String,String)].rdd
println(f"Parsing Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- TOKENIZER ---
start = System.nanoTime()

// Compiled once: String.replaceAll and String.split(regex) would otherwise recompile
// their regular expression on every single call, i.e. once per review.
val delimPattern = java.util.regex.Pattern.compile(DELIMS)
val whitespacePattern = java.util.regex.Pattern.compile("\\s+")

/** Splits a text into its set of distinct, lower-cased terms.
  *
  * The text is lower-cased, every run of DELIMS characters is replaced by a single space and
  * the result is split on whitespace. Tokens of length <= 1 and tokens contained in
  * `stopwords` are dropped.
  *
  * @param text      raw text; `null` yields an empty set
  * @param stopwords words to discard (compared against the lower-cased tokens)
  * @return the distinct remaining tokens
  */
def tokenize(text: String, stopwords: Set[String]): Set[String] = {
  if (text == null) Set.empty
  else {
    val cleaned = delimPattern.matcher(text.toLowerCase).replaceAll(" ")
    whitespacePattern
      .split(cleaned)
      .filter(t => t.length > 1 && !stopwords.contains(t))
      .toSet
  }
}
// Pair each review's category with the distinct tokens of its body.
// Deliberately not cached: the only consumer is the tokenCatAndDocStats aggregation, which is
// cached itself, so keeping a token set for every single review in memory buys nothing.
val tokenized = parsed.map {
  case (category, text) =>
    val tokens = tokenize(text, stopwordsBroadcast.value)
    (category, tokens)
}
println(f"Tokenizer Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- COUNTING ---
start = System.nanoTime()
println("Counting")
// Document frequency per (token, category); each review additionally contributes one
// ("!DOC_COUNT", category) record so the reviews per category come out of the same reduce.
val tokenCatAndDocStats = tokenized.flatMap { doc =>
  val (cat, tokens) = doc
  // `tokens` is a Set already, so every token appears once per review.
  val perToken = tokens.toSeq.map(token => ((token, cat), 1))
  val docMarker = (("!DOC_COUNT", cat), 1)
  perToken :+ docMarker
}.reduceByKey(_ + _).cache()
println(f"   tokenCatAndDocStats: ${(System.nanoTime() - start) / 1e9}%.3f sec")

start = System.nanoTime()
// Reviews per category, read from the "!DOC_COUNT" marker records.
// tokenCatAndDocStats is already reduced by (token, category), so every category occurs
// exactly once after the filter; a second reduceByKey would only add a shuffle.
val docCounts = tokenCatAndDocStats
  .filter { case ((key, _), _) => key == "!DOC_COUNT" }
  .map { case ((_, cat), count) => (cat, count) }
  .collectAsMap()
println(f"   docCounts: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// Overall number of reviews, i.e. the sum over all categories.
val totalDocs = docCounts.values.sum
val docCountsBroadcast = sc.broadcast(docCounts)
val totalDocsBroadcast = sc.broadcast(totalDocs)

start = System.nanoTime()
// All records except the per-category markers, i.e. the real (token, category) counts.
val tokenCatCounts = tokenCatAndDocStats
  .filter { case ((token, _), _) => token != "!DOC_COUNT" }
  .cache()
println(f"   tokenCatCounts: ${(System.nanoTime() - start) / 1e9}%.3f sec")

start = System.nanoTime()
// Document frequency of every token summed over all categories. The pairs are collected as
// an array and turned into an immutable Map on the driver before being broadcast.
val tokenTotals = tokenCatCounts
  .map(entry => (entry._1._1, entry._2))
  .reduceByKey(_ + _)
  .collect()
val tokenTotalsMap = Map(tokenTotals: _*)
val tokenTotalsBroadcast = sc.broadcast(tokenTotalsMap)
println(f"   tokenTotalsBroadcast: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- CHI-SQUARE CALCULATION ---
start = System.nanoTime()
// Total number of reviews over all categories.
val N = totalDocsBroadcast.value.toDouble
// chi^2(t, c) = N * (A*D - B*C)^2 / ((A+B) * (A+C) * (B+D) * (C+D)) with the contingency counts
//   A: reviews in c containing t         B: reviews outside c containing t
//   C: reviews in c NOT containing t     D: reviews outside c NOT containing t
// Not cached: the scores are consumed once, by the top-K selection.
val chi2Scores = tokenCatCounts.map {
  case ((token, cat), aCount) =>
    val A = aCount.toDouble
    val tokenDocs = tokenTotalsBroadcast.value.getOrElse(token, 0).toDouble // reviews containing t
    val catDocs = docCountsBroadcast.value.getOrElse(cat, 0).toDouble       // reviews in c
    val B = tokenDocs - A
    // docCounts holds ALL reviews of the category, so the ones containing the token have to be
    // subtracted to obtain C. Using the category total directly skews both C and D.
    val C = catDocs - A
    val D = N - A - B - C
    val denom = (A + B) * (C + D) * (A + C) * (B + D)
    val cross = A * D - B * C
    // A zero denominator means the token or the category covers none or all reviews.
    val chi2 = if (denom == 0) 0.0 else N * (cross * cross) / denom
    (cat, (token, chi2))
}
println(f"Chi-Square Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

// --- TOP K TERMS PER CATEGORY ---
start = System.nanoTime()
// No repartition before grouping: groupByKey shuffles the records by category anyway, so a
// preceding repartition(200) only adds a second full shuffle without changing the result.
// The name is kept as a plain alias in case it is referenced elsewhere.
val partitioned = chi2Scores
// Top TOP_K terms per category by descending chi-square score; cached because both the
// vocabulary merge and the export read it.
val topTokensPerCategory = partitioned
  .groupByKey()
  .mapValues(iter => iter.toSeq.sortBy(-_._2).take(TOP_K))
  .cache()
println(f"Top-K Selection Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

start = System.nanoTime()
// Merged vocabulary: the selected terms of all categories, de-duplicated and then sorted
// on the driver.
val mergedVocab = topTokensPerCategory
  .flatMap { case (_, terms) => terms.map { case (term, _) => term } }
  .distinct()
  .collect()
  .sorted
println(f"Merge Vocab Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

start = System.nanoTime()
// One line per category: "<category> term:chi2 term:chi2 ..."
val output = topTokensPerCategory.map { entry =>
  val scoredTerms = entry._2.map(scored => s"${scored._1}:${scored._2}")
  s"<${entry._1}> ${scoredTerms.mkString(" ")}"
}

// Write the category lines first (creating or overwriting the file) ...
val file = File(outputPath)
val categoryLines = output.collect().mkString("\n")
file.writeAll(categoryLines)

// ... then add the space-separated, sorted vocabulary as the last line.
val vocabLine = "\n" + mergedVocab.mkString(" ")
file.appendAll(vocabLine)

println(f"Export Time: ${(System.nanoTime() - start) / 1e9}%.3f sec")

println(f"Total Runtime: ${(System.nanoTime() - start0) / 1e9}%.3f sec")

Parsing Time: 0.667 sec
Tokenizer Time: 0.041 sec
Counting
   tokenCatAndDocStats: 0.027 sec
   docCounts: 7.513 sec
   tokenCatCounts: 0.013 sec
   tokenTotalsBroadcast: 1.638 sec
Chi-Square Time: 0.012 sec
Top-K Selection Time: 0.079 sec
Merge Vocab Time: 10.756 sec
Export Time: 0.597 sec
Total Runtime: 21.361 sec


start0: Long = 12238132894274467
start: Long = 12238153658092396
stopwords: scala.collection.immutable.Set[String] = Set(serious, latterly, absorbs, looks, particularly, used, e, printer, down, regarding, entirely, regardless, moreover, please, read, ourselves, able, behind, for, despite, s, maybe, viz, further, corresponding, x, any, wherein, across, name, allows, this, instead, in, taste, ought, myself, have, your, off, once, are, is, mon, his, oh, why, rd, knows, bulbs, too, among, course, greetings, somewhat, bibs, everyone, seen, likely, said, try, already, soon, nobody, got, given, song, using, less, am, consider, hence, than, n, accordingly, four, anyhow, want, three, forth, whereby, himself, specify, yes, throughout, inasmuch, but, whether, sure, below, aren, co, best, plus, bec...


Timings on the dev set (`reviews_devset.json`):

Parsing Time: 0.667 sec
Tokenizer Time: 0.041 sec
Counting
   tokenCatAndDocStats: 0.027 sec
   docCounts: 7.513 sec
   tokenCatCounts: 0.013 sec
   tokenTotalsBroadcast: 1.638 sec
Chi-Square Time: 0.012 sec
Top-K Selection Time: 0.079 sec
Merge Vocab Time: 10.756 sec
Export Time: 0.597 sec
Total Runtime: 21.361 sec