From 17afcceab7a9fa1a78e2895fc847c8fec054c2f8 Mon Sep 17 00:00:00 2001 From: leonidchashnikov Date: Sat, 3 Jun 2023 18:30:15 +0100 Subject: [PATCH] - add minimal unit tests - add benchmark results --- README.md | 9 ++++++ build.sbt | 3 ++ project/plugins.sbt | 1 + .../rayanral/IndexGenerationRunner.scala | 9 +++--- .../index/benchmark/BenchmarkIndex.scala | 28 +++++++++++++++++++ .../rayanral/index/model/InvertedIndex.scala | 18 ++++++------ .../index/model/InvertedIndexTest.scala | 23 +++++++++++++++ 7 files changed, 78 insertions(+), 13 deletions(-) create mode 100644 project/plugins.sbt create mode 100644 src/main/scala/com/gmail/rayanral/index/benchmark/BenchmarkIndex.scala create mode 100644 src/test/scala/com/gmail/rayanral/index/model/InvertedIndexTest.scala diff --git a/README.md b/README.md index 74be0b8..1210750 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,11 @@ # indexVerse Simple inverted index generator in Scala + + +## Benchmarking results + +[info] Benchmark (numThreads) Mode Cnt Score Error Units +[info] BenchmarkIndex.testIndexing 1 avgt 6 4.300 ± 0.760 s/op +[info] BenchmarkIndex.testIndexing 2 avgt 6 2.177 ± 0.176 s/op +[info] BenchmarkIndex.testIndexing 4 avgt 6 1.249 ± 0.058 s/op +[info] BenchmarkIndex.testIndexing 8 avgt 6 0.693 ± 0.019 s/op diff --git a/build.sbt b/build.sbt index aa65c77..14b637b 100644 --- a/build.sbt +++ b/build.sbt @@ -23,4 +23,7 @@ libraryDependencies ++= Seq( "org.apache.logging.log4j" % "log4j-core" % "2.20.0" % Runtime ) +// benchmarking +enablePlugins(JmhPlugin) + libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.15" % "test" \ No newline at end of file diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 0000000..7e06085 --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1 @@ +addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.3") diff --git a/src/main/scala/com/gmail/rayanral/IndexGenerationRunner.scala b/src/main/scala/com/gmail/rayanral/IndexGenerationRunner.scala index 3e1e9b3..2605f80 100644 --- a/src/main/scala/com/gmail/rayanral/IndexGenerationRunner.scala +++ b/src/main/scala/com/gmail/rayanral/IndexGenerationRunner.scala @@ -1,6 +1,7 @@ package com.gmail.rayanral import com.gmail.rayanral.index.IndexGenerator +import com.gmail.rayanral.index.model.InvertedIndex import com.gmail.rayanral.index.util.{Config, FileOps} import org.apache.logging.log4j.scala.Logging @@ -18,15 +19,16 @@ object IndexGenerationRunner extends Logging { |Input extension: ${config.fileExtension} |Output path: ${config.outputPath} |""".stripMargin) - runIndexer(config.inputDir, config.fileExtension, config.numberOfIndexerThreads) + val index = runIndexer(config.inputDir, config.fileExtension, config.numberOfIndexerThreads) + logger.info(IndexDisplay.printTopWords(index)) case None => } } - private def runIndexer(inputDir: String, extension: String, numThreads: Int): Unit = { + def runIndexer(inputDir: String, extension: String, numThreads: Int): InvertedIndex = { val filesToIndex = FileOps.getFilesToIndex(inputDir, extension) val groupSize = getGroupSize(filesToIndex, numThreads) - val index = filesToIndex + filesToIndex .grouped(groupSize) .toList .par @@ -37,7 +39,6 @@ object IndexGenerationRunner extends Logging { .reduce { (i1, i2) => i1.mergeInPlace(i2) } - logger.info(IndexDisplay.printTopWords(index)) } private def getGroupSize(filesToIndex: List[String], numThreads: Int): Int = { diff --git a/src/main/scala/com/gmail/rayanral/index/benchmark/BenchmarkIndex.scala b/src/main/scala/com/gmail/rayanral/index/benchmark/BenchmarkIndex.scala new file mode 100644 index 0000000..be7961c --- /dev/null +++ b/src/main/scala/com/gmail/rayanral/index/benchmark/BenchmarkIndex.scala @@ -0,0 +1,28 @@ +package com.gmail.rayanral.index.benchmark + +import com.gmail.rayanral.IndexGenerationRunner +import com.gmail.rayanral.index.util.Config +import org.openjdk.jmh.annotations._ +import org.openjdk.jmh.infra.Blackhole + +@BenchmarkMode(Array(Mode.AverageTime)) +@Fork(value = 2) +@Warmup(iterations = 1) +@Measurement(iterations = 3) +@State(Scope.Benchmark) +class BenchmarkIndex { + + @Param(Array("1", "2", "4", "8")) + var numThreads = 0 + + @Benchmark + def testIndexing(blackHole: Blackhole): Unit = { + val index = IndexGenerationRunner.runIndexer( + inputDir = Config.DEFAULT_INPUT_DIR, + extension = Config.DEFAULT_EXTENSION, + numThreads = numThreads + ) + blackHole.consume(index) + } + +} diff --git a/src/main/scala/com/gmail/rayanral/index/model/InvertedIndex.scala b/src/main/scala/com/gmail/rayanral/index/model/InvertedIndex.scala index b6562ea..eac9766 100644 --- a/src/main/scala/com/gmail/rayanral/index/model/InvertedIndex.scala +++ b/src/main/scala/com/gmail/rayanral/index/model/InvertedIndex.scala @@ -10,15 +10,6 @@ class InvertedIndex(private val tokenIndex: mutable.Map[String, mutable.Set[Stri .withDefaultValue(mutable.Set.empty[String]) ) - def mergeInPlace(other: InvertedIndex): InvertedIndex = { - other.tokenIndex.flatMap { case (k, set) => - set.map(v => (k, v)) - }.foreach { case (token, filename) => - add(token, filename) - } - this - } - def add(token: String, filename: String): Unit = { if (token == null) return tokenIndex.updateWith(token) { @@ -40,4 +31,13 @@ class InvertedIndex(private val tokenIndex: mutable.Map[String, mutable.Set[Stri def getFilesForToken(token: String): Set[String] = tokenIndex.getOrElse(token, Set.empty[String]).toSet + def mergeInPlace(other: InvertedIndex): InvertedIndex = { + other.tokenIndex.flatMap { case (k, set) => + set.map(v => (k, v)) + }.foreach { case (token, filename) => + add(token, filename) + } + this + } + } diff --git a/src/test/scala/com/gmail/rayanral/index/model/InvertedIndexTest.scala b/src/test/scala/com/gmail/rayanral/index/model/InvertedIndexTest.scala new file mode 100644 index 0000000..999abc9 --- /dev/null +++ b/src/test/scala/com/gmail/rayanral/index/model/InvertedIndexTest.scala @@ -0,0 +1,23 @@ +package com.gmail.rayanral.index.model + +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers._ + +class InvertedIndexTest extends AnyFlatSpec { + + "invertedIndex" should "add a word" in { + val index = new InvertedIndex() + val wordToAdd = "word" + index.add(wordToAdd, "file") + index.getTopWords(1).map(_._1) should contain(wordToAdd) + } + + "invertedIndex" should "add several words with correct counts" in { + val index = new InvertedIndex() + val duplicate = "word" + val wordsToAdd = List(duplicate, duplicate, "anotherWord") + wordsToAdd.foreach(index.add(_, "file")) + index.getTopWords(1).head._1 shouldBe duplicate + } + +}