Skip to content

Commit

Permalink
- add minimal unit tests
Browse files Browse the repository at this point in the history
 - add benchmark results
  • Loading branch information
RayanRal committed Jun 3, 2023
1 parent ad0487e commit 17afcce
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 13 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,11 @@
# indexVerse
Simple inverted index generator in Scala


## Benchmarking results

JMH average time per indexing run (`avgt` mode, seconds per operation) for each number of indexer threads:

```
[info] Benchmark (numThreads) Mode Cnt Score Error Units
[info] BenchmarkIndex.testIndexing 1 avgt 6 4.300 ± 0.760 s/op
[info] BenchmarkIndex.testIndexing 2 avgt 6 2.177 ± 0.176 s/op
[info] BenchmarkIndex.testIndexing 4 avgt 6 1.249 ± 0.058 s/op
[info] BenchmarkIndex.testIndexing 8 avgt 6 0.693 ± 0.019 s/op
```
3 changes: 3 additions & 0 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ libraryDependencies ++= Seq(
"org.apache.logging.log4j" % "log4j-core" % "2.20.0" % Runtime
)

// benchmarking
enablePlugins(JmhPlugin)

// unit testing: scoped with the typed `Test` configuration, matching the `Runtime` scope used above
libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.15" % Test
1 change: 1 addition & 0 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// sbt-jmh plugin; provides the JmhPlugin that build.sbt enables for benchmarking
addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.3")
9 changes: 5 additions & 4 deletions src/main/scala/com/gmail/rayanral/IndexGenerationRunner.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.gmail.rayanral

import com.gmail.rayanral.index.IndexGenerator
import com.gmail.rayanral.index.model.InvertedIndex
import com.gmail.rayanral.index.util.{Config, FileOps}
import org.apache.logging.log4j.scala.Logging

Expand All @@ -18,15 +19,16 @@ object IndexGenerationRunner extends Logging {
|Input extension: ${config.fileExtension}
|Output path: ${config.outputPath}
|""".stripMargin)
runIndexer(config.inputDir, config.fileExtension, config.numberOfIndexerThreads)
val index = runIndexer(config.inputDir, config.fileExtension, config.numberOfIndexerThreads)
logger.info(IndexDisplay.printTopWords(index))
case None =>
}
}

private def runIndexer(inputDir: String, extension: String, numThreads: Int): Unit = {
def runIndexer(inputDir: String, extension: String, numThreads: Int): InvertedIndex = {
val filesToIndex = FileOps.getFilesToIndex(inputDir, extension)
val groupSize = getGroupSize(filesToIndex, numThreads)
val index = filesToIndex
filesToIndex
.grouped(groupSize)
.toList
.par
Expand All @@ -37,7 +39,6 @@ object IndexGenerationRunner extends Logging {
.reduce { (i1, i2) =>
i1.mergeInPlace(i2)
}
logger.info(IndexDisplay.printTopWords(index))
}

private def getGroupSize(filesToIndex: List[String], numThreads: Int): Int = {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package com.gmail.rayanral.index.benchmark

import com.gmail.rayanral.IndexGenerationRunner
import com.gmail.rayanral.index.util.Config
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

/**
 * JMH benchmark measuring the average time of a full indexing run
 * for several indexer thread counts.
 *
 * Configured for 2 forks, each with 1 warmup iteration and 3 measurement iterations.
 */
@BenchmarkMode(Array(Mode.AverageTime))
@Fork(value = 2)
@Warmup(iterations = 1)
@Measurement(iterations = 3)
@State(Scope.Benchmark)
class BenchmarkIndex {

  /** Number of indexer threads for the current run; must be a `var` so JMH can set each `@Param` value. */
  @Param(Array("1", "2", "4", "8"))
  var numThreads = 0

  /**
   * Builds the index over the default input directory and extension.
   *
   * @param blackHole sink for the resulting index, so the work is not optimized away
   */
  @Benchmark
  def testIndexing(blackHole: Blackhole): Unit =
    blackHole.consume(
      IndexGenerationRunner.runIndexer(
        inputDir = Config.DEFAULT_INPUT_DIR,
        extension = Config.DEFAULT_EXTENSION,
        numThreads = numThreads
      )
    )

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,6 @@ class InvertedIndex(private val tokenIndex: mutable.Map[String, mutable.Set[Stri
.withDefaultValue(mutable.Set.empty[String])
)

/**
 * Adds every (token, filename) pair of `other` to this index, mutating this index.
 *
 * @param other index whose entries are merged into this one
 * @return this index, so merges can be chained (e.g. inside a `reduce`)
 */
def mergeInPlace(other: InvertedIndex): InvertedIndex = {
  // Merging an index into itself adds nothing; skipping it also avoids
  // mutating the map while it is being iterated.
  if (other ne this) {
    // Iterate with foreach instead of flatMap: flatMap-ing a Map into
    // (token, filename) tuples builds another Map, which keeps only one
    // filename per token and silently drops the rest.
    other.tokenIndex.foreach { case (token, filenames) =>
      filenames.foreach(filename => add(token, filename))
    }
  }
  this
}

def add(token: String, filename: String): Unit = {
if (token == null) return
tokenIndex.updateWith(token) {
Expand All @@ -40,4 +31,13 @@ class InvertedIndex(private val tokenIndex: mutable.Map[String, mutable.Set[Stri

/**
 * Looks up the files recorded for a token.
 *
 * @param token token to look up
 * @return immutable copy of the filenames stored for `token`, or an empty set if the token is absent
 */
def getFilesForToken(token: String): Set[String] =
  tokenIndex.get(token) match {
    case Some(filenames) => filenames.toSet
    case None            => Set.empty[String]
  }

/**
 * Adds every (token, filename) pair of `other` to this index, mutating this index.
 *
 * @param other index whose entries are merged into this one
 * @return this index, so merges can be chained (e.g. inside a `reduce`)
 */
def mergeInPlace(other: InvertedIndex): InvertedIndex = {
  // Merging an index into itself adds nothing; skipping it also avoids
  // mutating the map while it is being iterated.
  if (other ne this) {
    // Iterate with foreach instead of flatMap: flatMap-ing a Map into
    // (token, filename) tuples builds another Map, which keeps only one
    // filename per token and silently drops the rest.
    other.tokenIndex.foreach { case (token, filenames) =>
      filenames.foreach(filename => add(token, filename))
    }
  }
  this
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package com.gmail.rayanral.index.model

import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers._

/** Minimal unit tests for [[InvertedIndex]]. */
class InvertedIndexTest extends AnyFlatSpec {

  "invertedIndex" should "add a word" in {
    val index = new InvertedIndex()
    val wordToAdd = "word"
    index.add(wordToAdd, "file")
    index.getTopWords(1).map(_._1) should contain(wordToAdd)
  }

  it should "add several words with correct counts" in {
    val index = new InvertedIndex()
    val duplicate = "word"
    // The index stores a set of filenames per token, so adding the same
    // (token, file) pair twice would leave both words tied and make the
    // "top word" depend on tie-breaking. Distinct files remove the tie.
    // NOTE(review): assumes getTopWords ranks tokens by number of files — confirm.
    val occurrences = List(duplicate -> "file1", duplicate -> "file2", "anotherWord" -> "file1")
    occurrences.foreach { case (word, file) => index.add(word, file) }
    // headOption gives a readable failure instead of an exception if the result is empty
    index.getTopWords(1).headOption.map(_._1) shouldBe Some(duplicate)
  }

}

0 comments on commit 17afcce

Please sign in to comment.