In [91]:
@file:DependsOn("com.fasterxml.jackson.core:jackson-databind:2.15.2")
import java.io.*
import com.fasterxml.jackson.databind.*


In [141]:
/**
 * Reads every line of `../data/<folder>/<fileName>`, decoding the file as UTF-8.
 *
 * @param fileName name of the file inside the chosen data sub-folder.
 * @param folder sub-folder of `../data` to read from; defaults to `"raw"`. A `null`
 *   value falls back to the same default instead of being interpolated into the
 *   path as the literal text "null".
 * @return the lines of the file, in order.
 */
fun readLines(fileName: String, folder: String? = "raw"): List<String> {
    val subFolder = folder ?: "raw"
    return File("../data/$subFolder/$fileName").readLines(Charsets.UTF_8)
}

In [142]:
/**
 * Writes [sentences] to `../data/formatted/<file>`, one entry per line, replacing
 * any existing content of that file.
 *
 * The output is encoded explicitly as UTF-8 so that it can be read back by
 * [readLines], which decodes UTF-8. Relying on the platform default charset would
 * corrupt non-ASCII characters on systems whose default is not UTF-8.
 *
 * @param sentences lines to write; an empty string produces an empty line.
 * @param file name of the output file inside `../data/formatted`.
 */
fun sentsToFile(sentences: List<String>, file: String) {
    File("../data/formatted/$file").bufferedWriter(Charsets.UTF_8).use { writer ->
        sentences.forEach { sentence ->
            writer.write(sentence)
            // Keep the platform line separator, as before.
            writer.write(System.lineSeparator())
        }
    }
}

In [143]:
/**
 * Extracts the raw sentence text from a CoNLL-U file read via [readLines].
 *
 * Only two kinds of comment lines are used:
 *  - a line starting with `# newdoc` becomes an empty string (document separator);
 *  - a line starting with `# text = ` becomes the text after that prefix.
 *
 * All other lines are ignored. Lines that merely start with `# text` but not with
 * `# text = ` (for example a `# text_en = ...` comment) are skipped rather than
 * passed through unmodified, and only the leading prefix is stripped, so a
 * sentence that itself contains `# text = ` is left intact.
 *
 * @param fileName name of the CoNLL-U file, resolved by [readLines] with its default folder.
 * @return sentences in file order, with an empty string between documents and no
 *   leading separator.
 */
fun readAndMapConllu(fileName: String): List<String> {
    val textPrefix = "# text = "
    val sentences = readLines(fileName).mapNotNull { line ->
        when {
            line.startsWith("# newdoc") -> ""
            line.startsWith(textPrefix) -> line.removePrefix(textPrefix)
            else -> null
        }
    }
    // Remove the separator produced by an initial "# newdoc" line, but never drop
    // a real sentence when the file does not start with one.
    return if (sentences.firstOrNull()?.isEmpty() == true) sentences.drop(1) else sentences
}


In [144]:
// Jackson mapper shared by all calls that parse JSONL records.
val mapper = ObjectMapper()

/**
 * Converts a JSONL file (read via [readLines]) into plain-text sentences.
 *
 * Each input line is parsed as JSON; the elements of its `tokens` field are
 * rendered as text and joined with single spaces. The joined string is then
 * cleaned up:
 *  - whitespace directly before one of `, . ? ! : ;` is removed;
 *  - padding just inside a `( ... )` pair is removed;
 *  - padding just inside a `" ... "` pair is removed;
 *  - a sentence still starting with `" ` has its first space removed.
 *
 * Finally, empty strings are inserted into the list as separators (see the note
 * on the loop below).
 *
 * @param fileName name of the JSONL file, resolved by [readLines] with its default folder.
 * @return the cleaned sentences with empty-string separators inserted.
 */
fun readAndMapNer(fileName: String): List<String> {
    // Compiled once per call instead of once per input line.
    val spaceBeforePunctuation = "\\s[,.?!:;]".toRegex()
    // The reluctant `.*?` keeps every match within one opener/closer pair. A greedy
    // `.*` would span from the first opener to the last closer on the line and
    // leave the padding of all inner pairs untouched.
    val paddedParentheses = "(\\()(\\s.*?\\s)(\\))".toRegex()
    val paddedQuotes = "(\")(\\s.*?\\s)(\")".toRegex()

    val sentences: MutableList<String> = readLines(fileName).map { line ->
        val tokens = mapper.readTree(line).path("tokens")
        val sent = tokens.elements().asSequence()
            .map { it.asText() }
            .joinToString(" ")
            .replace(spaceBeforePunctuation) { it.value.trimStart() }
            .replace(paddedParentheses) {
                it.groupValues[1] + it.groupValues[2].trim() + it.groupValues[3]
            }
            .replace(paddedQuotes) {
                it.groupValues[1] + it.groupValues[2].trim() + it.groupValues[3]
            }

        // An opening quote without a matching padded closer is not handled by
        // the regex above, so strip the space after it here.
        if (sent.startsWith("\" ")) sent.replaceFirst(" ", "") else sent
    }.toMutableList()

    // Walk backwards from the end in steps of 10 and insert an empty string at
    // every visited index that is > 9 and < size. Going backwards means earlier
    // insert positions are not shifted by later inserts.
    // NOTE(review): the groups of 10 are therefore counted from the END of the
    // list, so the first group can hold 10-19 sentences - confirm this is intended.
    for (i in sentences.size downTo 0 step 10) {
        if (i > 9 && i < sentences.size) {
            sentences.add(i, "")
        }
    }

    return sentences
}


In [85]:
// val tbTrain = readAndMapConllu("sv_talbanken-ud-train.conllu")
//sentsToFile(tbTrain, "sv_talbanken-se-sent.train.tmp")
// val tbTest = readAndMapConllu("sv_talbanken-ud-test.conllu")
//sentsToFile(tbTest, "sv_talbanken-se-sent.eval")


In [138]:
// Convert the train JSONL file to sentences and write them out right away.
val sucTrain = readAndMapNer("suc-sprakbanken-cased-train.jsonl")
    .also { sentsToFile(it, "suc-sprakbanken-cased.train") }

// Same for the test JSONL file, written as the eval set.
val sucTest = readAndMapNer("suc-sprakbanken-cased-test.jsonl")
    .also { sentsToFile(it, "suc-sprakbanken-cased.eval") }



In [145]:
/**
 * Concatenates the lines of several files from the `formatted` data folder.
 *
 * @param files file names, each read via [readLines] with folder `"formatted"`.
 * @return all lines of all files, in argument order.
 */
fun mergeFiles(vararg files: String): List<String> {
    val merged = mutableListOf<String>()
    for (name in files) {
        merged += readLines(name, "formatted")
    }
    return merged
}


In [146]:
// Concatenate both formatted train files and write the combined train set.
val mergedTrain = mergeFiles(
    "sv_talbanken-se-sent.train",
    "suc-sprakbanken-cased.train",
).also { sentsToFile(it, "tb-plus-suc-se-sent.train") }


In [147]:
// Concatenate both formatted eval files and write the combined eval set.
val mergedTest = mergeFiles(
    "sv_talbanken-se-sent.eval",
    "suc-sprakbanken-cased.eval",
).also { sentsToFile(it, "tb-plus-suc-se-sent.eval") }
