Skip to content

Commit 59adcaf

Browse files
committed
Try to reimplement BPE in Kotlin instead, bump to v0.4.0
1 parent 9299fab commit 59adcaf

File tree

7 files changed

+478
-462
lines changed

7 files changed

+478
-462
lines changed

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
unitytranslate_version = 0.3.4
1+
unitytranslate_version = 0.4.0
22

33
# This isn't actually used in the download process; however, it is used for caching the downloaded files.
44
# https://github.com/OpenNMT/CTranslate2/blob/master/python/ctranslate2/version.py

library/build.gradle.kts

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
plugins {
2-
kotlin("jvm") version "2.1.0"
3-
kotlin("plugin.serialization") version "2.1.0"
2+
kotlin("jvm") version "2.2.0"
3+
kotlin("plugin.serialization") version "2.2.0"
44
}
55

66
repositories {
@@ -9,12 +9,14 @@ repositories {
99

1010
dependencies {
1111
testImplementation(kotlin("test"))
12-
api("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.1")
13-
api("org.jetbrains.kotlinx:kotlinx-coroutines-core-jvm:1.10.1")
14-
api("org.jetbrains.kotlinx:kotlinx-coroutines-jdk8:1.10.1")
15-
api("org.jetbrains.kotlinx:kotlinx-serialization-json:1.8.0-RC")
12+
api("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2")
13+
api("org.jetbrains.kotlinx:kotlinx-coroutines-core-jvm:1.10.2")
14+
api("org.jetbrains.kotlinx:kotlinx-coroutines-jdk8:1.10.2")
15+
api("org.jetbrains.kotlinx:kotlinx-serialization-json:1.8.1")
1616
api("org.slf4j:slf4j-api:2.0.16")
1717

18+
api("com.mayakapps.kache:kache:2.1.1")
19+
1820
testRuntimeOnly("org.slf4j:slf4j-simple:2.0.16")
1921
}
2022

library/src/main/kotlin/xyz/bluspring/unitytranslate/library/UnityTranslateLib.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ class UnityTranslateLib(val path: Path) {
5555
}
5656

5757
@ApiStatus.Internal
58-
external fun loadModel(modelPath: String, spModelPath: String?, bpeModelPath: String?, useCuda: Boolean): Long
58+
external fun loadModel(toLang: String, modelPath: String, spModelPath: String?, bpeModelPath: String?, useCuda: Boolean): Long
5959

6060
@ApiStatus.Internal
6161
external fun batchTranslate(modelPtr: Long, textArray: Array<String>): Array<String>

library/src/main/kotlin/xyz/bluspring/unitytranslate/library/models/ModelPackageManager.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class ModelPackageManager(val library: UnityTranslateLib) {
7070
}
7171

7272
infos.toList().asFlow().concurrent().collect { (pkg, modelInfo) ->
73-
val modelPtr = library.loadModel(modelInfo.modelPath.absolutePathString(), modelInfo.spModelPath?.absolutePathString(), modelInfo.bpeModelPath?.absolutePathString(), useCuda)
73+
val modelPtr = library.loadModel(modelInfo.code.split("_")[1], modelInfo.modelPath.absolutePathString(), modelInfo.spModelPath?.absolutePathString(), modelInfo.bpeModelPath?.absolutePathString(), useCuda)
7474

7575
if (modelPtr != 0L) {
7676
loadedModelPtrs[pkg.code] = modelPtr

library/src/main/kotlin/xyz/bluspring/unitytranslate/library/util/BPETokenizer.kt

Lines changed: 387 additions & 0 deletions
Large diffs are not rendered by default.

native/src/bpe.rs

Lines changed: 74 additions & 448 deletions
Large diffs are not rendered by default.

native/src/lib.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@ use crate::bpe::BPETokenizer;
1313
// I know I'm not good at Rust.
1414
// But at least it's functional.... I think.
1515

16-
struct UnityTranslateTokenizer {
16+
struct UnityTranslateTokenizer<'tokenizer> {
1717
sentence_piece_tokenizer: Option<SentencePieceTokenizer>,
18-
bpe_tokenizer: Option<BPETokenizer>
18+
bpe_tokenizer: Option<BPETokenizer<'tokenizer>>
1919
}
2020

21-
impl Tokenizer for UnityTranslateTokenizer {
21+
impl<'tokenizer> Tokenizer for UnityTranslateTokenizer<'tokenizer> {
2222
fn encode(&self, input: &str) -> anyhow::Result<Vec<String>> {
2323
if let Some(sp) = &self.sentence_piece_tokenizer {
2424
let result = sp.tokenize(input);
@@ -53,7 +53,7 @@ impl Tokenizer for UnityTranslateTokenizer {
5353
#[no_mangle]
5454
pub extern "system" fn Java_xyz_bluspring_unitytranslate_library_UnityTranslateLib_loadModel<'local>(
5555
mut env: JNIEnv<'local>, class: JClass<'local>,
56-
modelPath: JString<'local>, spModelPath: JString<'local>, bpeModelPath: JString<'local>,
56+
toLang: JString<'local>, modelPath: JString<'local>, spModelPath: JString<'local>, bpeModelPath: JString<'local>,
5757
useCuda: jboolean,
5858
) -> jlong {
5959
let modelPathValue: String = String::from(env.get_string(&modelPath).expect("Couldn't get java string value"));
@@ -73,7 +73,8 @@ pub extern "system" fn Java_xyz_bluspring_unitytranslate_library_UnityTranslateL
7373
} else if !bpeModelPath.is_null() {
7474
let bpeModelValue: String = String::from(env.get_string(&bpeModelPath).unwrap());
7575
let bpeModelData = fs::read_to_string(bpeModelValue).expect("Couldn't read BPE model file!");
76-
let tokenizer = BPETokenizer::new(bpeModelData.as_str());
76+
let toLangValue = String::from(env.get_string(&toLang).unwrap());
77+
let tokenizer = BPETokenizer::new(env, toLangValue.as_str(), bpeModelData.as_str());
7778

7879
UnityTranslateTokenizer { sentence_piece_tokenizer: None, bpe_tokenizer: Some(tokenizer) }
7980
} else {

0 commit comments

Comments
 (0)