[Rust] voicevox_tts と voicevox_wav_free の実装 (#186)

* implements create_accent_phrases
* implements synthesis
* implements synthesis_wave_format
* implements voicevox_tts and voicevox_wav_free
* resolve clippy warning
* 音声合成できるように修正
* wavのためのバッファのvecをwith_capacityでメモリ確保
* Internalのメソッドの引数としてCStrの代わりにstrを使う
* Dissolveを使わない
* UTF-8文字列としてデコードできない場合のエラーをハンドリングする
VOICEVOX · Jul 16, 2022 · ce9d36b
1 parent 0357912
commit ce9d36b
Show file tree

Hide file tree

Showing 8 changed files with 312 additions and 52 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml
@@ -16,6 +16,7 @@ anyhow = "1.0.57"
 cfg-if = "1.0.0"
 derive-getters = "0.2.0"
 derive-new = "0.5.9"
+libc = "0.2.126"
 once_cell = "1.10.0"
 onnxruntime = { git = "https://github.com/qwerty2501/onnxruntime-rs.git", version = "0.0.24" }
 serde = "1.0.137"

diff --git a/crates/voicevox_core/src/c_export.rs b/crates/voicevox_core/src/c_export.rs
@@ -1,5 +1,6 @@
 use super::*;
 use internal::Internal;
+use libc::c_void;
 use once_cell::sync::Lazy;
 use std::ffi::CStr;
 use std::os::raw::{c_char, c_int};
@@ -33,6 +34,8 @@ pub enum VoicevoxResultCode {
     VOICEVOX_RESULT_INVALID_SPEAKER_ID = 7,
     VOICEVOX_RESULT_INVALID_MODEL_INDEX = 8,
     VOICEVOX_RESULT_INFERENCE_FAILED = 9,
+    VOICEVOX_RESULT_FAILED_EXTRACT_FULL_CONTEXT_LABEL = 10,
+    VOICEVOX_RESULT_INVALID_UTF8_INPUT = 11,
 }
 
 fn convert_result<T>(result: Result<T>) -> (Option<T>, VoicevoxResultCode) {
@@ -73,6 +76,10 @@ fn convert_result<T>(result: Result<T>) -> (Option<T>, VoicevoxResultCode) {
                 Error::InferenceFailed => {
                     (None, VoicevoxResultCode::VOICEVOX_RESULT_INFERENCE_FAILED)
                 }
+                Error::FailedExtractFullContextLabel(_) => (
+                    None,
+                    VoicevoxResultCode::VOICEVOX_RESULT_FAILED_EXTRACT_FULL_CONTEXT_LABEL,
+                ),
             }
         }
     }
@@ -233,9 +240,13 @@ pub extern "C" fn decode_forward(
 
 #[no_mangle]
 pub extern "C" fn voicevox_load_openjtalk_dict(dict_path: *const c_char) -> VoicevoxResultCode {
-    let (_, result_code) = convert_result(
-        lock_internal().voicevox_load_openjtalk_dict(unsafe { CStr::from_ptr(dict_path) }),
-    );
+    let (_, result_code) = {
+        if let Ok(dict_path) = unsafe { CStr::from_ptr(dict_path) }.to_str() {
+            convert_result(lock_internal().voicevox_load_openjtalk_dict(dict_path))
+        } else {
+            (None, VoicevoxResultCode::VOICEVOX_RESULT_INVALID_UTF8_INPUT)
+        }
+    };
     result_code
 }
 
@@ -246,12 +257,21 @@ pub extern "C" fn voicevox_tts(
     output_binary_size: *mut c_int,
     output_wav: *mut *mut u8,
 ) -> VoicevoxResultCode {
-    let (_, result_code) = convert_result(lock_internal().voicevox_tts(
-        unsafe { CStr::from_ptr(text) },
-        speaker_id,
-        output_binary_size,
-        output_wav,
-    ));
+    let (output_opt, result_code) = {
+        if let Ok(text) = unsafe { CStr::from_ptr(text) }.to_str() {
+            convert_result(lock_internal().voicevox_tts(text, speaker_id as usize))
+        } else {
+            (None, VoicevoxResultCode::VOICEVOX_RESULT_INVALID_UTF8_INPUT)
+        }
+    };
+    if let Some(output) = output_opt {
+        unsafe {
+            output_binary_size.write(output.len() as c_int);
+            let wav_heap = libc::malloc(output.len());
+            libc::memcpy(wav_heap, output.as_ptr() as *const c_void, output.len());
+            output_wav.write(wav_heap as *mut u8);
+        }
+    }
     result_code
 }
 
@@ -272,9 +292,10 @@ pub extern "C" fn voicevox_tts_from_kana(
 }
 
 #[no_mangle]
-pub extern "C" fn voicevox_wav_free(wav: *mut u8) -> VoicevoxResultCode {
-    let (_, result_code) = convert_result(lock_internal().voicevox_wav_free(wav));
-    result_code
+pub extern "C" fn voicevox_wav_free(wav: *mut u8) {
+    unsafe {
+        libc::free(wav as *mut c_void);
+    }
 }
 
 #[no_mangle]

diff --git a/crates/voicevox_core/src/engine/full_context_label.rs b/crates/voicevox_core/src/engine/full_context_label.rs
@@ -40,11 +40,13 @@ static I3_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(@(\d+|xx)\+)").unwrap(
 static J1_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(/J:(\d+|xx)_)").unwrap());
 
 fn string_feature_by_regex(re: &Regex, label: &str) -> Result<String> {
-    re.find(label)
-        .map(|m| m.as_str().to_string())
-        .ok_or_else(|| FullContextLabelError::LabelParse {
+    if let Some(caps) = re.captures(label) {
+        Ok(caps.get(2).unwrap().as_str().to_string())
+    } else {
+        Err(FullContextLabelError::LabelParse {
             label: label.into(),
         })
+    }
 }
 
 #[allow(dead_code)]

diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs
@@ -9,5 +9,6 @@ mod synthesis_engine;
 use super::*;
 
 pub use acoustic_feature_extractor::*;
+pub use full_context_label::*;
 pub use model::*;
 pub use synthesis_engine::*;