diff --git a/openless-all/app/src-tauri/src/asr/bailian.rs b/openless-all/app/src-tauri/src/asr/bailian.rs new file mode 100644 index 00000000..484a9125 --- /dev/null +++ b/openless-all/app/src-tauri/src/asr/bailian.rs @@ -0,0 +1,592 @@ +//! Alibaba Cloud Bailian / DashScope realtime ASR client. +//! +//! Uses the classic DashScope realtime recognition WebSocket protocol +//! (`/api-ws/v1/inference`) because it accepts raw 16 kHz mono PCM frames and +//! matches OpenLess' recorder output directly. The Qwen OpenAI Realtime line is +//! a different protocol and is intentionally left for a follow-up provider. + +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use futures_util::{SinkExt, StreamExt}; +use parking_lot::Mutex as ParkingMutex; +use serde_json::{json, Value}; +use tokio::net::TcpStream; +use tokio::runtime::Handle; +use tokio::sync::{mpsc, oneshot, Mutex as AsyncMutex, Notify}; +use tokio_tungstenite::tungstenite::client::IntoClientRequest; +use tokio_tungstenite::tungstenite::http::header::HeaderValue; +use tokio_tungstenite::tungstenite::Message; +use tokio_tungstenite::{connect_async, MaybeTlsStream, WebSocketStream}; +use uuid::Uuid; + +use super::{AudioConsumer, RawTranscript}; + +pub const PROVIDER_ID: &str = "bailian"; +pub const DEFAULT_ENDPOINT: &str = "wss://dashscope.aliyuncs.com/api-ws/v1/inference/"; +pub const DEFAULT_MODEL: &str = "fun-asr-realtime"; + +/// 100 ms of 16 kHz / 16-bit / mono PCM. +pub const TARGET_AUDIO_CHUNK_BYTES: usize = 3_200; +const BYTES_PER_MS: u64 = 32; +const FINAL_RESULT_TIMEOUT: Duration = Duration::from_secs(12); + +type WsStream = WebSocketStream>; +type WsSink = futures_util::stream::SplitSink; +type SharedWriter = Arc>>; + +#[derive(Clone, Debug)] +pub struct BailianCredentials { + pub api_key: String, + pub endpoint: String, + pub model: String, + pub vocabulary_id: Option, +} + +impl BailianCredentials { + pub fn normalized_endpoint(&self) -> String { + if self.endpoint.trim().is_empty() { + return DEFAULT_ENDPOINT.to_string(); + } + self.endpoint.trim().to_string() + } + + pub fn normalized_model(&self) -> String { + let model = self.model.trim(); + if model.is_empty() { + DEFAULT_MODEL.to_string() + } else { + model.to_string() + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum BailianASRError { + #[error("credentials missing")] + CredentialsMissing, + #[error("connection failed: {0}")] + ConnectionFailed(String), + #[error("send failed: {0}")] + SendFailed(String), + #[error("task failed: {0}")] + TaskFailed(String), + #[error("no final result")] + NoFinalResult, + #[error("final result timed out")] + FinalResultTimeout, +} + +enum SendItem { + Audio(Vec), + Finish(oneshot::Sender>), +} + +#[derive(Default)] +struct SyncState { + task_id: String, + pending_audio: Vec, + audio_scratch: Vec, + bytes_received: u64, + task_started: bool, + task_finished: bool, + runtime: Option, + start: Option, + final_tx: Option>>, + send_tx: Option>, + final_segments: Vec, + last_result_text: String, +} + +pub struct BailianRealtimeASR { + credentials: BailianCredentials, + state: ParkingMutex, + writer: SharedWriter, + final_rx: ParkingMutex>>>, + task_started: Arc, +} + +impl BailianRealtimeASR { + pub fn new(credentials: BailianCredentials) -> Self { + Self { + credentials, + state: ParkingMutex::new(SyncState::default()), + writer: Arc::new(AsyncMutex::new(None)), + final_rx: ParkingMutex::new(None), + task_started: Arc::new(Notify::new()), + } + } + + pub async fn open_session(self: &Arc) -> Result<(), BailianASRError> { + if self.credentials.api_key.trim().is_empty() { + return Err(BailianASRError::CredentialsMissing); + } + + let task_id = Uuid::new_v4().simple().to_string(); + let endpoint = self.credentials.normalized_endpoint(); + let mut request = endpoint + .into_client_request() + .map_err(|e| BailianASRError::ConnectionFailed(e.to_string()))?; + request.headers_mut().insert( + "Authorization", + HeaderValue::from_str(&format!("bearer {}", self.credentials.api_key.trim())) + .map_err(|e| BailianASRError::ConnectionFailed(e.to_string()))?, + ); + + let (ws, _resp) = connect_async(request) + .await + .map_err(|e| BailianASRError::ConnectionFailed(e.to_string()))?; + let (write, read) = ws.split(); + *self.writer.lock().await = Some(write); + + let (final_tx, final_rx) = oneshot::channel(); + let (send_tx, mut send_rx) = mpsc::unbounded_channel::(); + { + let mut st = self.state.lock(); + *st = SyncState::default(); + st.task_id = task_id.clone(); + st.runtime = Some(Handle::current()); + st.start = Some(Instant::now()); + st.final_tx = Some(final_tx); + st.send_tx = Some(send_tx); + } + *self.final_rx.lock() = Some(final_rx); + + let writer_for_worker = Arc::clone(&self.writer); + let task_id_for_worker = task_id.clone(); + tokio::spawn(async move { + while let Some(item) = send_rx.recv().await { + match item { + SendItem::Audio(chunk) => { + if let Err(e) = send_binary(&writer_for_worker, chunk).await { + log::error!("[bailian-asr] audio frame send failed: {e}"); + } + } + SendItem::Finish(done) => { + let result = + send_text(&writer_for_worker, finish_task_message(&task_id_for_worker)) + .await + .map_err(|e| BailianASRError::SendFailed(e.to_string())); + let _ = done.send(result); + } + } + } + }); + + send_text( + &self.writer, + run_task_message( + &task_id, + &self.credentials.normalized_model(), + self.credentials.vocabulary_id.as_deref(), + ), + ) + .await?; + + let weak_self = Arc::downgrade(self); + tokio::spawn(async move { + let mut read = read; + while let Some(msg) = read.next().await { + let Some(this) = weak_self.upgrade() else { + break; + }; + match msg { + Ok(Message::Text(text)) => { + if !this.handle_text_message(&text) { + break; + } + } + Ok(Message::Close(_)) => { + this.finish_with_partial_or_error(BailianASRError::NoFinalResult); + break; + } + Ok(_) => {} + Err(e) => { + log::error!("[bailian-asr] receive loop error: {e}"); + this.finish_with_partial_or_error(BailianASRError::ConnectionFailed( + e.to_string(), + )); + break; + } + } + } + }); + + Ok(()) + } + + pub async fn send_last_frame(&self) -> Result<(), BailianASRError> { + let started = self.task_started.notified(); + tokio::pin!(started); + started.as_mut().enable(); + let ready = { + let st = self.state.lock(); + st.task_started || st.task_finished + }; + if !ready { + tokio::time::timeout(Duration::from_secs(5), started) + .await + .map_err(|_| BailianASRError::FinalResultTimeout)?; + } + let (send_tx, tail_chunks) = { + let mut st = self.state.lock(); + let send_tx = st.send_tx.clone(); + if !st.pending_audio.is_empty() { + let pending = std::mem::take(&mut st.pending_audio); + st.audio_scratch.extend_from_slice(&pending); + } + let tail = if st.audio_scratch.is_empty() { + Vec::new() + } else { + vec![std::mem::take(&mut st.audio_scratch)] + }; + (send_tx, tail) + }; + let Some(send_tx) = send_tx else { + return Ok(()); + }; + for chunk in tail_chunks { + let _ = send_tx.send(SendItem::Audio(chunk)); + } + let (done_tx, done_rx) = oneshot::channel(); + send_tx + .send(SendItem::Finish(done_tx)) + .map_err(|_| BailianASRError::SendFailed("send worker closed".to_string()))?; + done_rx + .await + .map_err(|_| BailianASRError::SendFailed("finish ack dropped".to_string()))? + } + + pub async fn await_final_result(&self) -> Result { + let rx = self.final_rx.lock().take(); + let Some(rx) = rx else { + return Err(BailianASRError::NoFinalResult); + }; + tokio::time::timeout(FINAL_RESULT_TIMEOUT, rx) + .await + .map_err(|_| BailianASRError::FinalResultTimeout)? + .map_err(|_| BailianASRError::NoFinalResult)? + } + + pub fn cancel(&self) { + let mut st = self.state.lock(); + st.pending_audio.clear(); + st.audio_scratch.clear(); + st.send_tx.take(); + st.final_tx.take(); + st.task_finished = true; + drop(st); + let writer = Arc::clone(&self.writer); + if let Ok(handle) = Handle::try_current() { + handle.spawn(async move { + let _ = close_writer(&writer).await; + }); + } else { + std::thread::spawn(move || { + if let Ok(rt) = tokio::runtime::Runtime::new() { + rt.block_on(async move { + let _ = close_writer(&writer).await; + }); + } + }); + } + } + + fn handle_text_message(&self, text: &str) -> bool { + let value: Value = match serde_json::from_str(text) { + Ok(v) => v, + Err(e) => { + log::warn!("[bailian-asr] invalid json event: {e}"); + return true; + } + }; + let event = value + .get("header") + .and_then(|h| h.get("event")) + .and_then(Value::as_str) + .unwrap_or_default(); + match event { + "task-started" => { + self.mark_task_started(); + true + } + "result-generated" => { + self.record_result(&value); + true + } + "task-finished" => { + self.finish_success(); + false + } + "task-failed" => { + let message = value + .get("header") + .and_then(|h| h.get("error_message")) + .and_then(Value::as_str) + .unwrap_or("task failed") + .to_string(); + self.finish_error(BailianASRError::TaskFailed(message)); + false + } + _ => true, + } + } + + fn mark_task_started(&self) { + let (send_tx, chunks) = { + let mut st = self.state.lock(); + st.task_started = true; + if !st.pending_audio.is_empty() { + let pending = std::mem::take(&mut st.pending_audio); + st.audio_scratch.extend_from_slice(&pending); + } + let send_tx = st.send_tx.clone(); + let chunks = drain_audio_chunks(&mut st.audio_scratch); + (send_tx, chunks) + }; + if let Some(tx) = send_tx { + for chunk in chunks { + let _ = tx.send(SendItem::Audio(chunk)); + } + } + self.task_started.notify_waiters(); + } + + fn record_result(&self, value: &Value) { + let sentence = value + .get("payload") + .and_then(|p| p.get("output")) + .and_then(|o| o.get("sentence")); + let Some(sentence) = sentence else { + return; + }; + let Some(text) = sentence.get("text").and_then(Value::as_str) else { + return; + }; + let trimmed = text.trim(); + if trimmed.is_empty() { + return; + } + let is_sentence_final = sentence.get("end_time").is_some(); + let mut st = self.state.lock(); + st.last_result_text = trimmed.to_string(); + if is_sentence_final && st.final_segments.last().map(|s| s.as_str()) != Some(trimmed) { + st.final_segments.push(trimmed.to_string()); + } + } + + fn finish_success(&self) { + let (tx, text, duration_ms) = { + let mut st = self.state.lock(); + if st.task_finished { + return; + } + st.task_finished = true; + st.send_tx.take(); + let text = if st.final_segments.is_empty() { + st.last_result_text.clone() + } else { + st.final_segments.join("") + }; + let duration_ms = if st.bytes_received > 0 { + st.bytes_received / BYTES_PER_MS + } else { + st.start + .map(|start| start.elapsed().as_millis() as u64) + .unwrap_or_default() + }; + (st.final_tx.take(), text, duration_ms) + }; + if let Some(tx) = tx { + let _ = tx.send(Ok(RawTranscript { text, duration_ms })); + } + self.close_on_runtime(); + } + + fn finish_with_partial_or_error(&self, error: BailianASRError) { + let has_partial = { + let st = self.state.lock(); + !st.last_result_text.trim().is_empty() || !st.final_segments.is_empty() + }; + if has_partial { + // 与 Volcengine 保持一致:连接异常但已有 partial 时优先兜底返回,避免丢失用户已识别出的内容。 + self.finish_success(); + } else { + self.finish_error(error); + } + } + + fn finish_error(&self, error: BailianASRError) { + let tx = { + let mut st = self.state.lock(); + if st.task_finished { + return; + } + st.task_finished = true; + st.send_tx.take(); + st.final_tx.take() + }; + if let Some(tx) = tx { + let _ = tx.send(Err(error)); + } + self.close_on_runtime(); + } + + fn close_on_runtime(&self) { + let writer = Arc::clone(&self.writer); + if let Some(handle) = self.state.lock().runtime.clone() { + handle.spawn(async move { + let _ = close_writer(&writer).await; + }); + } + } +} + +impl AudioConsumer for BailianRealtimeASR { + fn consume_pcm_chunk(&self, pcm: &[u8]) { + if pcm.is_empty() { + return; + } + let (send_tx, chunks) = { + let mut st = self.state.lock(); + st.bytes_received = st.bytes_received.saturating_add(pcm.len() as u64); + if !st.task_started { + st.pending_audio.extend_from_slice(pcm); + return; + } + st.audio_scratch.extend_from_slice(pcm); + let chunks = drain_audio_chunks(&mut st.audio_scratch); + (st.send_tx.clone(), chunks) + }; + if let Some(tx) = send_tx { + for chunk in chunks { + let _ = tx.send(SendItem::Audio(chunk)); + } + } + } +} + +fn drain_audio_chunks(buffer: &mut Vec) -> Vec> { + let mut chunks = Vec::new(); + while buffer.len() >= TARGET_AUDIO_CHUNK_BYTES { + chunks.push(buffer.drain(..TARGET_AUDIO_CHUNK_BYTES).collect()); + } + chunks +} + +fn run_task_message(task_id: &str, model: &str, vocabulary_id: Option<&str>) -> String { + let mut parameters = json!({ + "sample_rate": 16000, + "format": "pcm" + }); + if let Some(vocabulary_id) = vocabulary_id.map(str::trim).filter(|id| !id.is_empty()) { + parameters["vocabulary_id"] = Value::String(vocabulary_id.to_string()); + } + + json!({ + "header": { + "action": "run-task", + "task_id": task_id, + "streaming": "duplex" + }, + "payload": { + "task_group": "audio", + "task": "asr", + "function": "recognition", + "model": model, + "parameters": parameters, + "input": {} + } + }) + .to_string() +} + +fn finish_task_message(task_id: &str) -> String { + json!({ + "header": { + "action": "finish-task", + "task_id": task_id, + "streaming": "duplex" + }, + "payload": { "input": {} } + }) + .to_string() +} + +async fn send_text(writer: &SharedWriter, text: String) -> Result<(), BailianASRError> { + let mut guard = writer.lock().await; + let Some(ws) = guard.as_mut() else { + return Err(BailianASRError::ConnectionFailed( + "websocket writer not available".to_string(), + )); + }; + ws.send(Message::Text(text)) + .await + .map_err(|e| BailianASRError::SendFailed(e.to_string())) +} + +async fn send_binary(writer: &SharedWriter, data: Vec) -> Result<(), BailianASRError> { + let mut guard = writer.lock().await; + let Some(ws) = guard.as_mut() else { + return Err(BailianASRError::ConnectionFailed( + "websocket writer not available".to_string(), + )); + }; + ws.send(Message::Binary(data)) + .await + .map_err(|e| BailianASRError::SendFailed(e.to_string())) +} + +async fn close_writer(writer: &SharedWriter) -> Result<(), BailianASRError> { + let mut guard = writer.lock().await; + if let Some(mut ws) = guard.take() { + let _ = ws.close().await; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn credentials_apply_default_endpoint_and_model() { + let creds = BailianCredentials { + api_key: "sk-test".to_string(), + endpoint: "".to_string(), + model: "".to_string(), + vocabulary_id: None, + }; + assert_eq!(creds.normalized_endpoint(), DEFAULT_ENDPOINT); + assert_eq!(creds.normalized_model(), DEFAULT_MODEL); + } + + #[test] + fn run_task_message_uses_pcm_16k() { + let value: Value = + serde_json::from_str(&run_task_message("abc", DEFAULT_MODEL, None)).unwrap(); + assert_eq!(value["header"]["action"], "run-task"); + assert_eq!(value["payload"]["model"], DEFAULT_MODEL); + assert_eq!(value["payload"]["parameters"]["sample_rate"], 16000); + assert_eq!(value["payload"]["parameters"]["format"], "pcm"); + assert!(value["payload"]["parameters"]["vocabulary_id"].is_null()); + } + + #[test] + fn run_task_message_includes_optional_vocabulary_id() { + let value: Value = + serde_json::from_str(&run_task_message("abc", DEFAULT_MODEL, Some(" vocab-123 "))) + .unwrap(); + assert_eq!(value["payload"]["parameters"]["vocabulary_id"], "vocab-123"); + } + + #[test] + fn drain_audio_chunks_keeps_tail_buffered() { + let mut buffer = vec![1u8; TARGET_AUDIO_CHUNK_BYTES * 2 + 17]; + let chunks = drain_audio_chunks(&mut buffer); + assert_eq!(chunks.len(), 2); + assert_eq!(chunks[0].len(), TARGET_AUDIO_CHUNK_BYTES); + assert_eq!(chunks[1].len(), TARGET_AUDIO_CHUNK_BYTES); + assert_eq!(buffer.len(), 17); + } +} diff --git a/openless-all/app/src-tauri/src/asr/mod.rs b/openless-all/app/src-tauri/src/asr/mod.rs index 203cdea9..4433556d 100644 --- a/openless-all/app/src-tauri/src/asr/mod.rs +++ b/openless-all/app/src-tauri/src/asr/mod.rs @@ -5,12 +5,14 @@ //! `frame.rs` (binary frame codec) and the session lifecycle in //! `volcengine.rs`. +pub mod bailian; mod frame; pub mod local; pub mod volcengine; pub mod wav; pub mod whisper; +pub use bailian::{BailianCredentials, BailianRealtimeASR}; pub use volcengine::{VolcengineCredentials, VolcengineStreamingASR}; pub use whisper::WhisperBatchASR; diff --git a/openless-all/app/src-tauri/src/commands.rs b/openless-all/app/src-tauri/src/commands.rs index 723d9ea6..9a79386f 100644 --- a/openless-all/app/src-tauri/src/commands.rs +++ b/openless-all/app/src-tauri/src/commands.rs @@ -381,6 +381,9 @@ fn asr_configured_for_provider(provider: &str, snap: &CredentialsSnapshot) -> bo // 本地 ASR 不依赖云端凭据。 return true; } + if provider == crate::asr::bailian::PROVIDER_ID { + return configured(&snap.asr_api_key); + } configured(&snap.asr_endpoint) && configured(&snap.asr_model) } @@ -539,6 +542,11 @@ pub async fn validate_provider_credentials(kind: String) -> Result Result { + if kind == "asr" && CredentialsVault::get_active_asr() == crate::asr::bailian::PROVIDER_ID { + return Ok(ProviderModelsResult { + models: vec![crate::asr::bailian::DEFAULT_MODEL.to_string()], + }); + } let config = read_openai_provider_config(&kind)?; fetch_provider_models(&config) .await @@ -619,6 +627,10 @@ async fn validate_asr_provider() -> Result<(), String> { return Ok(()); } + if active_asr == crate::asr::bailian::PROVIDER_ID { + return validate_bailian_asr_provider().await; + } + let config = read_openai_provider_config("asr")?; let model = CredentialsVault::get(CredentialAccount::AsrModel) .map_err(|e| e.to_string())? @@ -627,6 +639,44 @@ async fn validate_asr_provider() -> Result<(), String> { validate_asr_transcription(&config, model.trim()).await } +async fn validate_bailian_asr_provider() -> Result<(), String> { + let api_key = CredentialsVault::get(CredentialAccount::AsrApiKey) + .map_err(|e| e.to_string())? + .unwrap_or_default(); + if api_key.trim().is_empty() { + return Err("API Key 为空".to_string()); + } + let endpoint = CredentialsVault::get(CredentialAccount::AsrEndpoint) + .map_err(|e| e.to_string())? + .filter(|s| !s.trim().is_empty()) + .unwrap_or_else(|| crate::asr::bailian::DEFAULT_ENDPOINT.to_string()); + let model = CredentialsVault::get(CredentialAccount::AsrModel) + .map_err(|e| e.to_string())? + .filter(|s| !s.trim().is_empty()) + .unwrap_or_else(|| crate::asr::bailian::DEFAULT_MODEL.to_string()); + let vocabulary_id = CredentialsVault::get(CredentialAccount::AsrVocabularyId) + .map_err(|e| e.to_string())? + .filter(|s| !s.trim().is_empty()); + let asr = std::sync::Arc::new(crate::asr::BailianRealtimeASR::new( + crate::asr::BailianCredentials { + api_key, + endpoint, + model, + vocabulary_id, + }, + )); + asr.open_session().await.map_err(|e| e.to_string())?; + crate::asr::AudioConsumer::consume_pcm_chunk( + &*asr, + &vec![0u8; crate::asr::bailian::TARGET_AUDIO_CHUNK_BYTES], + ); + asr.send_last_frame().await.map_err(|e| e.to_string())?; + asr.await_final_result() + .await + .map(|_| ()) + .map_err(|e| e.to_string()) +} + fn active_asr_is_keyless_for_validation(provider: &str) -> bool { provider == crate::asr::local::PROVIDER_ID || active_foundry_asr_is_supported(provider) } @@ -821,6 +871,7 @@ fn parse_account(s: &str) -> Result { "asr.api_key" => Ok(CredentialAccount::AsrApiKey), "asr.endpoint" => Ok(CredentialAccount::AsrEndpoint), "asr.model" => Ok(CredentialAccount::AsrModel), + "asr.vocabulary_id" => Ok(CredentialAccount::AsrVocabularyId), _ => Err(format!("unknown account: {s}")), } } @@ -1784,6 +1835,10 @@ mod tests { ..snapshot() }; assert!(!asr_configured_for_provider("whisper", &whisper_key_only)); + assert!(asr_configured_for_provider( + crate::asr::bailian::PROVIDER_ID, + &whisper_key_only + )); let whisper_keyless_ready = CredentialsSnapshot { asr_endpoint: Some("https://api.openai.com/v1".into()), @@ -1794,6 +1849,10 @@ mod tests { "whisper", &whisper_keyless_ready )); + assert!(!asr_configured_for_provider( + crate::asr::bailian::PROVIDER_ID, + &whisper_keyless_ready + )); assert!(asr_configured_for_provider( crate::asr::local::PROVIDER_ID, diff --git a/openless-all/app/src-tauri/src/coordinator.rs b/openless-all/app/src-tauri/src/coordinator.rs index 5eb88e0c..801bbfae 100644 --- a/openless-all/app/src-tauri/src/coordinator.rs +++ b/openless-all/app/src-tauri/src/coordinator.rs @@ -19,8 +19,8 @@ use uuid::Uuid; #[cfg(target_os = "windows")] use crate::asr::local::{foundry, FoundryLocalRuntime, FoundryLocalWhisperAsr}; use crate::asr::{ - DictionaryHotword, RawTranscript, VolcengineCredentials, VolcengineStreamingASR, - WhisperBatchASR, + BailianCredentials, BailianRealtimeASR, DictionaryHotword, RawTranscript, + VolcengineCredentials, VolcengineStreamingASR, WhisperBatchASR, }; use crate::combo_hotkey::{ComboHotkeyError, ComboHotkeyEvent, ComboHotkeyMonitor}; use crate::coordinator_state::{ @@ -69,6 +69,7 @@ use resources::{ enum ActiveAsr { Volcengine(Arc), Whisper(Arc), + Bailian(Arc), #[cfg(target_os = "windows")] FoundryLocalWhisper(Arc), /// 本地 Qwen3-ASR;只在 macOS + 模型已下载时可达。 @@ -1785,7 +1786,7 @@ fn ensure_asr_credentials() -> Result<(), String> { return Ok(()); } - if is_whisper_compatible_provider(&active_asr) { + if is_whisper_compatible_provider(&active_asr) || is_bailian_provider(&active_asr) { let api_key = CredentialsVault::get(CredentialAccount::AsrApiKey) .ok() .flatten() @@ -1918,6 +1919,10 @@ fn is_whisper_compatible_provider(id: &str) -> bool { matches!(id, "whisper" | "siliconflow" | "zhipu" | "groq") } +fn is_bailian_provider(id: &str) -> bool { + id == crate::asr::bailian::PROVIDER_ID +} + fn apply_chinese_script_preference(text: &str, pref: ChineseScriptPreference) -> String { if text.is_empty() { return String::new(); @@ -2100,6 +2105,33 @@ fn read_whisper_credentials() -> (String, String, String) { (api_key, base_url, model) } +fn read_bailian_credentials() -> BailianCredentials { + let api_key = CredentialsVault::get(CredentialAccount::AsrApiKey) + .ok() + .flatten() + .unwrap_or_default(); + let endpoint = CredentialsVault::get(CredentialAccount::AsrEndpoint) + .ok() + .flatten() + .filter(|s| !s.trim().is_empty()) + .unwrap_or_else(|| crate::asr::bailian::DEFAULT_ENDPOINT.to_string()); + let model = CredentialsVault::get(CredentialAccount::AsrModel) + .ok() + .flatten() + .filter(|s| !s.trim().is_empty()) + .unwrap_or_else(|| crate::asr::bailian::DEFAULT_MODEL.to_string()); + let vocabulary_id = CredentialsVault::get(CredentialAccount::AsrVocabularyId) + .ok() + .flatten() + .filter(|s| !s.trim().is_empty()); + BailianCredentials { + api_key, + endpoint, + model, + vocabulary_id, + } +} + fn read_volc_credentials() -> VolcengineCredentials { let app_id = CredentialsVault::get(CredentialAccount::VolcengineAppKey) .ok() diff --git a/openless-all/app/src-tauri/src/coordinator/dictation.rs b/openless-all/app/src-tauri/src/coordinator/dictation.rs index a45a0c5f..b250410b 100644 --- a/openless-all/app/src-tauri/src/coordinator/dictation.rs +++ b/openless-all/app/src-tauri/src/coordinator/dictation.rs @@ -216,7 +216,79 @@ pub(super) async fn begin_session(inner: &Arc) -> Result<(), String> { return Ok(()); } - if is_whisper_compatible_provider(&active_asr) { + if is_bailian_provider(&active_asr) { + let asr = Arc::new(BailianRealtimeASR::new(read_bailian_credentials())); + let bridge = Arc::new(DeferredAsrBridge::new()); + let consumer: Arc = bridge.clone(); + store_asr_for_session( + inner, + current_session_id, + ActiveAsr::Bailian(Arc::clone(&asr)), + ); + start_recorder_for_starting(inner, current_session_id, &active_asr, consumer)?; + + if let Err(e) = asr.open_session().await { + log::error!("[coord] open Bailian ASR session failed: {e}"); + match startup_race_status_for_starting(inner, current_session_id) { + StartupRaceStatus::StaleContinuation => { + log::info!( + "[coord] stale Bailian ASR open_session error from session {current_session_id} — ignoring" + ); + asr.cancel(); + discard_startup_resources_for_session(inner, current_session_id); + restore_prepared_windows_ime_session(inner, current_session_id); + return Ok(()); + } + StartupRaceStatus::CancelRaced => { + asr.cancel(); + discard_startup_resources_for_session(inner, current_session_id); + restore_prepared_windows_ime_session(inner, current_session_id); + set_phase_idle_if_session_matches(inner, current_session_id); + return Ok(()); + } + StartupRaceStatus::ActiveStarting => { + asr.cancel(); + } + } + discard_startup_resources_for_session(inner, current_session_id); + emit_capsule( + inner, + CapsuleState::Error, + 0.0, + 0, + Some(format!("ASR 连接失败: {e}")), + None, + ); + restore_prepared_windows_ime_session(inner, current_session_id); + set_phase_idle_if_session_matches(inner, current_session_id); + schedule_capsule_idle(inner, CAPSULE_AUTO_HIDE_DELAY_MS); + return Err(e.to_string()); + } + match startup_race_status_for_starting(inner, current_session_id) { + StartupRaceStatus::ActiveStarting => {} + StartupRaceStatus::CancelRaced => { + log::info!("[coord] cancel raced during Bailian ASR open_session — aborting begin"); + asr.cancel(); + discard_startup_resources_for_session(inner, current_session_id); + restore_prepared_windows_ime_session(inner, current_session_id); + set_phase_idle_if_session_matches(inner, current_session_id); + return Ok(()); + } + StartupRaceStatus::StaleContinuation => { + log::info!( + "[coord] stale Bailian ASR open_session continuation from session {current_session_id} — ignoring" + ); + asr.cancel(); + discard_startup_resources_for_session(inner, current_session_id); + restore_prepared_windows_ime_session(inner, current_session_id); + return Ok(()); + } + } + let target: Arc = asr; + let flushed_bytes = bridge.attach(target); + log::info!("[coord] Bailian ASR connected; flushed {flushed_bytes} deferred audio bytes"); + finish_starting_session(inner, current_session_id).await; + } else if is_whisper_compatible_provider(&active_asr) { let (api_key, base_url, model) = read_whisper_credentials(); // 用户辞書の有効フレーズを Whisper の `prompt` に流し込む。固有名詞や // 専門用語の同音・近形誤認識を ASR 段階で抑える。Polish LLM 側には @@ -627,6 +699,50 @@ pub(super) async fn end_session(inner: &Arc) -> Result<(), String> { } } } + ActiveAsr::Bailian(asr) => { + debug_assert!(uses_global_timeout); + if let Err(e) = asr.send_last_frame().await { + log::error!("[coord] Bailian send last frame failed: {e}"); + } + let timeout_duration = std::time::Duration::from_secs(COORDINATOR_GLOBAL_TIMEOUT_SECS); + match tokio::time::timeout(timeout_duration, asr.await_final_result()).await { + Ok(Ok(r)) => r, + Ok(Err(e)) => { + log::error!("[coord] Bailian await final failed: {e}"); + emit_capsule( + inner, + CapsuleState::Error, + 0.0, + elapsed, + Some(format!("识别失败: {e}")), + None, + ); + restore_prepared_windows_ime_session(inner, current_session_id); + inner.state.lock().phase = SessionPhase::Idle; + schedule_capsule_idle(inner, CAPSULE_AUTO_HIDE_DELAY_MS); + return Err(e.to_string()); + } + Err(_) => { + log::error!( + "[coord] Bailian 全局超时 {} 秒", + COORDINATOR_GLOBAL_TIMEOUT_SECS + ); + asr.cancel(); + emit_capsule( + inner, + CapsuleState::Error, + 0.0, + elapsed, + Some("识别超时".to_string()), + None, + ); + restore_prepared_windows_ime_session(inner, current_session_id); + inner.state.lock().phase = SessionPhase::Idle; + schedule_capsule_idle(inner, CAPSULE_AUTO_HIDE_DELAY_MS); + return Err("bailian global timeout".to_string()); + } + } + } #[cfg(target_os = "windows")] ActiveAsr::FoundryLocalWhisper(local) => { debug_assert!(!uses_global_timeout); diff --git a/openless-all/app/src-tauri/src/coordinator/resources.rs b/openless-all/app/src-tauri/src/coordinator/resources.rs index e04bcd3d..14d60062 100644 --- a/openless-all/app/src-tauri/src/coordinator/resources.rs +++ b/openless-all/app/src-tauri/src/coordinator/resources.rs @@ -67,6 +67,7 @@ pub(super) fn cancel_active_asr(asr: ActiveAsr) { match asr { ActiveAsr::Volcengine(v) => v.cancel(), ActiveAsr::Whisper(w) => w.cancel(), + ActiveAsr::Bailian(b) => b.cancel(), #[cfg(target_os = "windows")] ActiveAsr::FoundryLocalWhisper(local) => local.cancel(), #[cfg(target_os = "macos")] diff --git a/openless-all/app/src-tauri/src/persistence.rs b/openless-all/app/src-tauri/src/persistence.rs index a70362e5..d2175dd4 100644 --- a/openless-all/app/src-tauri/src/persistence.rs +++ b/openless-all/app/src-tauri/src/persistence.rs @@ -166,7 +166,7 @@ fn read_or_default Deserialize<'de> + Default>(path: &Path) -> Resul // "version": 1, // "active": { "asr": "", "llm": "" }, // "providers": { -// "asr": { "": { "appKey", "accessKey", "resourceId", "apiKey", "baseURL", "model" } }, +// "asr": { "": { "appKey", "accessKey", "resourceId", "apiKey", "baseURL", "model", "vocabularyId" } }, // "llm": { "": { "displayName", "apiKey", "baseURL", "model", "temperature", "extraHeaders" } } // } // } @@ -244,6 +244,8 @@ struct CredsAsrEntry { accessKey: Option, #[serde(skip_serializing_if = "Option::is_none")] resourceId: Option, + #[serde(skip_serializing_if = "Option::is_none")] + vocabularyId: Option, } impl CredsAsrEntry { @@ -254,6 +256,7 @@ impl CredsAsrEntry { && self.appKey.as_deref().unwrap_or("").is_empty() && self.accessKey.as_deref().unwrap_or("").is_empty() && self.resourceId.as_deref().unwrap_or("").is_empty() + && self.vocabularyId.as_deref().unwrap_or("").is_empty() } } @@ -532,7 +535,7 @@ fn migrate_legacy_sources_for_update() -> Result { fn load_credentials() -> CredsRoot { match load_keyring_credentials() { Ok(Some(root)) => { - // 不在这里调 remove_legacy_keyring_credentials() —— 它内部对 9 个 + // 不在这里调 remove_legacy_keyring_credentials() —— 它内部对每个 // 旧 account 各做一次 keyring delete,每次 delete 在 macOS Keychain // 上仍要触发 ACL 检查。第一次成功 load 时 legacy entries 通常已经 // 被 migrate_legacy_sources_for_update 清理过了;这里若再无脑跑, @@ -632,6 +635,7 @@ fn lookup_account(root: &CredsRoot, account: CredentialAccount) -> Option asr.and_then(|e| pick(&e.apiKey)), CredentialAccount::AsrEndpoint => asr.and_then(|e| pick(&e.baseURL)), CredentialAccount::AsrModel => asr.and_then(|e| pick(&e.model)), + CredentialAccount::AsrVocabularyId => asr.and_then(|e| pick(&e.vocabularyId)), } } @@ -676,6 +680,10 @@ fn write_account(root: &mut CredsRoot, account: CredentialAccount, value: Option let entry = root.providers.asr.entry(asr_id).or_default(); entry.model = normalized; } + CredentialAccount::AsrVocabularyId => { + let entry = root.providers.asr.entry(asr_id).or_default(); + entry.vocabularyId = normalized; + } } } @@ -983,6 +991,8 @@ pub enum CredentialAccount { AsrEndpoint, /// Active ASR provider's model name. AsrModel, + /// Active ASR provider's optional hotword vocabulary ID. + AsrVocabularyId, } impl CredentialAccount { @@ -1000,6 +1010,7 @@ impl CredentialAccount { CredentialAccount::AsrApiKey => "asr.api_key", CredentialAccount::AsrEndpoint => "asr.endpoint", CredentialAccount::AsrModel => "asr.model", + CredentialAccount::AsrVocabularyId => "asr.vocabulary_id", } } @@ -1014,6 +1025,7 @@ impl CredentialAccount { CredentialAccount::AsrApiKey, CredentialAccount::AsrEndpoint, CredentialAccount::AsrModel, + CredentialAccount::AsrVocabularyId, ] } } diff --git a/openless-all/app/src/i18n/en.ts b/openless-all/app/src/i18n/en.ts index 17feb523..084df369 100644 --- a/openless-all/app/src/i18n/en.ts +++ b/openless-all/app/src/i18n/en.ts @@ -353,6 +353,7 @@ export const en: typeof zhCN = { codingPlanX: 'CodingPlanX', custom: 'Custom', asrVolcengine: 'Volcengine bigasr', + asrBailian: 'Alibaba Bailian realtime ASR', asrSiliconflow: 'SiliconFlow SenseVoice', asrZhipu: 'Zhipu GLM-ASR', asrGroq: 'Groq Whisper-large-v3', @@ -378,6 +379,8 @@ export const en: typeof zhCN = { apiKeyLabel: 'API Key', baseUrlLabel: 'Base URL', modelLabel: 'Model', + bailianVocabularyIdLabel: 'Hotword Vocabulary ID (optional)', + bailianVocabularyIdNote: 'If you have created a DashScope hotword vocabulary, enter its vocab-... ID. Leave blank to skip hotwords.', appIdLabel: 'App ID', accessKeyLabel: 'Access Key', resourceIdLabel: 'Resource ID', diff --git a/openless-all/app/src/i18n/ja.ts b/openless-all/app/src/i18n/ja.ts index b5b3ec28..e3e8684a 100644 --- a/openless-all/app/src/i18n/ja.ts +++ b/openless-all/app/src/i18n/ja.ts @@ -355,6 +355,7 @@ export const ja: typeof zhCN = { codingPlanX: 'CodingPlanX', custom: 'カスタム', asrVolcengine: 'Volcengine bigasr', + asrBailian: 'Alibaba Bailian リアルタイム ASR', asrSiliconflow: 'SiliconFlow SenseVoice', asrZhipu: 'Zhipu GLM-ASR', asrGroq: 'Groq Whisper-large-v3', @@ -380,6 +381,8 @@ export const ja: typeof zhCN = { apiKeyLabel: 'API キー', baseUrlLabel: 'エンドポイント', modelLabel: 'モデル', + bailianVocabularyIdLabel: 'ホットワード Vocabulary ID(任意)', + bailianVocabularyIdNote: 'DashScope でホットワード辞書を作成済みの場合は vocab-... ID を入力します。空欄なら送信しません。', appIdLabel: 'App ID(アプリケーション ID)', accessKeyLabel: 'Access Key', resourceIdLabel: 'Resource ID', diff --git a/openless-all/app/src/i18n/ko.ts b/openless-all/app/src/i18n/ko.ts index 9518e2af..086f5f1c 100644 --- a/openless-all/app/src/i18n/ko.ts +++ b/openless-all/app/src/i18n/ko.ts @@ -355,6 +355,7 @@ export const ko: typeof zhCN = { codingPlanX: 'CodingPlanX', custom: '사용자 정의', asrVolcengine: 'Volcengine bigasr', + asrBailian: 'Alibaba Bailian 실시간 ASR', asrSiliconflow: 'SiliconFlow SenseVoice', asrZhipu: 'Zhipu GLM-ASR', asrGroq: 'Groq Whisper-large-v3', @@ -380,6 +381,8 @@ export const ko: typeof zhCN = { apiKeyLabel: 'API 키', baseUrlLabel: '엔드포인트', modelLabel: '모델', + bailianVocabularyIdLabel: '핫워드 Vocabulary ID(선택)', + bailianVocabularyIdNote: 'DashScope에서 핫워드 사전을 만들었다면 vocab-... ID를 입력하세요. 비워 두면 핫워드를 전송하지 않습니다.', appIdLabel: 'App ID(애플리케이션 ID)', accessKeyLabel: 'Access Key', resourceIdLabel: 'Resource ID', diff --git a/openless-all/app/src/i18n/zh-CN.ts b/openless-all/app/src/i18n/zh-CN.ts index 265449a2..c025274d 100644 --- a/openless-all/app/src/i18n/zh-CN.ts +++ b/openless-all/app/src/i18n/zh-CN.ts @@ -351,6 +351,7 @@ export const zhCN = { codingPlanX: 'CodingPlanX', custom: '自定义', asrVolcengine: '火山引擎 bigasr', + asrBailian: '阿里云百炼实时 ASR', asrSiliconflow: '硅基流动 SenseVoice', asrZhipu: '智谱 GLM-ASR', asrGroq: 'Groq Whisper-large-v3', @@ -376,6 +377,8 @@ export const zhCN = { apiKeyLabel: 'API 密钥', baseUrlLabel: '接口地址', modelLabel: '模型', + bailianVocabularyIdLabel: '热词 Vocabulary ID(可选)', + bailianVocabularyIdNote: '如已在百炼创建热词表,可填写 vocab-...;留空则不下发热词。', appIdLabel: 'App ID(应用 ID)', accessKeyLabel: 'Access Key', resourceIdLabel: '资源 ID', diff --git a/openless-all/app/src/i18n/zh-TW.ts b/openless-all/app/src/i18n/zh-TW.ts index 21dc2d80..4ab3b5a7 100644 --- a/openless-all/app/src/i18n/zh-TW.ts +++ b/openless-all/app/src/i18n/zh-TW.ts @@ -353,6 +353,7 @@ export const zhTW: typeof zhCN = { codingPlanX: 'CodingPlanX', custom: '自定義', asrVolcengine: '火山引擎 bigasr', + asrBailian: '阿里雲百煉即時 ASR', asrSiliconflow: '硅基流動 SenseVoice', asrZhipu: '智譜 GLM-ASR', asrGroq: 'Groq Whisper-large-v3', @@ -378,6 +379,8 @@ export const zhTW: typeof zhCN = { apiKeyLabel: 'API 密鑰', baseUrlLabel: '接口地址', modelLabel: '模型', + bailianVocabularyIdLabel: '熱詞 Vocabulary ID(可選)', + bailianVocabularyIdNote: '如已在百煉建立熱詞表,可填寫 vocab-...;留空則不下發熱詞。', appIdLabel: 'App ID(應用 ID)', accessKeyLabel: 'Access Key', resourceIdLabel: '資源 ID', diff --git a/openless-all/app/src/pages/Overview.tsx b/openless-all/app/src/pages/Overview.tsx index e80427ff..8316bb68 100644 --- a/openless-all/app/src/pages/Overview.tsx +++ b/openless-all/app/src/pages/Overview.tsx @@ -25,6 +25,7 @@ interface OverviewProps { const ASR_NAME_KEY_BY_ID: Record = { volcengine: 'asrVolcengine', + bailian: 'asrBailian', siliconflow: 'asrSiliconflow', zhipu: 'asrZhipu', groq: 'asrGroq', diff --git a/openless-all/app/src/pages/Settings.tsx b/openless-all/app/src/pages/Settings.tsx index 60b9ede3..565578f8 100644 --- a/openless-all/app/src/pages/Settings.tsx +++ b/openless-all/app/src/pages/Settings.tsx @@ -1182,13 +1182,15 @@ type LlmPresetId = typeof LLM_PRESETS[number]['id']; const ASR_DEFAULT_RESOURCE_ID = 'volc.seedasr.sauc.duration'; -// `volcengine` 走自建流式客户端;其余走 OpenAI 兼容 `/audio/transcriptions` -// (`coordinator.rs::is_whisper_compatible_provider`)。新增兼容厂商: +// `volcengine` / `bailian` 走自建流式客户端;其余走 OpenAI 兼容 +// `/audio/transcriptions`(`coordinator.rs::is_whisper_compatible_provider`)。 +// 新增兼容厂商: // 1. 在这里加一项 `{ id, nameKey, baseUrl, model }`; // 2. `coordinator.rs::is_whisper_compatible_provider` 加同名 id; // 3. 在 i18n 的 `settings.providers.presets.` 加文案。 const ASR_PRESETS = [ { id: 'volcengine', nameKey: 'asrVolcengine', baseUrl: '', model: '' }, + { id: 'bailian', nameKey: 'asrBailian', baseUrl: 'wss://dashscope.aliyuncs.com/api-ws/v1/inference/', model: 'fun-asr-realtime' }, { id: 'siliconflow', nameKey: 'asrSiliconflow', baseUrl: 'https://api.siliconflow.cn/v1', model: 'FunAudioLLM/SenseVoiceSmall' }, { id: 'zhipu', nameKey: 'asrZhipu', baseUrl: 'https://open.bigmodel.cn/api/paas/v4', model: 'glm-asr-2512' }, { id: 'groq', nameKey: 'asrGroq', baseUrl: 'https://api.groq.com/openai/v1', model: 'whisper-large-v3-turbo' }, @@ -1393,6 +1395,20 @@ function ProvidersSection() { defaultValue={asrPreset?.baseUrl || undefined} /> + {committedAsrProvider === 'bailian' && ( + <> + +
+ {t('settings.providers.bailianVocabularyIdNote')} +
+ + )} setAsrModelRevision(v => v + 1)} /> )}