In [38]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf  # Although imported, it's not directly used in this snippet
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import DataCollatorWithPadding
import torch
from datasets import Dataset
import pandas as pd  # Assuming your data is in a pandas DataFrame

In [39]:
DATA_DIR_1 = 'voiceassistant_v1.csv'

# Load dataset. read_csv already returns a DataFrame, so no extra wrapping is needed.
voiceassistant_df = pd.read_csv(DATA_DIR_1)

# Fail early with a readable message if the columns used by the later cells are absent.
_missing_columns = {"transcription", "intent"} - set(voiceassistant_df.columns)
assert not _missing_columns, f"{DATA_DIR_1} is missing required columns: {sorted(_missing_columns)}"


In [40]:
# Map each intent string to an integer class id; the fitted encoder is kept
# so predictions can be turned back into intent names later.
le = LabelEncoder()
le.fit(voiceassistant_df['intent'])
voiceassistant_df['label'] = le.transform(voiceassistant_df['intent'])

In [41]:
# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def _tokenize_batch(batch):
    """Tokenize the "transcription" column of one mapped batch, with truncation enabled."""
    return tokenizer(batch["transcription"], truncation=True)


# Keep only the text and label columns, then tokenize in batches.
dataset = Dataset.from_pandas(voiceassistant_df[["transcription", "label"]]).map(
    _tokenize_batch,
    batched=True,
)

Map: 100%|██████████| 16280/16280 [00:02<00:00, 6322.51 examples/s]


In [42]:
# Train/validation split (80/20).
# A fixed seed keeps the held-out set identical across kernel restarts, so
# metrics from different runs are comparable.
SPLIT_SEED = 42
train_test = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_ds, val_ds = train_test['train'], train_test['test']

In [43]:
# Load the pretrained encoder with a fresh classification head sized to the
# number of intents found by the label encoder.
# The id <-> label mapping is stored in the model config so that a checkpoint
# written with save_pretrained carries the intent names with it.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; ``logits`` has one row of class
            scores per example.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    # Take the arg-max class per row and hand scikit-learn a plain NumPy array.
    preds = torch.argmax(torch.tensor(logits), dim=1).numpy()
    # zero_division=0 scores classes that were never predicted as 0 without
    # emitting a warning for each of them.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [45]:
# Training args
# Mixed-precision (fp16) training needs a CUDA device, so only request it when
# one is available; on a CPU-only machine the run falls back to full precision.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=use_fp16,
)

# NOTE(review): the stray "trainer = Trainer(" line in this cell's output looks
# like a warning raised at construction time — possibly about one of the keyword
# arguments below; confirm against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    # Pads each batch to the length of its longest example.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.3708
1000,0.0055
1500,0.0013
2000,0.003
2500,0.0005
3000,0.005
3500,0.0003
4000,0.0049
4500,0.0002


TrainOutput(global_step=4884, training_loss=0.0400867374105887, metrics={'train_runtime': 2982.2047, 'train_samples_per_second': 13.102, 'train_steps_per_second': 1.638, 'total_flos': 153277039754496.0, 'train_loss': 0.0400867374105887, 'epoch': 3.0})

In [46]:
# Write the fine-tuned weights and the tokenizer files into the same folder so
# the pair can be reloaded together.
model.save_pretrained(save_directory="saved_model")
tokenizer.save_pretrained(save_directory="saved_model")

('saved_model\\tokenizer_config.json',
 'saved_model\\special_tokens_map.json',
 'saved_model\\vocab.txt',
 'saved_model\\added_tokens.json')

In [47]:
def predict_intent(text):
    """Classify one utterance and return its intent label.

    Args:
        text: The utterance to classify.

    Returns:
        The intent name obtained by decoding the arg-max class id with ``le``.
    """
    # Right after training the model is still in train mode; switch to eval so
    # dropout is disabled and repeated calls give the same answer.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Put the input tensors on the same device as the model weights.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    predicted_intent = le.inverse_transform([predicted_class_id])[0]
    return predicted_intent


In [48]:
# Hand-written sample utterances used for a quick manual check of the classifier.
new_transcriptions = [
    "G-Assist, cancel trip hok baru masuk tadi.",
    "Buleh check earning bulan ni?",
    "Nak berhenti kejap, isi minyak jap",
    "Jawab penumpang saya akan tiba 5 minit nanti",
    "Saya nak rehat",
    "Okey, ammik trip ni",
]


In [49]:
# Print "<utterance> -> <predicted intent>" for every sample; map() is lazy, so
# each prediction is still computed just before its line is printed.
for transcript, intent in zip(new_transcriptions, map(predict_intent, new_transcriptions)):
    print(f"{transcript} -> {intent}")

G-Assist, cancel trip hok baru masuk tadi. -> unknown
Buleh check earning bulan ni? -> unknown
Nak berhenti kejap, isi minyak jap -> Stop request
Jawab penumpang saya akan tiba 5 minit nanti -> unknown
Saya nak rehat -> unknown
Okey, ammik trip ni -> unknown


In [50]:
# Rebind `model` to the checkpoint stored in the "saved_model" folder.
model = DistilBertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="saved_model",
)

In [51]:
import sys
if sys.version_info >= (3, 13) and "cgi" not in sys.modules:
    import importlib.util
    import types
    # On Python 3.13+ register an empty placeholder module under the name "cgi"
    # so that an httpx dependency importing `cgi` does not fail. Only do so when
    # no real `cgi` module (for example an installed backport) can be found, so
    # a working implementation is never shadowed by the empty stub.
    if importlib.util.find_spec("cgi") is None:
        sys.modules["cgi"] = types.ModuleType("cgi")

import sounddevice as sd
from scipy.io.wavfile import write
import numpy as np
import whisper
import os
import time
import pyttsx3
import requests
import re
import ffmpeg
from googletrans import Translator  # 需要安装 googletrans==4.0.0-rc1

In [52]:
# Make Homebrew's ffmpeg discoverable (macOS users).
# The directory is prepended only when it actually exists and is not already on
# PATH, so the cell is a no-op on other systems and safe to re-run.
HOMEBREW_BIN = "/opt/homebrew/bin"
_current_path = os.environ.get("PATH", "")
if os.path.isdir(HOMEBREW_BIN) and HOMEBREW_BIN not in _current_path.split(os.pathsep):
    os.environ["PATH"] = HOMEBREW_BIN + os.pathsep + _current_path

In [53]:
# Initialise the TTS engine and select an English voice.
engine = pyttsx3.init()
engine.setProperty('rate', 150)
engine.setProperty('volume', 1.0)
# List every installed voice (for debugging).
voices = engine.getProperty('voices')
for voice in voices:
    print(voice.id, voice.name, voice.languages)

# Prefer the macOS "Alex" voice when it is installed; otherwise fall back to the
# first voice whose id or name looks English. Voice ids are platform specific
# (the listing printed above shows Windows registry-style ids), so a hard-coded
# macOS id cannot be relied on everywhere.
PREFERRED_VOICE_ID = "com.apple.speech.synthesis.voice.Alex"


def _looks_english(candidate):
    """Return True when the voice's id or name mentions English."""
    tag = f"{candidate.id} {candidate.name}".lower()
    return any(marker in tag for marker in ("english", "en-us", "en_us", "en-gb", "en_gb"))


_english_voice = next((v for v in voices if v.id == PREFERRED_VOICE_ID), None)
if _english_voice is None:
    _english_voice = next((v for v in voices if _looks_english(v)), None)

if _english_voice is not None:
    engine.setProperty('voice', _english_voice.id)
else:
    print("No English voice found; keeping the default TTS voice.")

HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0 Microsoft David Desktop - English (United States) []
HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0 Microsoft Zira Desktop - English (United States) []
HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_ZH-CN_HUIHUI_11.0 Microsoft Huihui Desktop - Chinese (Simplified) []


In [54]:
# OpenWeather API key. Read from the OPENWEATHER_API_KEY environment variable so
# the secret is not committed with the notebook; set it before starting the kernel.
# An empty value means "not configured".
OPENWEATHER_API_KEY = os.environ.get("OPENWEATHER_API_KEY", "")

# Cities that can be recognised in a transcript (extend as needed).
SUPPORTED_CITIES = [
    "Kuala Lumpur", "Penang", "Johor Bahru", "Ipoh", "Kuantan", "Melaka", "Seremban",
    "Singapore", "Bangkok", "Jakarta", "Taipei", "Hong Kong", "Shanghai", "Beijing",
    "Tokyo", "New York", "London"
]

In [55]:
# Load the Whisper speech-to-text model ("base" size), announcing start and finish.
print("Loading Whisper model...")
whisper_model = whisper.load_model(name="base")
print("Model loaded.")

Loading Whisper model...
Model loaded.


In [56]:
def speak(text):
    """Echo ``text`` to stdout, then read it aloud and wait until playback ends."""
    print("Assistant says: {}".format(text))
    engine.say(text)
    engine.runAndWait()

In [57]:
def record_audio(filename='output.wav', duration=5, sample_rate=44100):
    """Record mono audio from the default input device and save it as a 16-bit WAV.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.
        sample_rate: Sampling rate in Hz.
    """
    print("Recording...")
    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float64')
    sd.wait()  # block until the recording has finished
    print("Recording finished.")
    # Peak-normalise to the int16 range. A completely silent (or empty) buffer
    # has a zero peak; dividing by it would yield NaN samples, so write silence.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = (audio / peak * 32767).astype(np.int16)
    else:
        audio = np.zeros(audio.shape, dtype=np.int16)
    write(filename, sample_rate, audio)
    print(f"Audio saved as {filename}")

In [58]:
def translate_to_english(text):
    """Translate ``text`` to English, falling back to the input on any error.

    Best effort: when the translation raises, the error is printed and the
    original text is returned unchanged.
    """
    try:
        return Translator().translate(text, dest='en').text
    except Exception as err:
        print("Translation error:", err)
    return text

In [59]:
def transcribe(audio_path):
    """Transcribe an audio file with Whisper and return the English translation.

    Prints the detected language, the raw transcription and the translated text
    along the way.
    """
    print("Transcribing...")
    whisper_result = whisper_model.transcribe(audio_path)
    print("Detected language: {}".format(whisper_result['language']))
    source_text = whisper_result['text']
    print("Original transcription:", source_text)
    english_text = translate_to_english(source_text)
    print("Translated transcription:", english_text)
    return english_text

In [60]:
# COMMAND_KEYWORDS = {
#     "navigate": ["navigate", "go to", "directions", "导航", "去", "navigasi", "pergi ke"],
#     "job": ["job", "accept job", "start work", "take order", "接单", "开始", "mula kerja", "terima pesanan"],
#     "music": ["music", "play", "play music", "播放音乐", "音乐", "main muzik"],
#     "stop": ["stop", "exit", "shutdown", "停止", "关掉", "berhenti"],
#     "settings": ["settings", "open settings", "设置", "tetapan"],
#     "weather": ["weather", "what's the weather", "check weather", "天气", "天氣", "cuaca"],
#     "call": ["call", "call someone", "打电话", "hubungi"]
# }

# def interpret_command(text):
#     text = text.lower()
#     for command, keywords in COMMAND_KEYWORDS.items():
#         if any(k in text for k in keywords):
#             return command
#     return "unknown"

In [61]:
def extract_city(text):
    """Return the first supported city mentioned in ``text`` (case-insensitive).

    Cities are checked in the order they appear in ``SUPPORTED_CITIES``; when
    none occurs in the text, "Kuala Lumpur" is returned.
    """
    mentioned = (city for city in SUPPORTED_CITIES if city.lower() in text.lower())
    return next(mentioned, "Kuala Lumpur")

In [62]:
def get_weather(city="Kuala Lumpur"):
    """Fetch the current weather for ``city`` and return a sentence to speak.

    Args:
        city: City name sent to the OpenWeather "current weather" endpoint.

    Returns:
        A human-readable message: the weather summary on success, or an
        apology / error message when the request fails. Never raises.
    """
    try:
        print(f"[Weather API] Requesting weather for: {city}")
        response = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            # Let requests build and URL-encode the query string; the key comes
            # from the module-level setting instead of a literal repeated here.
            params={"q": city, "appid": OPENWEATHER_API_KEY, "units": "metric"},
            # Without a timeout a stalled connection would block the assistant forever.
            timeout=10,
        )
        data = response.json()
        if response.status_code == 200:
            temp = data['main']['temp']
            desc = data['weather'][0]['description']
            msg = f"The current weather in {city} is {desc} with {temp} degrees Celsius."
            return msg
        elif response.status_code == 401:
            print("[Weather API Error]", data)
            return "Invalid API key for weather service. Please check your API key."
        else:
            print("[Weather API Error]", data)
            return "Sorry, I couldn't fetch the weather right now."
    except Exception as e:
        # Network failures, timeouts and malformed payloads all end up here.
        print("[Weather Exception]", e)
        return "There was an error retrieving the weather."

In [None]:
def respond_to_command(command, full_text):
    """Speak the reply for a predicted intent and report whether to keep listening.

    Args:
        command: Intent label. Matching ignores case and surrounding whitespace,
            so "Stop request" and "STOP REQUEST" are treated alike.
        full_text: The transcribed utterance; used only to look for a city name
            when the intent is a weather check.

    Returns:
        False when the intent is a stop request, True otherwise.
    """
    # Normalise the label so mixed-case classifier output matches the
    # upper-case keys below.
    intent_key = str(command).strip().upper()
    if intent_key == "WEATHER CHECK":
        city = extract_city(full_text)
        msg = get_weather(city)
    else:
        RESPONSES = {
            "NAVIGATE CHECK": "Alright, starting the fastest route to your passenger now.",
            "ACCEPT BOOKING": "Great! You've accepted the job. Let's get going.",
            "REJECT BOOKING": "No worries. I've declined the request for you.",
            "TRAFFIC CHECK": "Let me check... Looks like the traffic is smooth ahead.",
            "CHECK EARNING": "You've earned RM100 so far today. Keep it up!",
            "STOP REQUEST": "Alright, ending the session now. Take care!",
            "UNKNOWN": "Sorry, I didn’t catch that. Could you please repeat your command?",
        }
        # Any label without a canned reply falls back to the "UNKNOWN" message
        # instead of raising KeyError.
        msg = RESPONSES.get(intent_key, RESPONSES["UNKNOWN"])
    print(f"Assistant: {msg}")
    speak(msg)
    return intent_key != "STOP REQUEST"

In [68]:
def transcribe_and_predict(audio_path):
    """Transcribe an audio file with Whisper and classify the transcript's intent.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(transcript, predicted_intent)`` tuple; the transcript is the raw
        Whisper text (no translation step is applied).
    """
    print("Transcribing...")
    result = whisper_model.transcribe(audio_path)
    print("Transript:", result['text'])

    # Reuse the shared classifier helper rather than repeating the
    # tokenizer/model/label-decoding steps here, so inference logic lives in
    # exactly one place.
    predicted_intent = predict_intent(result['text'])
    print("Predicted intent:", predicted_intent)
    return result['text'], predicted_intent

In [69]:
def main_loop():
    """Run the record -> transcribe -> classify -> respond cycle until told to stop.

    Each round records five seconds of audio into "output.wav". The loop ends
    as soon as ``respond_to_command`` returns a falsy value; otherwise it pauses
    for one second and records again.
    """
    wav_path = "output.wav"
    print("Multilingual Voice Assistant Started — Say anything in any language!")
    speak("Hello! Please speak your command.")
    while True:
        record_audio(filename=wav_path, duration=5)
        if not os.path.exists(wav_path):
            print("Audio file not found.")
        else:
            text, command = transcribe_and_predict(wav_path)
            keep_going = respond_to_command(command, text)
            if not keep_going:
                return
        time.sleep(1)

In [71]:
# Entry point: start the assistant loop when this code is executed as the main
# module (as opposed to being imported).
if __name__ == "__main__":
    main_loop()

Multilingual Voice Assistant Started — Say anything in any language!
Assistant says: Hello! Please speak your command.
Recording...
Recording finished.
Audio saved as output.wav
Transcribing...




Transript: cha
Predicted intent: Navigate check
Assistant: Alright, starting the fastest route to your passenger now.
Assistant says: Alright, starting the fastest route to your passenger now.
Recording...
Recording finished.
Audio saved as output.wav
Transcribing...




Transript:  Well it rained later.
Predicted intent: unknown
Assistant: Sorry, I didn’t catch that. Could you please repeat your command?
Assistant says: Sorry, I didn’t catch that. Could you please repeat your command?
Recording...
Recording finished.
Audio saved as output.wav
Transcribing...




Transript:  🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷 🤷
Predicted intent: Stop request
Assistant: Alright, ending the session now. Take care!
Assistant says: Alright, ending the session now. Take care!
