In [1]:
import os
import math
import librosa
import numpy as np
import tensorflow.keras as keras
from pvrecorder import PvRecorder
import struct
import wave
from pydub import AudioSegment
from pydub.silence import split_on_silence

In [2]:
# Tell pydub which executables to use. The names are relative, so they are
# resolved by the operating system when pydub launches them.
AudioSegment.converter = AudioSegment.ffmpeg = 'ffmpeg.exe'
AudioSegment.ffprobe = 'ffprobe.exe'

In [3]:
# Maps a class index to the Vietnamese command word it stands for
# (0 = up, 1 = right, 2 = left, 3 = down).
word_dict = dict(enumerate(("lên", "phải", "trái", "xuống")))

In [4]:
def split_audio(wav_file, base_dir=r'G:\Python\Lenh_dieu_khien\Audio_Mic\Audio_Split'):
    """Split a WAV recording on silence and write each piece as ``chunk<i>.wav``.

    Args:
        wav_file: Name of the WAV file inside ``base_dir``. A leading path
            separator is tolerated (existing callers pass one) and ignored.
        base_dir: Directory that holds the input file and receives the chunk
            files. Defaults to the project's Audio_Split folder.

    Returns:
        int: Number of chunk files written (0 when no non-silent audio is found).
    """
    # Strip any leading separator so callers may pass either "name.wav" or a
    # name prefixed with a backslash/slash; os.path.join supplies the separator.
    source_path = os.path.join(base_dir, wav_file.lstrip('\\/'))

    sound_file = AudioSegment.from_wav(source_path)
    # Values are passed straight to pydub: min_silence_len (pydub documents it
    # in milliseconds) and silence_thresh (pydub documents it in dBFS).
    audio_chunks = split_on_silence(sound_file, min_silence_len=100, silence_thresh=-40)

    for i, chunk in enumerate(audio_chunks):
        # os.path.join replaces the former "\chunk{0}.wav" literal, whose "\c"
        # is an invalid escape sequence.
        out_file = os.path.join(base_dir, "chunk{0}.wav".format(i))
        chunk.export(out_file, format="wav")

    return len(audio_chunks)

In [5]:
def edit_chunk(length, directory='Audio_Split'):
    """Pad or trim each ``chunk<i>.wav`` to exactly one second.

    For every index in ``range(length)`` the chunk is read from ``directory``,
    one second of silence is appended, the result is cut to its first
    1000 ms, and it is written back as ``edit_chunk<i>.wav`` in the same
    directory.

    Args:
        length: Number of chunk files to process (indices 0 .. length-1).
        directory: Folder containing the chunk files. The default is a
            relative path, so it is resolved against the current working
            directory.

    Returns:
        None
    """
    clip_ms = 1000

    # Appending a full clip of silence guarantees the slice below always has
    # enough audio to reach clip_ms, however short the chunk is.
    padding = AudioSegment.silent(duration=clip_ms)
    for i in range(length):
        chunk = AudioSegment.from_wav(os.path.join(directory, 'chunk' + str(i) + '.wav'))
        fixed_length = (chunk + padding)[:clip_ms]
        # format must be explicit: pydub's export() defaults to "mp3", which
        # would put MP3 data inside a file named .wav.
        fixed_length.export(os.path.join(directory, 'edit_chunk' + str(i) + '.wav'), format="wav")

In [6]:
def process_input(audio_file):
    """Load an audio file and compute its MFCC feature matrix.

    The file is loaded with librosa at 16 kHz and 13 MFCCs are computed with
    an FFT size of 2048 and a hop length of 512.

    Args:
        audio_file: Path of the audio file to load.

    Returns:
        numpy.ndarray: The transposed MFCC matrix; librosa documents its
        output as (n_mfcc, frames), so this is (frames, n_mfcc).
    """
    SAMPLE_RATE = 16000
    NUM_MFCC = 13
    N_FFT = 2048
    HOP_LENGTH = 512

    signal, sample_rate = librosa.load(audio_file, sr=SAMPLE_RATE)

    # y= and sr= must be passed by keyword: they are keyword-only in
    # librosa >= 0.10, and keywords also work on older releases.
    mfcc = librosa.feature.mfcc(
        y=signal,
        sr=sample_rate,
        n_mfcc=NUM_MFCC,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
    )

    return mfcc.T

In [7]:
def delete_files_in_directory(directory_path):
    """Remove every regular file located directly inside ``directory_path``.

    Sub-directories and their contents are left untouched. A status line is
    printed in both outcomes; an OSError raised while listing or removing is
    reported with a generic message instead of being propagated.

    Args:
        directory_path: Directory whose files should be removed.

    Returns:
        None
    """
    try:
        candidates = (
            os.path.join(directory_path, name)
            for name in os.listdir(directory_path)
        )
        for candidate in candidates:
            if not os.path.isfile(candidate):
                continue
            os.remove(candidate)
        print("All files deleted successfully.")
    except OSError:
        print("Error occurred while deleting files.")

In [8]:
def speech_to_text(length):
    """Classify each prepared chunk and return the recognised words.

    For every index in ``range(length)`` the file ``edit_chunk<i>.wav`` is
    converted to MFCC features with ``process_input``, passed to
    ``reconstructed_model.predict``, and the index of the highest score is
    looked up in ``word_dict``.

    Args:
        length: Number of ``edit_chunk<i>.wav`` files to classify.

    Returns:
        str: The recognised words in chunk order, each followed by a single
        space (so a non-empty result ends with a space); '' when ``length``
        is 0.
    """
    chunk_prefix = r'G:\Python\Lenh_dieu_khien\Audio_Mic\Audio_Split\edit_chunk'
    words = []

    for i in range(length):
        new_input_mfcc = process_input(chunk_prefix + str(i) + '.wav')

        # Add a leading and a trailing singleton axis (presumably the batch
        # and channel axes the model expects -- confirm against the model).
        X_to_predict = new_input_mfcc[np.newaxis, ..., np.newaxis]

        prediction = reconstructed_model.predict(X_to_predict)
        # argmax over axis 1 yields a 1-element array for this single input;
        # index it explicitly because int() on a non-0-d array is deprecated
        # in recent NumPy releases.
        predicted_index = int(np.argmax(prediction, axis=1)[0])

        words.append(word_dict[predicted_index])

    # Keep the historical format: every word is followed by one space.
    return ''.join(word + ' ' for word in words)

In [9]:
# Directory that contains the saved speech-recognition model.
path = r'G:\Python\Lenh_dieu_khien\model_robot'
# Build the path with os.path.join instead of the literal "\Speech_to_Text.tf":
# "\S" is an invalid escape sequence, which newer Python versions warn about.
reconstructed_model = keras.models.load_model(os.path.join(path, "Speech_to_Text.tf"))

In [None]:
import tkinter as tk
import threading
from denoise import AudioDeNoise

from socket import socket, AF_INET, SOCK_DGRAM

# Folder used for the recording, its denoised copy and the per-word chunks.
file_path = r'G:\Python\Lenh_dieu_khien\Audio_Mic\Audio_Split'

# Address of the UDP receiver that recognised commands are sent to.
SERVER_IP, PORT_NUMBER = '192.168.1.102', 5000
SIZE = 1024
print("Test client sending packets to IP {0}, via port {1}\n".format(SERVER_IP, PORT_NUMBER))

# Datagram (UDP) socket shared by the recorder for sending commands.
mySocket = socket(AF_INET, SOCK_DGRAM)

# Last direction remembered between recordings; None until one is set.
huong_di_chuyen = None

class VoiceRecorder:
    """One-button Tk window that records speech and sends recognised commands.

    Clicking the microphone button starts a background recording thread;
    clicking it again stops the recording. The captured audio is written to
    ``file_path``, denoised, split into words, classified, and a single
    recognised command word is sent over ``mySocket`` to
    ``(SERVER_IP, PORT_NUMBER)`` and shown in the text box.
    """

    def __init__(self):
        """Build the widgets and enter the Tk main loop.

        The constructor does not return until the window is closed, because
        ``mainloop()`` is called here.
        """
        self.root = tk.Tk()
        self.root.resizable(False, False)
        self.button = tk.Button(text = "🎤", font = ("Arial", 120, "bold"), command = self.click_handler)
        self.button.pack()
        self.output = tk.Text(self.root, height = 5, width = 41,)
        self.output.pack()
        self.recording = False
        self.root.mainloop()

    def click_handler(self):
        """Toggle recording: stop the running capture or start a new one."""
        if self.recording:
            # The capture loop in record() polls this flag and exits.
            self.recording = False
            self.button.config(fg = "black")
        else:
            self.recording = True
            self.button.config(fg = "red")
            threading.Thread(target = self.record).start()

    def record(self):
        """Capture audio until recording is switched off, then recognise and send it.

        Runs on the worker thread started by ``click_handler``. Temporary
        files written to ``file_path`` are removed afterwards, even when a
        processing step raises.
        """
        # The remembered direction lives at module level; without this
        # declaration the assignment below would make the name local and the
        # later read would raise UnboundLocalError for 'trái'/'phải'.
        global huong_di_chuyen

        recorder = PvRecorder(device_index=-1, frame_length=512)
        audio = []
        try:
            recorder.start()
            while self.recording:
                audio.extend(recorder.read())
            recorder.stop()
        finally:
            # Always release the recorder, including when start()/read() fails.
            recorder.delete()

        raw_wav = os.path.join(file_path, 'sound.wav')
        clean_wav = os.path.join(file_path, 'sound_clear.wav')

        try:
            with wave.open(raw_wav, 'w') as f:
                # (nchannels, sampwidth, framerate, nframes, comptype, compname)
                f.setparams((1, 2, 16000, 512, "NONE", "NONE"))
                f.writeframes(struct.pack("h" * len(audio), *audio))

            audioDenoiser = AudioDeNoise(inputFile=raw_wav)
            audioDenoiser.deNoise(outputFile=clean_wav)

            # split_audio is given the name with a leading backslash, as
            # before; the raw string avoids the invalid "\s" escape sequence.
            length = split_audio(r'\sound_clear.wav')
            edit_chunk(length)
            # speech_to_text ends its result with a space; strip it so the
            # comparisons against the bare command words can match.
            stt = speech_to_text(length).strip()
            print(stt)

            # NOTE(review): the original tested `stt in command`, but no
            # `command` is defined in the visible notebook cells. The words
            # the recogniser can emit are the values of word_dict, so those
            # are used as the accepted single-word commands -- confirm.
            if stt in word_dict.values():
                if stt in ('lên', 'xuống'):
                    # Remember the last up/down command for later turns.
                    huong_di_chuyen = stt

                mySocket.sendto(stt.encode('utf-8'), (SERVER_IP, PORT_NUMBER))
                if stt in ('trái', 'phải') and huong_di_chuyen is not None:
                    # A left/right command is followed by the remembered direction.
                    mySocket.sendto(huong_di_chuyen.encode('utf-8'), (SERVER_IP, PORT_NUMBER))

                self.output.insert("1.0", stt + '\n')
        finally:
            # Module-level helper, not a method of this class.
            delete_files_in_directory(file_path)
        
# Launch the recorder window. The constructor enters the Tk main loop itself,
# so this statement blocks until the window is closed.
VoiceRecorder()