In [1]:
import os
from datetime import datetime
import time
import pyaudio
import librosa
import asyncio
import threading
import numpy as np
from transformers import AutoProcessor, AutoTokenizer, MarianMTModel, MarianTokenizer, AutoModelForSpeechSeq2Seq, pipeline
import spacy
from spellchecker import SpellChecker

import torch
import torchaudio
import tensorflow

In [2]:
class AudioChanel():
    def __init__(self):
        #change this according to your model
        self.chunk = 1024
        self.format = pyaudio.paInt16
        self.channels = 1#2
        self.rate = 16000
        self.recorder = pyaudio.PyAudio()
        self.frames = []
        self.seconds = 2
        # Открыть поток для записи
        #1 {'index': 1, 'structVersion': 2, 'name': 'Line 1 (Virtual Audio Cable)', 'hostApi': 0, 'maxInputChannels': 2, 
        #   'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 
        #   'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
        self.input_stream = self.recorder.open(format=self.format,
                                          channels=self.channels,
                                          rate=self.rate,
                                          input=True,
                                          frames_per_buffer=self.chunk,
                                          input_device_index=1) #0,1 - work
        # Открыть поток для воспроизведения
        #5 {'index': 5, 'structVersion': 2, 'name': '... (Realtek)', 'hostApi': 0, 'maxInputChannels': 0, 
        #   'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 
        #   'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
        self.output_stream = self.recorder.open(format=self.format,
                                           channels=self.channels,
                                           rate=self.rate,
                                           output=True,
                                           frames_per_buffer=self.chunk,
                                           input_device_index=5) #5- work
        
        
    def record(self):
        while True:
            data = self.input_stream.read(self.chunk)
            self.frames.append(data)

    def play(self, audio = None):
        output_stream = self.recorder.open(format=self.format,
                                           channels=self.channels,
                                           rate=self.rate,
                                           output=True,
                                           frames_per_buffer=self.chunk,
                                           input_device_index=5) #5- work
        match audio:
            case list():
                for frame in audio:
                    output_stream.write(frame)
            case _:
                output_stream.write(audio.tobytes())

    def play_default(self):
        output_stream = self.recorder.open(format=self.format,
                                           channels=self.channels,
                                           rate=self.rate,
                                           output=True,
                                           frames_per_buffer=self.chunk,
                                           input_device_index=5) #5- work
        with threading.Lock():
            to_play = list(self.frames)
        for frame in to_play:
            output_stream.write(frame)
    
    def get(self):
        with threading.Lock():
            return list(self.frames)
        
        
    def close(self):
        self.input_stream.stop_stream()
        self.input_stream.close()

In [3]:
class Agent():
    def __init__(self):
        self.audio = AudioChanel()
        self.device = torch.device('cuda')

        whisper_name = "openai/whisper-large-v3"
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            whisper_name, torch_dtype=torch.float, low_cpu_mem_usage=True, use_safetensors=True)
        self.model.to(self.device)
        self.processor = AutoProcessor.from_pretrained(whisper_name)
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            return_timestamps=True,
            torch_dtype=torch.float,
            device=self.device,
        )
        self.PASS_VAR = False

        model_name = "Helsinki-NLP/opus-mt-en-ru"
        self.tokenizer_translate = MarianTokenizer.from_pretrained(model_name)
        self.model_translate = MarianMTModel.from_pretrained(model_name)
        self.model_translate.to(self.device)
        
        language = 'ru'
        model_id = 'v4_ru'
        self.sample_rate = 48000
        self.speaker = 'kseniya'
        
        self.model_voice, _ = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                             model='silero_tts',
                                             language=language,
                                             speaker=model_id)
        self.model_voice.to(self.device)
        self.to_play = []

    def send_to_play(self):
        while True:
            time.sleep(1)
            with threading.Lock():
                copy_to_play = list(self.to_play)
            while len(copy_to_play):
                to_play = copy_to_play[0]
                translation = threading.Thread(target=self.audio.play,args=(to_play[0],))
                original = threading.Thread(target=self.audio.play,args=(to_play[1],))
                
                translation.start()
                original.start()
                
                translation.join()
                original.join()
                with threading.Lock():
                    del self.to_play[0]
                    del copy_to_play[0]
    
    async def run(self):
        #start recording
        _ = threading.Thread(target=self.audio.record).start()
        _ = threading.Thread(target=self.send_to_play).start()
        
        await asyncio.sleep(5)
        last_state = ''
        while True:
            '''
            PART 1
            RECORDING
            description: record every 2 seconds of audio from PC
            '''
            # await asyncio.sleep(2) for very fast pc
            time = datetime.now()
            frames = self.audio.get()[:80]##200 - default, 100 - boost speed
            to_model = np.frombuffer(b''.join(frames), dtype=np.int16)
            end_1 = datetime.now()
            if not len(frames):
                continue
            print('end_1: ', end_1-time)
            '''
            PART 2
            CONVERTING AUDIO TO FULL TEXT TEXT
            description: put audio into model and get recognized words
            '''
            whisper_result = self.pipe(to_model)['chunks']#  'chunks': [{'timestamp': (0.0, 19.94), 'text': ' Undertexter av Nicolai Winther'}]}
            if last_state == '':
                last_state = whisper_result[0]['text']
            else:
                if last_state == whisper_result[0]['text']:
                    self.PASS_VAR = True
                last_state = whisper_result[0]['text']
            print(whisper_result)
            end_2 = datetime.now()
            print('end_2: ', end_2-end_1)
            '''
            PART 3
            TRANSLATE SENTENCE INTO RUSSIAN (or any other language)
            description: translate one of the list, because we should be sure about full context
            '''
            if len(whisper_result)<=1 and not self.PASS_VAR:
                continue
            self.PASS_VAR = False
            print(whisper_result)
            
            prefix = 'translate to ru: '
            ### delete part of audio.frames list
            with threading.Lock():
                to_del = int(whisper_result[0]['timestamp'][1]*self.audio.rate/self.audio.chunk)
                play_background = self.audio.frames[:to_del]#to_del+1
                del self.audio.frames[:to_del]
            ###
            src_text = prefix + whisper_result[0]['text']
            
            # translate Englosh to Russian
            input_ids = self.tokenizer_translate(src_text, return_tensors="pt")
            
            generated_tokens = self.model_translate.generate(**input_ids.to(self.device))
            
            result = self.tokenizer_translate.batch_decode(generated_tokens, skip_special_tokens=True)
            end_3 = datetime.now()
            print('end_3: ', end_3-end_2)
            '''
            PART 4
            VOICEOVER OF TEXT AND PLAY
            description: voice translated text
            '''
            print(result[0])
            text_to_translate = str(result[0])
            while 'Перевод на ru:' in text_to_translate:
                text_to_translate.replace('Перевод на ru: ','')
            if not len(text_to_translate):
                continue

            #errors just happens sometimes, don't know why
            try:
                audio = self.model_voice.apply_tts(text=text_to_translate,
                            speaker=self.speaker,
                            sample_rate=self.sample_rate)
            except:
                continue
            
            audio_numpy = audio.cpu().numpy()
            audio_resampled = librosa.resample(audio_numpy, orig_sr=self.sample_rate, 
                                               target_sr=self.audio.rate, res_typestr='fft')
            audio_int16 = (audio_resampled * 32767).astype(np.int16)
            end_4 = datetime.now()
            print('end_4: ', end_4-end_3)
            # self.audio.play(audio_int16)
            with threading.Lock():
                self.to_play.append((audio_int16,frames[:to_del]))
            # _ = threading.Thread(target=self.audio.play, args=(play_background,)).start()
            
a = Agent()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using cache found in C:\Users\User/.cache\torch\hub\snakers4_silero-models_master


In [None]:
await a.run()

end_1:  0:00:00


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


[{'timestamp': (0.0, 2.26), 'text': ' We have to do, you know, what are you looking for?'}, {'timestamp': (2.58, 5.12), 'text': ' And being the Socratic professor that I am, well, what?'}]
end_2:  0:00:41.589966
[{'timestamp': (0.0, 2.26), 'text': ' We have to do, you know, what are you looking for?'}, {'timestamp': (2.58, 5.12), 'text': ' And being the Socratic professor that I am, well, what?'}]
end_3:  0:00:01.549622
Переведи на ru: Мы должны сделать, знаешь, что ты ищешь?
end_4:  0:00:08.154352
end_1:  0:00:00
[{'timestamp': (0.0, 2.22), 'text': ' and being the Socratic professor that I am,'}, {'timestamp': (2.22, 4.06), 'text': ' I was like, well, what does your son or daughter do?'}, {'timestamp': (4.12, 5.06), 'text': ' What do they do really well?'}]
end_2:  0:00:41.200716
[{'timestamp': (0.0, 2.22), 'text': ' and being the Socratic professor that I am,'}, {'timestamp': (2.22, 4.06), 'text': ' I was like, well, what does your son or daughter do?'}, {'timestamp': (4.12, 5.06), '

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


end_4:  0:00:00.334034
end_1:  0:00:00
[{'timestamp': (0.0, 2.12), 'text': ' but to be quite honest with you'}, {'timestamp': (2.12, 4.28), 'text': " those are the last things I'm looking for"}, {'timestamp': (4.28, 5.14), 'text': ' the most important'}]
end_2:  0:00:33.168819
[{'timestamp': (0.0, 2.12), 'text': ' but to be quite honest with you'}, {'timestamp': (2.12, 4.28), 'text': " those are the last things I'm looking for"}, {'timestamp': (4.28, 5.14), 'text': ' the most important'}]
end_3:  0:00:01.188606
Перевод на ru: но быть с вами честным
end_4:  0:00:00.409326
end_1:  0:00:00.000750
[{'timestamp': (0.0, 2.16), 'text': " Those are the last things I'm looking for."}, {'timestamp': (2.42, 3.46), 'text': ' The most important thing?'}, {'timestamp': (3.46, 5.46), 'text': ' Self-love.'}]
end_2:  0:00:32.834370
[{'timestamp': (0.0, 2.16), 'text': " Those are the last things I'm looking for."}, {'timestamp': (2.42, 3.46), 'text': ' The most important thing?'}, {'timestamp': (3.46, 5