In [None]:
import os
import sys
from pathlib import Path

# Put the directory two levels above the current working directory on the
# import path, so the project modules imported in the next cell can be found.
working_dir = Path.cwd()
src_path = working_dir.parent.parent.absolute()
sys.path.append(str(src_path))

In [None]:
import io
from kfe.persistence.file_metadata_repository import FileMetadataRepository
from kfe.persistence.model import FileMetadata
from kfe.dependencies import app_db
from kfe.persistence.db import Database
from kfe.persistence.directory_repository import DirectoryRepository
from kfe.features.transcriber import Transcriber

In [None]:
# directory name that you registered in app
directory_name = 'TODO'

# texts on which finetuning should be performed
files_with_manually_fixed_transcripts: list[FileMetadata] = []

await app_db.init_db()
async with app_db.session() as session:
    root_dir = (await DirectoryRepository(session).get_by_name(directory_name)).path

# File names to leave out because their quality is insufficient
# (finetuning data must be correct). Optional: may be left empty.
skip_files = {
    'output2.mp4',
    'output_video.mp4',
    '331993281_6075996049127954_7515833594348442525_n.mp4',
    '334852651_6018118081603092_8756442741067117699_n.mp4',
    '342090950_6790032441012002_7450332501589782825_n.mp4',
    '350668782_6515570488505362_8154329955946525513_n.mp4',
    '353470587_6537274892962573_1572881441717822125_n.mp4',
    '355342796_24019529530965742_5532146776141433994_n.mp4',
}

files_db = Database(root_dir, log_sql=False)
await files_db.init_db()
async with files_db.session() as session:
    repo = FileMetadataRepository(session)
    files = await repo.load_all_files()
    for f in files:
        if str(f.name) in skip_files:
            continue
        if f.is_transcript_analyzed and f.transcript is not None and f.transcript != '' and f.is_transcript_fixed:
            files_with_manually_fixed_transcripts.append(f)

In [None]:
# Report how many files were collected for finetuning.
available_file_count = len(files_with_manually_fixed_transcripts)
print('number of available files for finetuning:', available_file_count)

In [None]:
# Output directory for the preprocessed audio files and transcriptions.txt.
preprocessed_data_dir = Path('./speech_finetuning_data')
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raised
# FileExistsError on every run after the first one. Files left over from an
# earlier run are not removed.
preprocessed_data_dir.mkdir(exist_ok=True)
print(f'finetuning audio files will be saved at: {preprocessed_data_dir.absolute()}')

In [None]:
model_sampling_rate = 16000 # sampling rate on which model was trained, in case of models used in app it's 16kHz
num_files = 0

transcriber = Transcriber(None)
with open(preprocessed_data_dir.joinpath('transcriptions.txt'), 'w') as f:
    for file in files_with_manually_fixed_transcripts:
        path = root_dir.joinpath(file.name)
        try:
            parts = []
            async for part in transcriber._get_preprocessed_audio_file(path, sampling_rate=model_sampling_rate):
                parts.append(part)
            # skip longer files since we don't know how to split transcription text
            if len(parts) > 1:
                continue  
            audio_data: io.BytesIO = parts[0]
            target_path = preprocessed_data_dir.joinpath(file.name + '.wav')
            with open(target_path, 'wb') as target:
                audio_data.seek(0)
                target.write(audio_data.getvalue())
            transcription = str(file.transcript).replace("\n", " ")
            f.write(f'{file.name}.wav: {transcription}' + '\n')
            num_files += 1
        except Exception as e:
            print(e)

print('number of files that will be used for finetuning:', num_files)