In [1]:
import datetime
import os
import pandas as pd
import sqlalchemy as sqla
import tempfile
import time
from faster_whisper import WhisperModel
from loguru import logger
from pangres import upsert
from pathlib import Path
from plumbum import local
from sqlalchemy import create_engine, Connection, text
from typing import Protocol
# local imports
import faster_whisper_helpers

# Config

In [2]:
# common configuration
# LANGUAGE doubles as the name of the dataset directory (see BASE_PATH) and as the
# suffix of the `samples_<LANGUAGE>` table read later in this notebook.
LANGUAGE = 'it'
# absolute path of the language directory, resolved against the current working directory
BASE_PATH = Path(LANGUAGE).resolve()
# table holding one transcription record per (clip path, ASR tool)
TABLE_NAME = 'transcriptions'
DB_PATH = 'cv.sqlite3'
# written to every transcription record as `cv_version`
COMMON_VOICES_VERSION = '17.0'
# SQLite engine shared by every cell that reads from or writes to the database
ENGINE = create_engine(f'sqlite:///{DB_PATH}')
# plumbum handle on the `poetry` executable; autosub is launched through `poetry run`
POETRY_CMD = local['poetry']

# tools specific configuration
AUTOSUB_LANGUAGE_ASR = 'it-it'  # autosub needs localized language e.g. `it-it` or `uk-ua`
WHISPER_LANGUAGE_ASR = LANGUAGE
WHISPER_MODEL_NAME = 'large-v3'
# NOTE: the model is instantiated right here, when this cell runs, and the single
# instance is then shared by all whisper transcriptions.
# if "cuda" is installed, use the first line instead of the second, it will be **a lot** faster
# WHISPER_MODEL = WhisperModel(model_size_or_path=WHISPER_MODEL_NAME, device="cuda", compute_type="float16")
WHISPER_MODEL = WhisperModel(model_size_or_path=WHISPER_MODEL_NAME,device='cpu', compute_type="float32")
# intended value for whisper's `condition_on_previous_text` transcription option
WHISPER_CONDITION_ON_PREVIOUS_TEXT = False

# Transcribers

In [3]:
class Transcriber(Protocol):
    asr_tool: str

    @staticmethod
    def transcribe(audio_path: Path | str) -> str:
        pass


class AutosubTranscriber:
    asr_tool = 'autosub'

    @staticmethod
    def transcribe(audio_path: Path | str) -> str:
    
        temp_path = tempfile.mktemp(suffix='.txt', dir='.')
        # autosub renames the output path provided via parameter `-o`
        # slightly to include the language... -_-"
        output_path_autosub = temp_path[:-4] + f'.{AUTOSUB_LANGUAGE_ASR}.txt'

        try:
            POETRY_CMD['run', 'autosub', '-S', AUTOSUB_LANGUAGE_ASR, '-i', str(audio_path), '-o', temp_path]()
            with open(output_path_autosub, 'r') as fh:
                transcription = fh.read().strip()
        finally:
            if os.path.exists(output_path_autosub):
                os.remove(output_path_autosub)
    
        return transcription


class WhisperTranscriber:
    asr_tool = 'whisper-large-v3'

    @staticmethod
    def transcribe(audio_path: Path | str) -> str:
        return faster_whisper_helpers.WhisperTranscriber(model=WHISPER_MODEL,
                                                         media_filepath=audio_path,
                                                         source_language=WHISPER_LANGUAGE_ASR,
                                                         condition_on_previous_text=False,
                                                         initial_prompt=None,
                                                         task='transcribe').transcribe().to_string()

# Helpers

In [4]:
def save_transcription_record(connection: Connection, asr_tool: str, clip_name: str, sentence_id: str,
                              transcription: str | None, duration: float | None) -> None:
    """Insert or update the transcription record of one clip for one ASR tool.

    The record is keyed by (`path`, `asr_tool`); an existing row with the same
    key is updated. The table is created if it does not exist yet. The
    transaction is not committed here.

    Args:
        connection: open database connection used for the upsert.
        asr_tool: name of the tool that produced the transcription.
        clip_name: clip file name, stored in the `path` column.
        sentence_id: identifier of the sentence spoken in the clip.
        transcription: transcribed text, or None when transcription failed.
        duration: time the transcription took, or None when it failed.
    """
    transcription_record = {'updated': datetime.datetime.now(datetime.timezone.utc),
                            'path': clip_name, 'sentence_id': sentence_id, 'transcription': transcription,
                            'asr_tool': asr_tool, 'cv_version': COMMON_VOICES_VERSION,
                            'duration': duration}
    df_transcription = pd.DataFrame([transcription_record]).set_index(['path', 'asr_tool'])
    # write to the configured table so that this function and the
    # "already transcribed" lookup can never drift apart
    upsert(df=df_transcription, if_row_exists='update', con=connection, table_name=TABLE_NAME,
           chunksize=1000, create_table=True)


def clip_already_transcribed(connection: Connection, asr_tool: str, clip_name: str) -> bool:
    """Tell whether a successful transcription of a clip by a tool is already stored.

    Args:
        connection: open database connection used for the lookup.
        asr_tool: name of the ASR tool to look for.
        clip_name: clip file name as stored in the `path` column.

    Returns:
        True when a row with a non-NULL transcription exists for the clip and
        tool; False when there is none or when the table does not exist yet.

    Raises:
        Any exception raised by the query is re-raised when the table exists.
    """
    statement = text(f'''
        SELECT EXISTS (
            SELECT 1 FROM {TABLE_NAME}
            WHERE path = :clip_name
            AND asr_tool = :asr_tool
            AND transcription IS NOT NULL -- failed transcriptions are set to NULL
        )
    ''')
    parameters = {'clip_name': clip_name, 'asr_tool': asr_tool}

    # assume table exists (optimistic approach for saving time) and if we get an error,
    # check if it is due to the table not being there
    try:
        return bool(connection.execute(statement=statement, parameters=parameters).scalar())
    except Exception:
        # look for the same table the query targets, not a hard-coded copy of its name
        if TABLE_NAME in sqla.inspect(connection).get_table_names():
            # the table is there, so the failure has another cause: let it surface
            # (bare `raise` keeps the original traceback intact)
            raise
        return False


def transcribe_and_save(audio_path: Path | str, connection: Connection, transcriber: Transcriber,
                        clip_name: str, sentence_id: str) -> None:
    """Transcribe one clip with one tool and persist the outcome.

    Nothing happens when a successful transcription of the clip by this tool
    is already stored. A failing transcription is logged and stored with both
    the text and the duration set to None. The connection is committed after
    the record has been saved.
    """
    already_done = clip_already_transcribed(connection=connection, asr_tool=transcriber.asr_tool,
                                            clip_name=clip_name)
    if already_done:
        return

    # outcome of a failed attempt; overwritten below when transcription succeeds
    transcription, duration = None, None
    try:
        started_at = time.perf_counter()
        transcription = transcriber.transcribe(audio_path=audio_path)
        duration = time.perf_counter() - started_at
    except Exception:
        logger.exception(f'Failed to transcribe {clip_name}, marking transcription as NULL')
        transcription, duration = None, None

    save_transcription_record(connection=connection,
                              asr_tool=transcriber.asr_tool,
                              clip_name=clip_name,
                              sentence_id=sentence_id,
                              transcription=transcription,
                              duration=duration)
    connection.commit()

# Get table containing sample clip paths

The `samples_<LANGUAGE>` table read below must already exist in the SQLite database; it is generated by the previous notebook.

In [5]:
# load the sampled clips of the configured language, indexed by clip file name
query_clip_paths = text(f"SELECT * FROM samples_{LANGUAGE}")
df_commons_sample = pd.read_sql(
    sql=query_clip_paths,
    con=ENGINE,
    index_col='path',
)
# absolute location of each clip inside the dataset's "clips" directory
df_commons_sample['full_path'] = df_commons_sample.index.map((BASE_PATH / "clips").joinpath)
# preview without the (long) absolute paths
df_commons_sample.drop(columns=['full_path']).head()

Unnamed: 0_level_0,sentence_id,sentence
path,Unnamed: 1_level_1,Unnamed: 2_level_1
common_voice_it_32671878.mp3,34d62b82cce0334fc156e6754e9320e8b4d6d09152eb5c...,Beth accetta.
common_voice_it_19983792.mp3,0234ccc4d8569b1d653055cb5884924d53b6b13692175f...,Il pezzo ha avuto moltissimo successo nel mondo.
common_voice_it_23989089.mp3,190f431e0c7145d70702b4e7b8e582dbfb8adc5adb5edc...,La camera funebre ha volta piatta con i lati r...
common_voice_it_21262721.mp3,1075be9d8faa8faed772694349f4f432b4593f5f5c70da...,Dopo questo album la band si sciolse.
common_voice_it_20003259.mp3,04468068bfee66d8f8075cc871fc2ed5653485a33d56da...,Egli introdusse un nuovo sistema di scommesse ...


# Transcribe sample clips

In [None]:
transcribers = (AutosubTranscriber(), WhisperTranscriber())
nb_samples = len(df_commons_sample)


# a single connection is shared by all selected clips of the common voices dataset
with ENGINE.connect() as connection:
    for clip_number, row in enumerate(df_commons_sample.itertuples(), start=1):

        # every clip is run through each tool; each result is saved on its own
        for transcriber in transcribers:
            transcribe_and_save(
                audio_path=row.full_path,
                connection=connection,
                transcriber=transcriber,
                clip_name=row.Index,
                sentence_id=row.sentence_id,
            )

        # the carriage return keeps the progress counter on one line
        print(f'{clip_number}/{nb_samples} done', end='\r')