In [None]:
# List all audio files
import glob

# Collect every .wav file below ./data/audio_files (searching sub-folders too).
# The matches are sorted because glob returns them in arbitrary,
# filesystem-dependent order, and the trimming step picks the first path that
# contains a recording name: a stable order keeps that choice reproducible.
audio_files = sorted(glob.glob("./data/audio_files/**/*.wav", recursive=True))
print(audio_files)

In [None]:
# Read in excel file with annotations
import pandas as pd

# Annotation workbook; the sheets at positions 0-3 are loaded into one
# dataframe each, in sheet order.
filepath = './data/Recordings.xlsx'
df_R1, df_R1_L1, df_R5, df_R5_L1 = (
    pd.read_excel(filepath, sheet_name=sheet_index) for sheet_index in range(4)
)

In [None]:
# Define functions for trimming audio files
import librosa
import soundfile as sf

def get_row_info(df, task):
    """
    Looks up the recording name and the annotated time window of one task

    Parameters
    ----------
    df : pandas dataframe or single row of one
        Recording information, indexed by 'recording', '<task>_start'
        and '<task>_end' (trim_files passes one row at a time)
    task : string
        Task name, used as the prefix of the start/end keys

    Returns
    -------
    file_name : string
        Value stored under 'recording'
    offset : float
        Annotated start of the task
    duration : float
        Annotated end minus annotated start of the task
    """

    start_key = task + '_start'
    end_key = task + '_end'

    file_name = df['recording']
    task_start = df[start_key]
    task_length = df[end_key] - task_start

    return file_name, task_start, task_length

def trim_files(df, audio_files, task, trimmed_files, offsets, checklist):
    """
    Trims audio files based on annotated start and end times of a given task

    For every row of `df` the matching audio file is looked up, the annotated
    segment is loaded and written to
    './trimmed_audio/<group>/<task>/<recording>_<task>.wav', where <group> is
    the name of the folder that contains the source audio file. The output
    folder is created when it does not exist yet.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe containing recording information
    audio_files : list
        List of audio files
    task : string
        Task to get information for
    trimmed_files : list
        List of trimmed output audio files (appended to in place)
    offsets : list
        List of offsets for the audio files (appended to in place)
    checklist : list
        List of files where annotations are (partly) outside the duration of the audio file
        (appended to in place)

    Returns
    -------
    new_files : list
        List of trimmed output audio files
    offsets : list
        List of offsets for the audio files
    checklist : list
        List of files where annotations are (partly) outside the duration of the audio file

    Raises
    ------
    IndexError
        If no entry of `audio_files` contains the recording name of a row
    """

    import os  # local import so this cell does not depend on an import elsewhere

    for index, row in df.iterrows():
        file_name, offset, duration = get_row_info(row, task)
        print(file_name)

        # find file path of audio file (first path that contains the recording name)
        matches = [x for x in audio_files if file_name in x]
        if not matches:
            # same exception type as the bare `[0]` lookup, but with context
            raise IndexError(
                "No audio file found for recording " + repr(file_name)
                + " (task " + repr(task) + ")"
            )
        file_path = matches[0]
        print(file_path)

        # group = name of the folder holding the audio file; os.path copes with
        # both '/' and the platform's own separator
        group = os.path.basename(os.path.dirname(file_path))
        path = './trimmed_audio/' + group + '/' + task + '/'
        # sf.write does not create missing folders
        os.makedirs(path, exist_ok=True)

        # read audio(with offset & duration)
        try:
            # newer librosa releases name this keyword `path`
            duration_sec = librosa.get_duration(path=file_path)
        except TypeError:
            # older librosa releases only know `filename`
            duration_sec = librosa.get_duration(filename=file_path)
        if (offset + duration) > duration_sec:
            print("WARNING, end of ", str(task), "annotation is outside file")
            checklist.append(file_path)
        # NOTE(review): no `sr` is passed, so librosa resamples to its default
        # rate instead of keeping the file's native rate - confirm this is intended
        y, sr = librosa.load(file_path, offset=offset, duration=duration)

        # write audio signal to new file
        new_filename = path + file_name + '_' + task + '.wav'
        trimmed_files.append(new_filename)
        offsets.append(offset)
        sf.write(new_filename, y, sr)
        print(new_filename + " written")
    return trimmed_files, offsets, checklist

def trim_all(all_dfs, tasks):
    """
    Runs trim_files for every combination of task and dataframe

    The audio file list is taken from the module-level `audio_files` variable.

    Parameters
    ----------
    all_dfs : list
        Dataframes containing recording information
    tasks : list
        Names of the tasks to trim

    Returns
    -------
    trimmed_files : list
        Trimmed output audio files, in the order they were written
    offsets : list
        Offsets belonging to the trimmed files
    checklist : list
        Files where annotations are (partly) outside the duration of the audio file
    """

    trimmed_files, offsets, checklist = [], [], []

    # outer loop over tasks, inner loop over dataframes
    for current_task in tasks:
        for recordings_df in all_dfs:
            trimmed_files, offsets, checklist = trim_files(
                recordings_df,
                audio_files,
                current_task,
                trimmed_files,
                offsets,
                checklist,
            )

    return trimmed_files, offsets, checklist

In [None]:
# Trim all audio files

# Tasks to cut out, and the annotation dataframes to walk through for each task
tasks = ['formal', 'informal']
all_dfs = [df_R1, df_R1_L1, df_R5, df_R5_L1]

# Keep the written paths and their offsets: the transcription cell needs both
trimmed_files, offsets, checklist = trim_all(all_dfs=all_dfs, tasks=tasks)

In [None]:
# Define functions for transcribing trimmed audio files

def run_whisperx(audio_file, device, compute_type, offset):
    """
    Transcribes an audio file using whisperx and shifts the segment start and end
    times so that they match the original (untrimmed) audio file

    The model name and the batch size are read from the module-level variables
    `whisperx_model` and `batch_size`, which must be set before this is called.

    Parameters
    ----------
    audio_file : string
        Path to audio file
    device : string
        Device to use for transcription (cuda or cpu)
    compute_type : string
        Compute type to use for transcription (float16 or int8)
    offset : float
        start time of the audio segment

    Returns
    -------
    result : dict
        Dictionary containing the transcription results; every entry of
        result["segments"] has 'start' and 'end' increased by `offset` and no
        'words' entry
    """

    # decode the audio once; the same waveform is used for transcription and alignment
    audio_whisperx = whisperx.load_audio(audio_file)
    model = whisperx.load_model(whisperx_model, device, compute_type=compute_type)
    result = model.transcribe(audio_whisperx, batch_size=batch_size)
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio_whisperx, device, return_char_alignments=False)

    for segment in result["segments"]:
        # word-level entries are not shifted by the offset, so they are dropped;
        # pop() tolerates a segment that has no word list
        segment.pop('words', None)
        segment['start'] += offset
        segment['end'] += offset

    return result

def transcribe_all(trimmed_files, offsets, device, compute_type, writer_options,
                   output_root="/data/volume_2/transcripts/"):
    """
    Transcribes all audio files and writes the transcripts to disk

    Each file is transcribed with run_whisperx and written in all formats of
    the whisper writer to '<output_root>/<group>/', where <group> is the third
    path component from the end of the trimmed file path. The output folder is
    created when it does not exist yet.

    Parameters
    ----------
    trimmed_files : list
        List of trimmed audio files
    offsets : list
        List of offsets for the audio files (same order as trimmed_files)
    device : string
        Device to use for transcription (cuda or cpu)
    compute_type : string
        Compute type to use for transcription (float16 or int8)
    writer_options : dict
        Dictionary containing the writer options
    output_root : string, optional
        Folder below which the per-group transcript folders are created;
        defaults to the location that was previously hard-coded

    Returns
    -------
    None
    """

    import os  # local import so this cell does not depend on an import elsewhere

    for idx, file in enumerate(trimmed_files):
        print(file)
        result = run_whisperx(file, device, compute_type, offset=offsets[idx])

        group = '/'.join(file.split('/')[-3:-2])
        # trailing '' keeps the trailing separator of the original path string
        output_directory = os.path.join(output_root, group, '')
        # the writers open their files inside this folder, so it has to exist
        os.makedirs(output_directory, exist_ok=True)

        writer = whisper.utils.get_writer("all", output_directory)
        writer(result, file, writer_options)

In [None]:
# Set all required Whisper variables and transcribe all audio files

import whisper
import whisperx
import gc

# Settings read by run_whisperx / transcribe_all
device = "cuda"
whisperx_model = "large-v2"     # options: "base", "small", "medium"
compute_type = "float16"        # change to "int8" if low on GPU mem (may reduce accuracy)
batch_size = 16                 # reduce if low on GPU mem

# Options handed to the whisper writer for every transcript
writer_options = dict(
    max_line_width=None,
    max_line_count=None,
    highlight_words=None,
)

# Create transcripts for all audio files
transcribe_all(trimmed_files, offsets, device, compute_type, writer_options)