In [1]:
import soundfile as sf
import numpy as np
import librosa
import scipy.io as sio
import json
import csv
import re
import pandas as pd
import os
import subprocess
import shutil

In [None]:
!pip install -r "/content/drive/MyDrive/NYU LH Audio Transcription/requirements.txt" #give own path of requirements.txt

In [None]:
!pip install git+https://github.com/m-bain/whisperx.git

In [None]:
!pip install git+https://github.com/openai/whisper.git

In [23]:
def _ensure_dir(path, label=""):
    """Create ``path`` if it does not exist yet, announcing it only when newly created."""
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
        print(f"Created directory{label}: {path}")
    return path


def _write_phoneme_csv(txt_path, csv_path):
    """Convert a whitespace-separated "WORD PHONEMES..." text file into a Word/Phoneme CSV."""
    # Write to a temporary file first: if the conversion is interrupted, a
    # half-written CSV would otherwise be non-empty and silently reused by
    # every later run.
    tmp_path = csv_path + ".tmp"
    try:
        with open(txt_path, 'r') as infile, \
                open(tmp_path, 'w', newline='', encoding='utf-8') as outfile:
            csv_writer = csv.writer(outfile)
            csv_writer.writerow(['Word', 'Phoneme'])
            for line in infile:
                # First token is the word, the remainder is the phoneme string.
                parts = line.strip().split(maxsplit=1)
                if len(parts) == 2:
                    csv_writer.writerow(parts)
                else:
                    print(f"Line Skipped: {line.strip()}")
        os.replace(tmp_path, csv_path)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _mat_to_wav(mat_path, wav_path, sample_rate):
    """Write the audio array stored in a .mat file to ``wav_path``; return the path or None."""
    print(f"Converting .mat file: {mat_path}")
    mat_data = sio.loadmat(mat_path)
    print("MAT file keys:", mat_data.keys())

    # loadmat returns scalars as 1x1 arrays, so "first array found" can pick a
    # sample-rate or flag variable. Use the largest numeric array instead and
    # skip loadmat's own "__header__"-style metadata entries.
    audio_data = None
    for key, value in mat_data.items():
        if key.startswith("__") or not isinstance(value, np.ndarray):
            continue
        if value.ndim not in (1, 2) or value.size == 0 or value.dtype.kind not in "fiu":
            continue
        if audio_data is None or value.size > audio_data.size:
            audio_data = value

    if audio_data is None:
        print("No audio file found in .mat")
        return None

    # soundfile expects (frames, channels); a MATLAB row vector is (1, N),
    # which would be read as a single frame with N channels.
    if audio_data.ndim == 2 and audio_data.shape[0] < audio_data.shape[1]:
        audio_data = audio_data.T

    # NOTE(review): the .mat file may carry its own sampling rate; the caller's
    # `sample_rate` is used as-is here - confirm it matches the recording.
    sf.write(wav_path, audio_data, samplerate=sample_rate)
    print(f"Audio file saved to {wav_path}")
    return wav_path


def audio_trans(filepath,
                phnm_dir="/content/drive/MyDrive/NYU LH Audio Transcription/Phoneme_Dictionary.txt",  # Give your own Phoneme File Path
                parent_dir="/content/drive/MyDrive",
                sample_rate=44100):
    """Transcribe one audio recording with WhisperX and attach phonemes to each word.

    Steps, all rooted at ``<parent_dir>/NYU Audio Transcription``:

    1. Convert the phoneme dictionary text file to a CSV (only if not done yet).
    2. Copy the input into ".mat files" / ".wav files"; .mat inputs are
       converted to .wav first.
    3. Run the ``whisperx`` command-line tool, writing JSON into
       ".wav audio data files".
    4. Flatten the word-level JSON, upper-case the words and strip punctuation,
       left-join them with the phoneme dictionary, and save the result to
       "Transcriptions/<name>.csv".

    Parameters
    ----------
    filepath : str
        Path to a ``.mat`` or ``.wav`` file (extension matched case-insensitively).
    phnm_dir : str
        Path to the phoneme dictionary text file, one "WORD PHONEMES..." per line.
    parent_dir : str
        Directory under which the "NYU Audio Transcription" output tree is created.
    sample_rate : int
        Sampling rate used when writing audio extracted from a ``.mat`` file.

    Returns
    -------
    pandas.DataFrame or None
        Columns Word, Start, End, Score, Phoneme on success. ``None`` when the
        input is not usable audio, WhisperX fails or is not installed, or
        WhisperX produced no JSON.
    """
    print(f"Processing file: {filepath}")

    filename, extension = os.path.splitext(os.path.basename(filepath))  # name without extension
    extension = extension.lower()

    full_path = _ensure_dir(os.path.join(parent_dir, "NYU Audio Transcription"))

    # Phoneme dictionary: .txt -> .csv, done once and reused afterwards.
    phnm_output_file = os.path.join(
        full_path, os.path.splitext(os.path.basename(phnm_dir))[0] + ".csv")
    if not os.path.exists(phnm_output_file) or os.path.getsize(phnm_output_file) == 0:
        print(f"Creating phoneme CSV at: {phnm_output_file}")
        _write_phoneme_csv(phnm_dir, phnm_output_file)

    mat_files_dir = _ensure_dir(os.path.join(full_path, ".mat files"), " for .mat files")
    wav_files_dir = _ensure_dir(os.path.join(full_path, ".wav files"), " for .wav files")

    wav_file_path = None
    if extension == '.mat':
        print("The file is a .mat file.")
        new_mat_filepath = os.path.join(mat_files_dir, os.path.basename(filepath))
        if not os.path.exists(new_mat_filepath):
            shutil.copy(filepath, new_mat_filepath)  # keep a copy of the source .mat
            print(f"Copied .mat file to: {new_mat_filepath}")
        wav_file_path = _mat_to_wav(
            new_mat_filepath, os.path.join(wav_files_dir, filename + ".wav"), sample_rate)
    elif extension == '.wav':
        print("The file is a .wav file.")
        wav_file_path = os.path.join(wav_files_dir, filename + ".wav")
        if not os.path.exists(wav_file_path):
            shutil.copy(filepath, wav_file_path)  # keep a copy of the source .wav

    if wav_file_path is None:
        print("No valid audio file found.")
        return

    audio_data_path = _ensure_dir(
        os.path.join(full_path, ".wav audio data files"), " for audio data")

    command = [
        'whisperx',
        wav_file_path,
        '--model', 'medium',
        '--output_dir', audio_data_path,
        '--output_format', 'json',
        '--align_model', 'WAV2VEC2_ASR_LARGE_LV60K_960H'
    ]

    try:
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print("WhisperX output:", result.stdout.decode('utf-8', errors='replace'))
    except FileNotFoundError:
        # The executable itself is missing; report it like any other WhisperX failure.
        print("An error occurred with WhisperX: 'whisperx' executable not found on PATH")
        return
    except subprocess.CalledProcessError as e:
        print(f"An error occurred with WhisperX: {e.stderr.decode('utf-8', errors='replace')}")
        return

    to_check = os.path.join(audio_data_path, filename + ".json")

    transcriptions_file_path = _ensure_dir(
        os.path.join(full_path, "Transcriptions"), " for transcriptions")

    if not os.path.exists(to_check):
        print(f"File {to_check} does not exist.")
        return

    print(f"File {to_check} exists.")  # only continue when WhisperX produced its JSON
    with open(to_check, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Words WhisperX could not align (e.g. numerals) carry no start/end/score,
    # so every field is looked up defensively instead of indexed.
    records = [
        {
            'Word': word_info.get('word', ''),
            'Start': word_info.get('start'),
            'End': word_info.get('end'),
            'Score': word_info.get('score'),
        }
        for segment in data.get('segments', [])
        for word_info in segment.get('words', [])
    ]
    # Build the frame directly rather than round-tripping through CSV: read_csv
    # would turn words such as "null" or "NA" into NaN.
    transc_df = pd.DataFrame(records, columns=['Word', 'Start', 'End', 'Score'])
    for column in ('Start', 'End', 'Score'):
        transc_df[column] = pd.to_numeric(transc_df[column], errors='coerce')

    # Normalise words to the dictionary's form: upper-case, no sentence punctuation.
    transc_df['Word'] = (
        transc_df['Word'].astype(str).str.strip().str.upper()
        .map(lambda word: re.sub(r'[.,!"?]', '', word))
    )

    # keep_default_na=False keeps dictionary entries such as "NULL"/"NA" as text.
    phoneme_df = pd.read_csv(phnm_output_file, dtype=str, keep_default_na=False)

    # NOTE(review): if the dictionary lists a word more than once, the left join
    # repeats that transcript row once per entry - confirm this is intended.
    new_df = transc_df.merge(phoneme_df, on='Word', how='left')

    transcription_csv_file = os.path.join(transcriptions_file_path, filename + ".csv")
    new_df.to_csv(transcription_csv_file, index=False)
    # Output columns: word, word time offsets, alignment score, phonemes.
    print(f"Final transcription with phonemes saved to {transcription_csv_file}")

    return new_df

In [None]:
# Run the pipeline on one sample recording; the call is the cell's last
# expression, so its return value is what the notebook displays.
audio_trans(
    "/content/drive/MyDrive/NYU LH Audio Transcription/.mat files/NY749_AuditoryRepetition.mat"
)