In [4]:
import os
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pydub import AudioSegment
import shutil
import librosa
import soundfile as sf
import taglib


In [None]:
# validate that we have the correct sentence format for training
def sentence_validator(file_path, output_path="validated_sentences.txt"):
    """Normalize punctuation in a sentence file so it fits the training format.

    Reads one sentence per line from ``file_path`` and, for every line,
    removes commas, turns ``?`` and ``!`` into ``.`` and replaces curly
    apostrophes with a plain ``'``. The cleaned sentences are written one
    per line to ``output_path``.

    Args:
        file_path: Path of the UTF-8 text file holding one sentence per line.
        output_path: Path of the file to write. It may be the same path as
            ``file_path``; the input is read completely before the output
            is opened.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        sentences = f.read().split("\n")

    # A file that ends with a newline yields one empty trailing element; drop
    # only that one so a final sentence without a newline is not lost.
    if sentences and sentences[-1] == "":
        sentences.pop()

    replacements = (
        (",", ""),
        ("?", "."),
        ("!", "."),
        # Mojibake of U+2019 (UTF-8 bytes decoded as cp1252) left in files
        # produced by earlier runs.
        ("â€™", "'"),
        # The actual right single quotation mark.
        ("\u2019", "'"),
    )

    with open(output_path, "w", encoding="utf-8") as f:
        for sentence in sentences:
            for old, new in replacements:
                sentence = sentence.replace(old, new)
            f.write(f"{sentence}\n")
# Rewrites validated_sentences.txt in place: the function writes to that same
# file name after reading it.
sentence_validator("validated_sentences.txt")

In [8]:
# convert m4a to wav files for processing and rename it to the index number
def m4a_to_wav(folder_path, output_folder="wav_files"):
    """Convert the .m4a recordings in a folder to sequentially numbered .wav files.

    Recordings are ordered by the integer that follows the last space in the
    file name (e.g. ``"Recording 12.m4a"`` sorts as 12) and exported as
    ``<output_folder>/1.wav``, ``<output_folder>/2.wav``, ... in that order.

    Args:
        folder_path: Folder containing the source .m4a files.
        output_folder: Folder that receives the .wav files; created if it
            does not exist.

    Raises:
        ValueError: If an .m4a file name does not carry an integer in the
            expected position.
    """
    # Skip anything that is not an .m4a recording (hidden files, notes, ...)
    # so it cannot break the numeric sort below.
    m4a_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".m4a")]
    m4a_files.sort(key=lambda x: int(x.split(" ")[-1].split(".")[0]))

    os.makedirs(output_folder, exist_ok=True)

    for idx, m4a_file in enumerate(m4a_files, start=1):
        audio = AudioSegment.from_file(os.path.join(folder_path, m4a_file), format="m4a")
        audio.export(os.path.join(output_folder, f"{idx}.wav"), format="wav")
# Convert the recordings found in raw_data/ into numbered files under wav_files/.
m4a_to_wav("raw_data")

In [None]:
# Load the pretrained wav2vec2 CTC model and its processor (neither is used
# further down in this cell).
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

# Keep only .wav files so stray entries cannot break the numeric sort, then
# order the clips by their numeric file name (1.wav, 2.wav, ...).
wav_files = [f for f in os.listdir("wav_files") if f.endswith(".wav")]
wav_files = sorted(wav_files, key=lambda x: int(x.split(".")[0]))
file_and_transcripts = []

# One transcript per line; decode explicitly as UTF-8 rather than relying on
# the platform default encoding.
with open("validated_sentences.txt", "r", encoding="utf-8") as f:
    transcript = f.read().split("\n")
# Drop only the empty element produced by a trailing newline, so the last
# sentence survives when the file does not end with one.
if transcript and transcript[-1] == "":
    transcript.pop()

# Fail before touching list.txt instead of raising IndexError halfway through
# the write and leaving a truncated manifest behind.
if len(wav_files) > len(transcript):
    raise ValueError(
        f"Found {len(wav_files)} wav files but only {len(transcript)} transcripts "
        "in validated_sentences.txt"
    )

# Write the file paths and transcripts to the output file, one
# "path|transcript" pair per line.
with open("list.txt", "w", encoding="utf-8") as f:
    for idx, file in enumerate(wav_files):
        f.write(f"wav_files/{file}|{transcript[idx]}\n")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
def preprocess_audio(folder_path, output_folder="outputs"):
    """Trim and normalize every .wav file in a folder.

    Each ``.wav`` file in ``folder_path`` is loaded with ``librosa.load`` at a
    22050 Hz sample rate, passed through ``librosa.effects.trim`` with
    ``top_db=20`` and ``librosa.util.normalize``, and written as 16-bit PCM
    under the same file name in ``output_folder``.

    Args:
        folder_path: Folder containing the input .wav files.
        output_folder: Folder that receives the processed files; created if
            it does not exist.
    """
    # sf.write does not create missing directories, so make sure the target
    # folder exists before the first file is written.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith(".wav"):
            continue

        # Load the .wav file
        filepath = os.path.join(folder_path, filename)
        y, sr = librosa.load(filepath, sr=22050)

        # Trim silence
        trimmed_audio, _ = librosa.effects.trim(y, top_db=20)

        # Normalize audio
        normalized_audio = librosa.util.normalize(trimmed_audio)

        # Save processed .wav file to the output folder
        output_filepath = os.path.join(output_folder, filename)
        sf.write(output_filepath, normalized_audio, sr, subtype='PCM_16')
# Process every .wav file in wav_files/; results are written under outputs/.
preprocess_audio("wav_files")

In [5]:
def _highest_wav_index(folder):
    """Return the largest integer N such that a file named like ``N.wav`` is in ``folder`` (0 if none)."""
    if not os.path.isdir(folder):
        return 0
    stems_and_exts = (os.path.splitext(name) for name in os.listdir(folder))
    return max(
        (int(stem) for stem, ext in stems_and_exts if ext == ".wav" and stem.isdecimal()),
        default=0,
    )


def update_metadata(input_folder, output_folder, num_files=None):
    """Tag numbered .wav files with their index and copy them to another folder.

    For every ``i`` from 1 to ``num_files``, ``<input_folder>/<i>.wav`` gets
    its TITLE and TRACKNUMBER tags set to ``i`` (the input file itself is
    modified) and is then copied to ``<output_folder>/<i>.wav``. A message is
    printed for each file, including indices whose file is missing.

    Args:
        input_folder: Folder holding the numbered .wav files.
        output_folder: Destination folder; created if it does not exist.
        num_files: Highest index to process. When ``None`` (default) it is
            taken from the highest numbered ``.wav`` file present in
            ``input_folder``, so no constant has to be edited per dataset.
    """
    os.makedirs(output_folder, exist_ok=True)

    if num_files is None:
        num_files = _highest_wav_index(input_folder)

    for i in range(1, num_files + 1):
        input_file = os.path.join(input_folder, f"{i}.wav")
        output_file = os.path.join(output_folder, f"{i}.wav")

        if not os.path.exists(input_file):
            # Gaps in the numbering are reported rather than treated as errors.
            print(f"File {i}.wav not found.")
            continue

        # Load WAV file and update metadata
        with taglib.File(input_file) as audio:
            # Title and track number both mirror the file name without the extension
            audio.tags["TITLE"] = [f"{i}"]
            audio.tags["TRACKNUMBER"] = [f"{i}"]

            # Save updated WAV file
            audio.save()

        # Copy the updated file to the output folder instead of moving it
        shutil.copy2(input_file, output_file)

        print(f"Updated metadata for {i}.wav: title='{i}', track number={i}")
# Tag the numbered .wav files in outputs/ and copy them into wavs/.
update_metadata("outputs", "wavs")

Updated metadata for 1.wav: title='1', track number=1
Updated metadata for 2.wav: title='2', track number=2
Updated metadata for 3.wav: title='3', track number=3
Updated metadata for 4.wav: title='4', track number=4
Updated metadata for 5.wav: title='5', track number=5
Updated metadata for 6.wav: title='6', track number=6
Updated metadata for 7.wav: title='7', track number=7
Updated metadata for 8.wav: title='8', track number=8
Updated metadata for 9.wav: title='9', track number=9
Updated metadata for 10.wav: title='10', track number=10
Updated metadata for 11.wav: title='11', track number=11
Updated metadata for 12.wav: title='12', track number=12
Updated metadata for 13.wav: title='13', track number=13
Updated metadata for 14.wav: title='14', track number=14
Updated metadata for 15.wav: title='15', track number=15
Updated metadata for 16.wav: title='16', track number=16
Updated metadata for 17.wav: title='17', track number=17
Updated metadata for 18.wav: title='18', track number=18
U

In [3]:
def is_utf8_without_bom(filepath):
    """Report whether a file's bytes are valid UTF-8 with no leading BOM.

    Args:
        filepath: Path of the file to inspect; it is read in binary mode.

    Returns:
        A ``(ok, label)`` tuple: ``(True, "UTF-8 without BOM")`` when the
        content decodes as UTF-8 and has no BOM, ``(False, "UTF-8 with BOM")``
        when it starts with the UTF-8 byte order mark, and
        ``(False, "Not UTF-8")`` when decoding fails.
    """
    utf8_bom = b'\xef\xbb\xbf'

    with open(filepath, 'rb') as handle:
        raw = handle.read()

    # A leading BOM disqualifies the file before any decoding is attempted.
    if raw[:len(utf8_bom)] == utf8_bom:
        return False, "UTF-8 with BOM"

    try:
        raw.decode('utf-8')
    except UnicodeDecodeError:
        return False, "Not UTF-8"
    return True, "UTF-8 without BOM"

# Example usage: run the encoding check on list.txt and print the description
# it returns (the boolean result is kept in is_utf8).
filepath = 'list.txt'
is_utf8, description = is_utf8_without_bom(filepath)
print(f"File Encoding Check: {description}")

File Encoding Check: UTF-8 without BOM


In [8]:
# Read the manifest back for inspection. Decode explicitly as UTF-8 rather
# than relying on the platform default encoding.
with open("list.txt", "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
# Drop only the empty element produced by a trailing newline, so the final
# entry is kept even when the file does not end with one.
if lines and lines[-1] == "":
    lines.pop()
print(lines)

['/content/TTS-TT2/wavs/1.npy|The quick brown fox jumps over the lazy dog.', '/content/TTS-TT2/wavs/2.npy|A journey of a thousand miles begins with a single step.', '/content/TTS-TT2/wavs/3.npy|She sells seashells by the seashore.', '/content/TTS-TT2/wavs/4.npy|How much wood would a woodchuck chuck if a woodchuck could chuck wood.', '/content/TTS-TT2/wavs/5.npy|Peter Piper picked a peck of pickled peppers.', '/content/TTS-TT2/wavs/6.npy|The early bird catches the worm.', '/content/TTS-TT2/wavs/7.npy|Practice makes perfect.', '/content/TTS-TT2/wavs/8.npy|All work and no play makes Jack a dull boy.', '/content/TTS-TT2/wavs/9.npy|Actions speak louder than words.', '/content/TTS-TT2/wavs/10.npy|A watched pot never boils.', '/content/TTS-TT2/wavs/11.npy|Can you tell me how to get to the nearest station.', '/content/TTS-TT2/wavs/12.npy|What time does the movie start.', '/content/TTS-TT2/wavs/13.npy|Would you like a cup of tea or coffee.', "/content/TTS-TT2/wavs/14.npy|I think it's going to r