In [2]:
from pydub import AudioSegment
import os

def batch_convert_mp3_to_wav(input_folder, output_folder):
    """
    Convert every MP3 found directly inside ``input_folder`` to a WAV file.

    Each ``<name>.mp3`` entry (extension matched case-insensitively) is written
    to ``output_folder`` as ``<name>.wav``. A file that fails to convert is
    reported on stdout and skipped; the remaining files are still processed.

    :param input_folder: Directory containing MP3 files.
    :param output_folder: Directory to save WAV files (created if missing).
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        # Anything that is not an MP3 is left alone.
        if not entry.lower().endswith('.mp3'):
            continue

        source_path = os.path.join(input_folder, entry)
        base_name, _ = os.path.splitext(entry)
        target_path = os.path.join(output_folder, base_name + '.wav')
        print(f"Converting {source_path} to {target_path}")

        try:
            # Decode the MP3, then write it back out as WAV.
            AudioSegment.from_mp3(source_path).export(target_path, format="wav")
        except Exception as e:
            print(f"Error converting {source_path}: {e}")

# Run the conversion when this cell/script is executed directly.
if __name__ == "__main__":
    # Replace both paths with your own input (MP3) and output (WAV) directories.
    input_dir, output_dir = "data", "wavs"
    batch_convert_mp3_to_wav(input_dir, output_dir)


Converting data\clip_1.mp3 to wavs\clip_1.wav
Converting data\clip_10.mp3 to wavs\clip_10.wav
Converting data\clip_11.mp3 to wavs\clip_11.wav
Converting data\clip_12.mp3 to wavs\clip_12.wav
Converting data\clip_13.mp3 to wavs\clip_13.wav
Converting data\clip_14.mp3 to wavs\clip_14.wav
Converting data\clip_141.mp3 to wavs\clip_141.wav
Converting data\clip_142.mp3 to wavs\clip_142.wav
Converting data\clip_15.mp3 to wavs\clip_15.wav
Converting data\clip_16.mp3 to wavs\clip_16.wav
Converting data\clip_17.mp3 to wavs\clip_17.wav
Converting data\clip_18.mp3 to wavs\clip_18.wav
Converting data\clip_19.mp3 to wavs\clip_19.wav
Converting data\clip_2.mp3 to wavs\clip_2.wav
Converting data\clip_20.mp3 to wavs\clip_20.wav
Converting data\clip_21.mp3 to wavs\clip_21.wav
Converting data\clip_22.mp3 to wavs\clip_22.wav
Converting data\clip_23.mp3 to wavs\clip_23.wav
Converting data\clip_24.mp3 to wavs\clip_24.wav
Converting data\clip_25.mp3 to wavs\clip_25.wav
Converting data\clip_26.mp3 to wavs\clip

In [3]:
import os


def rename_wave_files(folder_path):
    """
    Rename every WAV file in ``folder_path`` to ``1.wav``, ``2.wav``, ...

    Files are numbered in natural order of their current names (so
    ``clip_2.wav`` comes before ``clip_10.wav``), which makes the numbering
    deterministic instead of depending on the order ``os.listdir`` returns.

    The rename runs in two passes through temporary names. Renaming directly
    would let a new name such as ``2.wav`` collide with a file that has not
    been renamed yet (for example when this is run a second time on the same
    folder); ``os.rename`` then overwrites that file on POSIX and raises on
    Windows.

    :param folder_path: Directory whose ``.wav`` files (extension matched
        case-insensitively) are renamed in place. Other files are untouched.
    """
    import re  # local import: the surrounding cell only imports ``os``

    def natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so
        # every odd position is a digit run; compare those numerically.
        parts = re.split(r'(\d+)', name.lower())
        parts[1::2] = [int(digits) for digits in parts[1::2]]
        return parts

    files = os.listdir(folder_path)
    wav_files = sorted(
        (f for f in files if f.lower().endswith('.wav')),
        key=lambda f: (natural_key(f), f),
    )

    print(f'Folder path: {folder_path}')
    print(f'All files in the folder: {files}')
    print(f'Wave files found: {len(wav_files)}')

    # Pass 1: park every file under a temporary name. The prefix guarantees
    # it can never equal one of the final "<number>.wav" names.
    staged = []
    for index, wav_file in enumerate(wav_files, start=1):
        old_path = os.path.join(folder_path, wav_file)
        temp_path = os.path.join(folder_path, f'__renaming_{index}__{wav_file}')
        new_path = os.path.join(folder_path, f'{index}.wav')
        os.rename(old_path, temp_path)
        staged.append((old_path, temp_path, new_path))

    # Pass 2: no original WAV name is in use any more, so the final names
    # cannot clash with a file that is still waiting to be renamed.
    for old_path, temp_path, new_path in staged:
        os.rename(temp_path, new_path)
        print(f'Renamed {old_path} to {new_path}')


# Entry point: renumber the WAV files, then confirm on stdout.
if __name__ == "__main__":
    # Point this at the folder that holds your wav files.
    folder_path = r'wavs'
    rename_wave_files(folder_path)
    print("Wave files renamed successfully!")

Folder path: wavs
All files in the folder: ['clip_1.wav', 'clip_10.wav', 'clip_11.wav', 'clip_12.wav', 'clip_13.wav', 'clip_14.wav', 'clip_15.wav', 'clip_16.wav', 'clip_17.wav', 'clip_18.wav', 'clip_19.wav', 'clip_2.wav', 'clip_20.wav', 'clip_21.wav', 'clip_22.wav', 'clip_23.wav', 'clip_24.wav', 'clip_25.wav', 'clip_26.wav', 'clip_27.wav', 'clip_28.wav', 'clip_29.wav', 'clip_3.wav', 'clip_30.wav', 'clip_31.wav', 'clip_32.wav', 'clip_33.wav', 'clip_34.wav', 'clip_35.wav', 'clip_36.wav', 'clip_37.wav', 'clip_38.wav', 'clip_39.wav', 'clip_4.wav', 'clip_40.wav', 'clip_41.wav', 'clip_42.wav', 'clip_43.wav', 'clip_44.wav', 'clip_45.wav', 'clip_46.wav', 'clip_47.wav', 'clip_48.wav', 'clip_49.wav', 'clip_5.wav', 'clip_50.wav', 'clip_51.wav', 'clip_52.wav', 'clip_53.wav', 'clip_54.wav', 'clip_55.wav', 'clip_56.wav', 'clip_57.wav', 'clip_59.wav', 'clip_6.wav', 'clip_60.wav', 'clip_61.wav', 'clip_62.wav', 'clip_63.wav', 'clip_64.wav', 'clip_65.wav', 'clip_66.wav', 'clip_67.wav', 'clip_68.wav', 'c

In [7]:
import os
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Transcribe the numbered .wav files with wav2vec2 and write one
# "<path>|<transcript>" line per file to list.txt.

# Folder that holds the numbered .wav files (1.wav, 2.wav, ...)
wav_directory = "wavs"  # Add your wav dir

# The transcript list is written inside the same folder as the audio
output_file = os.path.join(wav_directory, "list.txt")

# File numbers to look for: 1.wav through 88.wav (range's upper bound is exclusive).
# Adjust the upper bound to (number of files + 1) for a different dataset.
wav_files_range = range(1, 89)

# Collects one "<path>|<transcript>" string per successfully transcribed file
file_and_transcripts = []

# Load the pretrained wav2vec2 model and its matching processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

# Set model to evaluation mode (inference only, no training here)
model.eval()

# Iterate through the numbered .wav files
for i in wav_files_range:
    wav_file = os.path.join(wav_directory, f"{i}.wav")

    # Missing numbers are reported and skipped rather than treated as fatal
    if os.path.exists(wav_file):
        # Any failure while loading or transcribing this file is printed
        # and the file is left out of list.txt
        try:
            waveform, sample_rate = torchaudio.load(wav_file)

            # Down-mix to mono by averaging over dim 0, which this code
            # treats as the channel axis
            if waveform.shape[0] > 1:
                waveform = waveform.mean(dim=0, keepdim=True)

            # Resample to 16000 Hz, the rate passed to the processor below
            # NOTE(review): presumably the rate the checkpoint was trained on - confirm
            if sample_rate != 16000:
                resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
                waveform = resampler(waveform)

            # Drop every size-1 dimension...
            waveform = waveform.squeeze()

            # ...then put a single leading dimension back when the result is 1-D
            # (assumed to act as a batch of one for the processor - TODO confirm)
            if waveform.dim() == 1:
                waveform = waveform.unsqueeze(0)

            # The processor is fed a numpy array rather than a torch tensor
            waveform = waveform.numpy()

            # Turn the raw audio into model input tensors
            input_values = processor(waveform, sampling_rate=16000, return_tensors="pt").input_values

            # Run inference without tracking gradients
            with torch.no_grad():
                logits = model(input_values).logits

            # Pick the highest-scoring token at each position and decode the
            # first (only) sequence into text
            predicted_ids = torch.argmax(logits, dim=-1)
            transcript = processor.decode(predicted_ids[0])

        except Exception as e:
            print(f"Error processing file {wav_file}: {str(e)}")
            continue

        # The recorded path is a fixed /kaggle/working/wavs/ prefix, independent
        # of wav_directory (the list is presumably consumed on Kaggle - verify)
        file_and_transcripts.append(f"/kaggle/working/wavs/{i}.wav|{transcript}")
    else:
        print(f"File not found: {wav_file}")

# Write the collected lines to the output file, replacing any previous list.txt
# NOTE(review): no encoding is given, so the platform default encoding is used
with open(output_file, "w") as f:
    for line in file_and_transcripts:
        f.write(f"{line}\n")

print(f"File '{output_file}' created successfully.")


Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

File 'wavs\list.txt' created successfully.


In [8]:
import os
import librosa
import soundfile as sf

# Resample, trim and normalize every .wav in input_path, writing the results
# under the same file names to output_path.
input_path = "wavs"  # Add your path
output_path = "preprocessed"  # Add your path

# exist_ok=True creates the folder only when it is missing, without a
# separate (race-prone) existence check.
os.makedirs(output_path, exist_ok=True)

for filename in os.listdir(input_path):
    # Match the extension case-insensitively, as the MP3 conversion and
    # rename steps of this notebook do, so "X.WAV" is not silently skipped.
    if filename.lower().endswith(".wav"):
        # Load the .wav file, resampled to 22050 Hz
        filepath = os.path.join(input_path, filename)
        y, sr = librosa.load(filepath, sr=22050)

        # Trim leading/trailing audio quieter than 20 dB below the peak
        trimmed_audio, _ = librosa.effects.trim(y, top_db=20)

        # Normalize audio
        normalized_audio = librosa.util.normalize(trimmed_audio)

        # Save processed .wav file to the output folder as 16-bit PCM
        output_filepath = os.path.join(output_path, filename)
        sf.write(output_filepath, normalized_audio, sr, subtype='PCM_16')

print("All .wav files have been preprocessed and saved to the output folder.")

All .wav files have been preprocessed and saved to the output folder.


In [10]:
import os
import shutil
import taglib


def update_metadata(input_folder, output_folder, num_files=88):
    """
    Tag the numbered WAV files with a title and track number, then copy them.

    For every number ``i`` from 1 to ``num_files`` whose ``<i>.wav`` exists in
    ``input_folder``, the TITLE and TRACKNUMBER tags are set to ``i`` and
    saved, and the file is then copied to ``output_folder`` under the same
    name. Numbers without a matching file are reported on stdout and skipped.

    Note that the tags are saved into the file inside ``input_folder`` itself;
    the copy in ``output_folder`` is made afterwards from that updated file.

    :param input_folder: Directory holding ``1.wav``, ``2.wav``, ...
    :param output_folder: Directory receiving the tagged copies (created if
        missing).
    :param num_files: Highest file number to look for. Defaults to 88, the
        range this function previously had hard-coded.
    """
    os.makedirs(output_folder, exist_ok=True)

    for i in range(1, num_files + 1):
        input_file = os.path.join(input_folder, f"{i}.wav")
        output_file = os.path.join(output_folder, f"{i}.wav")

        if os.path.exists(input_file):
            # Load WAV file and update metadata
            with taglib.File(input_file) as audio:
                # Title and track number both mirror the file's number
                audio.tags["TITLE"] = [f"{i}"]
                audio.tags["TRACKNUMBER"] = [f"{i}"]

                # Persist the tags into the input file
                audio.save()

            # Copy (not move) the updated file, keeping the input folder intact
            shutil.copy2(input_file, output_file)

            print(
                f"Updated metadata for {i}.wav: title='{i}', track number={i}")
        else:
            print(f"File {i}.wav not found.")


# Entry point: tag the preprocessed clips and copy them to the final folder.
if __name__ == "__main__":
    # Swap in your own source and destination folders here.
    input_folder, output_folder = "preprocessed", "final"
    update_metadata(input_folder, output_folder)

Updated metadata for 1.wav: title='1', track number=1
Updated metadata for 2.wav: title='2', track number=2
Updated metadata for 3.wav: title='3', track number=3
Updated metadata for 4.wav: title='4', track number=4
Updated metadata for 5.wav: title='5', track number=5
Updated metadata for 6.wav: title='6', track number=6
Updated metadata for 7.wav: title='7', track number=7
Updated metadata for 8.wav: title='8', track number=8
Updated metadata for 9.wav: title='9', track number=9
Updated metadata for 10.wav: title='10', track number=10
Updated metadata for 11.wav: title='11', track number=11
Updated metadata for 12.wav: title='12', track number=12
Updated metadata for 13.wav: title='13', track number=13
Updated metadata for 14.wav: title='14', track number=14
Updated metadata for 15.wav: title='15', track number=15
Updated metadata for 16.wav: title='16', track number=16
Updated metadata for 17.wav: title='17', track number=17
Updated metadata for 18.wav: title='18', track number=18
U