# 1. Collect Diverse Song Samples

# 2. Preprocess Audio Files

### &nbsp; &nbsp; a. Convert to a Common Format and Normalize the Audio

In [5]:
import os
from pydub import AudioSegment
from pydub.effects import normalize

In [7]:
def convert_and_normalize_to_wav(input_file: str, output_dir: str, db: float = -1.0) -> None:
    """Convert one audio file to WAV and peak-normalize it.

    The result is written to ``<output_dir>/<input name without extension>_norm.wav``.
    The output directory is created if it does not already exist.

    Args:
        input_file (str): path of the audio file to convert
        output_dir (str): directory in which the normalized WAV file is saved
        db (float): target peak level relative to full scale, in dB. The
            default of -1.0 leaves 1 dB of headroom. Only the magnitude is
            used, so ``1.0`` and ``-1.0`` are equivalent.
    """
    # The output name is derived from the input's base name, minus its extension.
    file_name_without_ext = os.path.splitext(os.path.basename(input_file))[0]
    output_path = os.path.join(output_dir, f"{file_name_without_ext}_norm.wav")

    # Load the audio file
    audio = AudioSegment.from_file(input_file)

    # pydub's ``headroom`` is a positive distance *below* full scale. Passing
    # a negative value (such as the -1.0 default) asks for a peak above full
    # scale and clips the signal, so the magnitude is passed instead.
    normalized_audio = normalize(audio, headroom=abs(db))

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Export as WAV
    normalized_audio.export(output_path, format="wav")
    print(f"Converted and Normalized {input_file} to {output_path}")
    
def batch_convert_and_normalize_to_wav(input_dir, output_dir):
    """Convert and normalize every supported audio file found under input_dir.

    The directory tree rooted at input_dir is walked recursively. Each file
    whose name ends (case-insensitively) with one of the listed extensions is
    passed to convert_and_normalize_to_wav together with output_dir. Files
    with any other extension, including .wav, are skipped.

    Args:
        input_dir: root directory that is searched for audio files
        output_dir: directory handed to convert_and_normalize_to_wav for every match
    """
    convertible_suffixes = ('.mp3', '.aac', '.wma', '.m4a', '.flac', '.ogg', '.aiff')

    for folder, _subfolders, names in os.walk(fr"{input_dir}"):
        for name in names:
            # Guard clause: ignore anything that is not a convertible audio file.
            if not name.lower().endswith(convertible_suffixes):
                continue
            convert_and_normalize_to_wav(os.path.join(folder, name), output_dir)

### &nbsp; &nbsp; d. Remove Non-Vocal Sections

In [1]:
import torch
from demucs import pretrained
from demucs.apply import apply_model
import torchaudio

In [2]:
print(torch.cuda.is_available())

False


In [3]:
def remove_nonvocal_section(input_file: str, output_dir: str) -> None:
    """Separate the vocals from input_file and save them as a WAV file in output_dir.

    The vocal stem is written to
    ``<output_dir>/<input name without extension>_vocals.wav``. The input file
    itself is left untouched.

    Args:
        input_file (str): path of the input audio file
        output_dir (str): directory of the separated vocal section to be saved
    """
    # Local import keeps this block self-contained; demucs is already a
    # dependency of this file.
    from demucs.audio import convert_audio

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load Pretrained Demucs Model
    # NOTE(review): confirm that 'demucs' is a model name known to the
    # installed demucs version.
    model = pretrained.get_model('demucs')
    model.to(device)
    model.eval()  # inference only

    waveform, sample_rate = torchaudio.load(input_file)
    # Bring the audio to the sample rate and channel count the model declares;
    # arbitrary input files are not guaranteed to match them.
    waveform = convert_audio(waveform, sample_rate, model.samplerate, model.audio_channels)
    waveform = waveform.to(device)

    # Apply the model to separate the sources. apply_model works on a batch,
    # so a leading batch dimension is added and the single result taken back out.
    sources = apply_model(model, waveform[None])[0]

    # Pick the vocals by name rather than assuming they are the last source.
    vocals = sources[model.sources.index("vocals")]

    # Make an Output Directory
    os.makedirs(output_dir, exist_ok=True)

    # Saving the vocals to a new file inside output_dir, never over the input.
    file_name_without_ext = os.path.splitext(os.path.basename(input_file))[0]
    output_path = os.path.join(output_dir, f"{file_name_without_ext}_vocals.wav")
    torchaudio.save(output_path, vocals.cpu(), model.samplerate)
    
def batch_remove_nonvocal_section(input_dir: str, output_dir: str) -> None:
    """Run remove_nonvocal_section on every audio file found under input_dir.

    The directory tree rooted at input_dir is walked recursively. Each file
    whose name ends (case-insensitively) with a supported audio extension is
    passed to remove_nonvocal_section together with output_dir. If output_dir
    is a subdirectory of the walked tree it is skipped, so results that were
    already written are not processed again.

    Args:
        input_dir (str): Directory of the input audio files
        output_dir (str): Path of the Directory where you want to save the separated vocal section of the .wav files
    """
    # '.wav' is included because the conversion step of this pipeline writes
    # WAV files, which are the natural input here.
    audio_extensions = ('.wav', '.mp3', '.aac', '.wma', '.m4a', '.flac', '.ogg', '.aiff')
    output_root = os.path.abspath(output_dir)

    for root, dirs, files in os.walk(fr"{input_dir}"):
        # Editing dirs in place stops os.walk from descending into output_dir.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_root
        ]
        for file in files:
            if file.lower().endswith(audio_extensions):
                input_path = os.path.join(root, file)

                remove_nonvocal_section(input_path, output_dir)

In [10]:
# Exploratory cell: walk the dataset folder and print, for every directory
# visited, its running index, its path, its sub-directories and its files,
# followed by each file name on its own line.
# Loop variable names are kept as-is because they remain notebook globals.
dataset_root = r"D:\Sarvesh\VIT Stuff\2024-25 Fall Sem\Song Language Detector\archive\Data\genres_original"
for number, (root, dirs, files) in enumerate(os.walk(dataset_root)):
    # One print call, newline-separated, emits the same four lines as before.
    print(number, root, dirs, files, sep="\n")
    for file in files:
        print(file)

0
D:\Sarvesh\VIT Stuff\2024-25 Fall Sem\Song Language Detector\archive\Data\genres_original
['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
[]
1
D:\Sarvesh\VIT Stuff\2024-25 Fall Sem\Song Language Detector\archive\Data\genres_original\blues
[]
['blues.00000.wav', 'blues.00001.wav', 'blues.00002.wav', 'blues.00003.wav', 'blues.00004.wav', 'blues.00005.wav', 'blues.00006.wav', 'blues.00007.wav', 'blues.00008.wav', 'blues.00009.wav', 'blues.00010.wav', 'blues.00011.wav', 'blues.00012.wav', 'blues.00013.wav', 'blues.00014.wav', 'blues.00015.wav', 'blues.00016.wav', 'blues.00017.wav', 'blues.00018.wav', 'blues.00019.wav', 'blues.00020.wav', 'blues.00021.wav', 'blues.00022.wav', 'blues.00023.wav', 'blues.00024.wav', 'blues.00025.wav', 'blues.00026.wav', 'blues.00027.wav', 'blues.00028.wav', 'blues.00029.wav', 'blues.00030.wav', 'blues.00031.wav', 'blues.00032.wav', 'blues.00033.wav', 'blues.00034.wav', 'blues.00035.wav', 'blues.00036.wav', 'blue

### &nbsp; &nbsp; c. Split into Fixed-length segments

# 3. Extract Relevant Features

### &nbsp; &nbsp; a. Mel-frequency Cepstral Coefficients (MFCCs)

### &nbsp; &nbsp; b. Spectrograms

# 4. Label Data

# 5. Split Dataset

# 6. Data Augmentation (Optional)