In [None]:
'''
Before running this notebook, click "Runtime" → "Change runtime type",
select "T4-GPU" as the "Hardware accelerator", and choose "2025.07" as the "Runtime version".
(The older runtime version is required because Spleeter is incompatible with Python 3.12,
which is the default in the latest runtime.)
'''
# System-level dependency: install ffmpeg through apt (-y answers prompts automatically).
!apt-get install -y ffmpeg
# Install Spleeter pinned to version 2.4.2.
!pip install spleeter==2.4.2


In [None]:

# Install Whisper straight from its GitHub repository. --no-deps means pip does not
# install or upgrade any of Whisper's own dependencies in this step.
!pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git

# Install the PyTorch packages and the data/progress-bar helpers used by the cells below.
!pip install torch torchvision torchaudio
!pip install pandas tqdm

In [None]:
# Mount Google Drive at /content/drive; the directory constants used by the
# later cells all point below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import glob
import torch
import time
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from IPython.display import Audio, display


# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using device: {device}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; divide by 1024**3 to show GiB.
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")


AUDIO_DIR = "/content/drive/MyDrive/MEMD_audio"  # Audio file storage path
OUTPUT_DIR = "/content/drive/MyDrive/separated_vocals"  # The output path of the separated voices
RESULTS_DIR = "/content/drive/MyDrive/results"  # Result path
BATCH_SIZE = 50  # Number of audio files handed to each separation batch


# Create the folders from Python instead of `!mkdir -p {VAR}`: the shell form passes
# the path unquoted, so a path containing spaces or shell metacharacters would be
# split or misinterpreted. exist_ok=True keeps the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)


def get_audio_files(directory, extensions=('.mp3', '.wav')):
    """Return the audio files located directly inside ``directory``.

    Args:
        directory: Folder to search (not recursive).
        extensions: Iterable of filename suffixes to match, e.g. ``'.mp3'``.
            Matching is done by glob, so it is case-sensitive on
            case-sensitive file systems.

    Returns:
        A sorted list of matching paths without duplicates. Sorting makes the
        order (and therefore the batch contents) reproducible between runs,
        since ``glob.glob`` returns files in arbitrary order.
    """
    # Escape the directory so characters such as "[" or "*" in the folder name
    # are treated literally instead of as glob wildcards.
    safe_directory = glob.escape(directory)
    matches = set()
    for ext in extensions:
        matches.update(glob.glob(os.path.join(safe_directory, f"*{ext}")))
    return sorted(matches)


def batch_process(file_list, batch_size=100):
    """Lazily yield consecutive slices of ``file_list``.

    Each slice holds ``batch_size`` items, except the last one, which holds
    whatever remains.
    """
    yield from (
        file_list[start:start + batch_size]
        for start in range(0, len(file_list), batch_size)
    )


def separate_vocals_batch(audio_files, output_dir):
    """Run Spleeter's 2-stem separation on every file in ``audio_files``.

    Spleeter is invoked once per file through ``subprocess`` with an argument
    list, so file and folder names reach it verbatim (no shell quoting or
    interpolation issues). After each run the function checks that
    ``<output_dir>/<song_id>/vocals.wav`` exists and reports files for which
    the separation did not succeed.

    Args:
        audio_files: Sequence of audio file paths to process.
        output_dir: Folder passed to Spleeter's ``-o`` option.

    Raises:
        FileNotFoundError: If the ``spleeter`` executable is not installed.
    """
    import subprocess  # local import: not part of this notebook's import cell

    print(f"processing {len(audio_files)} audio files...")

    failed_files = []
    for i, audio_file in enumerate(tqdm(audio_files)):
        file_name = os.path.basename(audio_file)
        song_id = os.path.splitext(file_name)[0]
        song_output_dir = os.path.join(output_dir, song_id)

        print(f"处理 {i+1}/{len(audio_files)}: {file_name}")

        completed = subprocess.run(
            ["spleeter", "separate", "-p", "spleeter:2stems", "-o", output_dir, audio_file],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
        )
        # Echo Spleeter's own log so it stays visible in the notebook output.
        if completed.stdout:
            print(completed.stdout, end="")

        # The transcription step looks for "<song_id>/vocals.wav" under the
        # output folder, so treat a missing file as a failed separation.
        vocals_path = os.path.join(song_output_dir, "vocals.wav")
        if completed.returncode != 0 or not os.path.isfile(vocals_path):
            failed_files.append(file_name)
            print(f"separation failed for {file_name} (exit code {completed.returncode})")

        if (i+1) % 20 == 0:
            print(f"Finished: {i+1}/{len(audio_files)} files")

    if failed_files:
        print(f"{len(failed_files)} file(s) could not be separated: {failed_files}")

def main_separation():
    """Collect the audio files, log the run parameters, and separate vocals batch by batch."""
    all_audio_files = get_audio_files(AUDIO_DIR)
    print(f"found {len(all_audio_files)} audio files in total")

    # Record the settings of this run in the results folder.
    parameter_lines = [
        f"processing time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"audio dictionary: {AUDIO_DIR}\n",
        f"output dictionary: {OUTPUT_DIR}\n",
        f"result dictionary: {RESULTS_DIR}\n",
        f"audio files count: {len(all_audio_files)}\n",
        f"device: {device}\n",
    ]
    with open(f"{RESULTS_DIR}/process_parameters.txt", 'w') as f:
        f.writelines(parameter_lines)

    # Batches are numbered from 1 in the progress messages.
    batches = batch_process(all_audio_files, BATCH_SIZE)
    for batch_number, batch_files in enumerate(batches, start=1):
        print(f"processing batch {batch_number}: {len(batch_files)} files")
        print("=== Start voice separation ===")
        separate_vocals_batch(batch_files, OUTPUT_DIR)

        print(f"batch {batch_number} finished!")

    print("All voice separation has been completed！")

# Both branches of the former `if __name__ == "__main__"` / `else` made the same
# call, so the separation always runs when this cell executes; call it directly.
main_separation()

In [None]:
import os
import glob
import torch
import time
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from IPython.display import Audio, display

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using device: {device}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; divide by 1024**3 to show GiB.
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

OUTPUT_DIR = "/content/drive/MyDrive/separated_vocals"  # The output path of the separated voices
RESULTS_DIR = "/content/drive/MyDrive/results"  # Result path
BATCH_SIZE = 50
WHISPER_MODEL = "medium"  # options：tiny, base, small, medium, large

# Create the results folder from Python instead of `!mkdir -p {RESULTS_DIR}`: the
# shell form passes the path unquoted, so spaces or shell metacharacters in the
# path would break it. exist_ok=True keeps the cell safe to re-run.
os.makedirs(RESULTS_DIR, exist_ok=True)

def batch_process(file_list, batch_size=50):
    """Generate successive chunks of ``file_list`` with at most ``batch_size`` items each."""
    offsets = range(0, len(file_list), batch_size)
    for begin in offsets:
        end = begin + batch_size
        yield file_list[begin:end]

def _write_details_json(results, path):
    """Dump the full per-song result records (including segments) to ``path`` as JSON."""
    import json

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)


def transcribe_vocals_batch(vocals_dir, whisper_model="small", language="en", checkpoint_every=50):
    """Transcribe every ``<song_id>/vocals.wav`` found under ``vocals_dir`` with Whisper.

    Args:
        vocals_dir: Folder whose immediate sub-folders each contain a ``vocals.wav``.
        whisper_model: Name of the Whisper model to load.
        language: Language code passed to ``model.transcribe``.
        checkpoint_every: Write intermediate CSV/JSON snapshots after every this
            many files. A falsy value (``0``/``None``) disables the snapshots.

    Returns:
        A list with one dict per song. Successful entries hold ``song_id``,
        ``lyrics``, ``processing_time`` and ``segments``; failed entries hold
        ``song_id`` and ``error``.

    Side effects:
        Writes ``whisper_lyrics_batch_<k>.csv`` / ``whisper_details_batch_<k>.json``
        snapshots and the final ``whisper_lyrics_all.csv`` /
        ``whisper_details_all.json`` into ``RESULTS_DIR``.
    """
    import whisper

    print(f"Loading Whisper model: {whisper_model}")
    model = whisper.load_model(whisper_model, device=device)
    # glob returns files in arbitrary order; sort so the processing order and
    # the contents of each snapshot are reproducible between runs.
    vocals_files = sorted(glob.glob(os.path.join(vocals_dir, "*/vocals.wav")))
    print(f"found {len(vocals_files)} voice files")
    results = []

    # Make sure the snapshot/final files have somewhere to go.
    os.makedirs(RESULTS_DIR, exist_ok=True)

    for i, vocal_file in enumerate(tqdm(vocals_files)):
        # The song id is the name of the folder that holds vocals.wav.
        song_id = os.path.basename(os.path.dirname(vocal_file))
        try:
            print(f"transcribing {i+1}/{len(vocals_files)}: {song_id}")
            start_time = time.time()
            result = model.transcribe(vocal_file, language=language)
            processing_time = time.time() - start_time
            results.append({
                "song_id": song_id,
                "lyrics": result["text"],
                "processing_time": processing_time,
                "segments": result["segments"]
            })
            print(f"Time-consuming: {processing_time:.2f}s")
            # Build the preview first so the "result: " prefix is printed for
            # short texts as well as for truncated long ones.
            text = result["text"]
            preview = f"{text[:100]}..." if len(text) > 100 else text
            print(f"result: {preview}")

            # Periodically release cached GPU memory.
            if torch.cuda.is_available() and (i+1) % 10 == 0:
                torch.cuda.empty_cache()

        except Exception as e:
            # Best effort: record the failure and continue with the next song.
            print(f"error when processing {song_id} : {str(e)}")
            results.append({
                "song_id": song_id,
                "error": str(e)
            })

        # Snapshot of everything processed so far, so a disconnected session
        # does not lose all the work.
        if checkpoint_every and (i+1) % checkpoint_every == 0:
            snapshot_index = (i+1) // checkpoint_every
            temp_df = pd.DataFrame(results)
            temp_df.to_csv(f"{RESULTS_DIR}/whisper_lyrics_batch_{snapshot_index}.csv", index=False)
            _write_details_json(results, f"{RESULTS_DIR}/whisper_details_batch_{snapshot_index}.json")

    # The final CSV keeps only the flat columns; segments go to the JSON file.
    final_df = pd.DataFrame([{
        "song_id": r["song_id"],
        "lyrics": r.get("lyrics", ""),
        "error": r.get("error", ""),
        "processing_time": r.get("processing_time", 0)
    } for r in results])

    final_df.to_csv(f"{RESULTS_DIR}/whisper_lyrics_all.csv", index=False)
    _write_details_json(results, f"{RESULTS_DIR}/whisper_details_all.json")

    return results

def main_transcription():
    """Record the run parameters, transcribe all separated vocals, and show one sample.

    Writes ``transcription_parameters.txt`` into ``RESULTS_DIR``, runs
    ``transcribe_vocals_batch`` on ``OUTPUT_DIR`` with ``WHISPER_MODEL``, and
    prints the lyrics of one randomly chosen successfully transcribed song.
    """
    with open(f"{RESULTS_DIR}/transcription_parameters.txt", 'w') as f:
        # This line holds the timestamp of the run, so label it as such
        # (it was previously labelled "Time-consuming").
        f.write(f"processing time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"output dir: {OUTPUT_DIR}\n")
        f.write(f"result dir: {RESULTS_DIR}\n")
        f.write(f"Whisper model: {WHISPER_MODEL}\n")
        f.write(f"device: {device}\n")

    print("=== Start lyrics recognition ===")
    transcribe_results = transcribe_vocals_batch(OUTPUT_DIR, WHISPER_MODEL)
    print("Lyrics recognition completed, results saved.")

    # Sample only among songs that were transcribed successfully; records of
    # failed songs carry "error" instead of "lyrics" and would show nothing.
    successful = [r for r in (transcribe_results or []) if "lyrics" in r]
    if successful:
        sample = successful[np.random.randint(len(successful))]
        print("\n=== Random result example ===")
        print(f"song ID: {sample['song_id']}")
        print(f"lyric content:\n{sample['lyrics']}")

# Both branches of the former `if __name__ == "__main__"` / `else` made the same
# call, so the transcription always runs when this cell executes; call it directly.
main_transcription()