## Common part

In [60]:
from difflib import SequenceMatcher
import dotenv
import os
import requests
from gradio_client import Client, handle_file
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModel, AutoTokenizer
from pathlib import Path

In [61]:
def remove_spaces_from_filename(filename):
    name, ext = os.path.splitext(filename)
    clean_name = name.replace(' ', '') + ext

    return clean_name

In [62]:
def remove_spaces_in_folder(folder_path):
    for fname in os.listdir(folder_path):
        old_path = os.path.join(folder_path, fname)
        new_name = remove_spaces_from_filename(fname)
        new_path = os.path.join(folder_path, new_name)
        if old_path != new_path:
            os.rename(old_path, new_path)
            print(f"Renamed: {fname} -> {new_name}")

In [63]:
dotenv.load_dotenv(dotenv_path="./.env")

True

In [64]:
BASE_URL = os.getenv("BASE_URL")
MODEL_NAME = os.getenv("WHISPER_MODEL")

In [65]:
MUSIC_DIR = os.getenv('MUSIC_DIR')
MIDI_DIR = os.getenv('MIDI_DIR')
WAV_DIR = os.getenv('WAV_DIR')
SPLIT_DIR = os.getenv('SPLIT_DIR')

In [66]:
remove_spaces_in_folder(MUSIC_DIR)

Renamed: Time in a Bottle - Jim Croce.m4a -> TimeinaBottle-JimCroce.m4a
Renamed: You re in the Army Now - Bolland   Bolland.m4a -> YoureintheArmyNow-BollandBolland.m4a
Renamed: In The Army Now - Status Quo.m4a -> InTheArmyNow-StatusQuo.m4a
Renamed: RADIO - Rammstein.m4a -> RADIO-Rammstein.m4a
Renamed: Радио - RADIO TAPOK.m4a -> Радио-RADIOTAPOK.m4a


## Convert to wav

In [67]:
!bash ./wav_converter.sh

Converting './music/InTheArmyNow-StatusQuo.m4a' to 'InTheArmyNow-StatusQuo.wav'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with gcc 14 (GCC)
  configuration: --prefix=/usr --bindir=/usr/bin --datadir=/usr/share/ffmpeg --docdir=/usr/share/doc/ffmpeg --incdir=/usr/include/ffmpeg --libdir=/usr/lib64 --mandir=/usr/share/man --arch=x86_64 --optflags='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-U_FORTIFY_SOURCE,-D_FORTIFY_SOURCE=3 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -m64 -march=x86-64 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -mtls-dialect=gnu2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer ' --extra-ldflags='-Wl,-z,relro -Wl,--as-needed -Wl,-z,pack-relative-relocs -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -Wl,--build-id=sha1 -specs=/usr/lib/rpm/redhat/red

## Split voice and music

In [69]:
!bash ./spleet.sh "{WAV_DIR}" "{SPLIT_DIR}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:spleeter:Downloading model archive https://github.com/deezer/spleeter/releases/download/v1.4.0/2stems.tar.gz
INFO:spleeter:Validating archive checksum
INFO:spleeter:Extracting downloaded 2stems archive
INFO:spleeter:2stems model file(s) extracted
INFO:spleeter:File /audio/spleeter_temp/InTheArmyNow-StatusQuo/accompaniment.wav written succesfully
INFO:spleeter:File /audio/spleeter_temp/InTheArmyNow-StatusQuo/vocals.wav written succesfully
INFO:spleeter:Downloading model archive https://github.com/deezer/spleeter/releases/download/v1.4.0/2stems.tar.gz
INFO:spleeter:Validating archive checksum
INFO:spleeter:Extracting downloaded 2stems archive
INFO:spleeter:2stems model file(s) extracted
INFO:spleeter:File /audio/spleeter_temp/RADIO-Rammstein/accompaniment.wav written succesfully
INFO:spleeter:File /audio/spleeter_temp/RADIO-Rammstein/vocals.wav written succesfully
INFO:spleeter:Downloading model archive https://github.com/deezer/spleeter/releases/download/v1.4.0/2stems.t

## Create midi for accompaniment files

In [70]:
accomp_dir = os.path.join(SPLIT_DIR, 'accomp')
print(accomp_dir)

!bash ./midi_converter.sh "{accomp_dir}" "{MIDI_DIR}"

for filename in os.listdir(MIDI_DIR):
    if filename.endswith("_basic_pitch.mid"):
        old_path = os.path.join(MIDI_DIR, filename)
        new_name = filename.replace("_basic_pitch", "")
        new_path = os.path.join(MIDI_DIR, new_name)
        os.rename(old_path, new_path)

./split/accomp
Обрабатывается: InTheArmyNow-StatusQuo.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2025-06-08 10:37:30.651611: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-06-08 10:37:30.651652: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-06-08 10:37:33.181293: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2025-06-08 10:37:33.181323: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2025-06-08 10:37:33.181343: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (bf872fdda58d): /proc/driver/nvidia/version does not exist
2025-06-08 10:37:33.181631: I tensorflow/core/platform/cpu_

# Compare compositions by text

In [71]:
client = Client(BASE_URL)

Loaded as API: http://localhost:8000/ ✔


In [72]:
def transcribe_file(path):
    try:
        result = client.predict(
            file_path=handle_file(path),
            model=MODEL_NAME,
            task='transcribe',
            temperature=0,
            stream=False,
            api_name='/predict'
        )

        return result
    except requests.RequestException as e:
        print(f"Error during transcription: {e}")

In [73]:
def compute_similarity_paraphrase(text1: str, text2: str) -> float:
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Быстрая и точная модель

    embedding1 = model.encode([text1], convert_to_tensor=True)
    embedding2 = model.encode([text2], convert_to_tensor=True)

    similarity = util.cos_sim(embedding1, embedding2)

    return similarity.item()

In [77]:
def process_all_files(vocal_dir):
    transcriptions = {}

    for filename in os.listdir(vocal_dir):
        if filename.lower().endswith(('.mp3', '.wav', '.m4a')):
            file_path = os.path.join(vocal_dir, filename)
            print(f"Transcribing: {filename}")

            text = transcribe_file(file_path)

            if text:
                base = os.path.splitext(filename)[0]
                parts = base.split('-')
                song_name = parts[0]
                author = parts[1]

                transcriptions[(song_name, author)] = text

    return transcriptions

In [75]:
from itertools import combinations


def compare_all_pairs(transcriptions, threshold=0.5):
    results = []
    for (song1, text1), (song2, text2) in combinations(transcriptions.items(), 2):
        similarity = compute_similarity_paraphrase(text1, text2)
        if similarity > threshold:

            results.append((song1, song2, similarity))

    return results

In [78]:
vocal_dir = os.path.join(SPLIT_DIR, 'vocals')

transcription_res = process_all_files(vocal_dir)

Transcribing: RADIO-Rammstein.wav
Transcribing: YoureintheArmyNow-BollandBolland.wav
Transcribing: Радио-RADIOTAPOK.wav
Transcribing: TimeinaBottle-JimCroce.wav
Transcribing: InTheArmyNow-StatusQuo.wav


In [79]:
for song, text in transcription_res.items():
    print('\n--------------------------------')
    print(song)
    print(text)


--------------------------------
('RADIO', 'Rammstein')
You will not be able to hear, not see, speak, or hear, but every night for one or two hours I am this world, every night a little joy, my organ is on the world receiver I will not be able to hear, not see, speak, or hear, but every night a little joy, my organ is on the world receiver I will not be able to hear, not see, speak, or hear, but every night a little joy, my organ is on the world receiver I will not be able to hear, not see, speak, or hear, but every night a little joy, my organ is on the world receiver I will not be able to hear, not see, speak, or hear, but every night a little joy, my organ is on the world receiver I will not be able to hear, not see, speak, or hear, but every night a little joy, my organ is on the world receiver I will not be able to hear, not see, speak, or hear, but every night a little joy, my organ is on the world receiver

--------------------------------
('YoureintheArmyNow', 'BollandBolland'

In [80]:
similar_pairs = compare_all_pairs(transcription_res, threshold=0.1)

In [81]:
print("\nSongs with significant similarity (> 0.5):")
for song1, song2, sim in similar_pairs:
    print(f"{song1} ↔ {song2}: similarity = {sim:.3f}")


Songs with significant similarity (> 0.5):
('RADIO', 'Rammstein') ↔ ('YoureintheArmyNow', 'BollandBolland'): similarity = 0.307
('RADIO', 'Rammstein') ↔ ('Радио', 'RADIOTAPOK'): similarity = 0.150
('RADIO', 'Rammstein') ↔ ('TimeinaBottle', 'JimCroce'): similarity = 0.312
('RADIO', 'Rammstein') ↔ ('InTheArmyNow', 'StatusQuo'): similarity = 0.337
('YoureintheArmyNow', 'BollandBolland') ↔ ('TimeinaBottle', 'JimCroce'): similarity = 0.244
('YoureintheArmyNow', 'BollandBolland') ↔ ('InTheArmyNow', 'StatusQuo'): similarity = 0.818
('TimeinaBottle', 'JimCroce') ↔ ('InTheArmyNow', 'StatusQuo'): similarity = 0.207


### musicBERT

In [82]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from symusic import Score
import os
from miditok import MIDILike, TSD, REMI, TokenizerConfig

In [83]:
def tokenize_midi_file(tokenizer, midi_path):
    score = Score(midi_path)
    tokseqs = tokenizer(score)

    return np.array(tokseqs[0].ids)


In [84]:
def tokenize_with_miditok(tokenizer_class, midi_path, config_args=None):
    config = TokenizerConfig(**(config_args or {}))
    tokenizer = tokenizer_class(config)
    score = Score(midi_path)
    tokens = tokenizer(score)

    # return [np.array(tok) for tok in tokens]
    return np.array(tokens[0].ids)

In [85]:
def compare_token_vectors(vec1, vec2):
    min_len = min(len(vec1), len(vec2))
    vec1 = vec1[:min_len].reshape(1, -1)
    vec2 = vec2[:min_len].reshape(1, -1)

    return cosine_similarity(vec1, vec2)[0][0]

In [96]:
def process_all_midis(tokenize_class, midi_dir):
    tokenized_music = {}

    for filename in os.listdir(midi_dir):
        if filename.lower().endswith('.mid'):
            file_path = os.path.join(midi_dir, filename)
            print(f"Tokenizing midi file: {filename}")
            tokens = tokenize_with_miditok(tokenize_class, file_path)

            if len(tokens):
                base = os.path.splitext(filename)[0]
                parts = base.split('-')
                song_name = parts[0]
                author = parts[1]

                tokenized_music[(song_name, author)] = tokens

    return tokenized_music

In [89]:
def compare_all_tokenized_files(tokenized_files, threshold=0.5):
    results = []
    for (song1, tokens1), (song2, tokens2) in combinations(tokenized_files.items(), 2):
        similarity = compare_token_vectors(tokens1, tokens2)
        if similarity > threshold:

            results.append((song1, song2, similarity))

    return results

In [105]:
tokenization_res = process_all_midis(MIDILike, MIDI_DIR)

Tokenizing midi file: InTheArmyNow-StatusQuo.mid
Tokenizing midi file: Радио-RADIOTAPOK.mid
Tokenizing midi file: RADIO-Rammstein.mid
Tokenizing midi file: YoureintheArmyNow-BollandBolland.mid
Tokenizing midi file: TimeinaBottle-JimCroce.mid


In [106]:
for song, tokens in tokenization_res.items():
    print('\n--------------------------------')
    print(song)
    print(tokens)


--------------------------------
('InTheArmyNow', 'StatusQuo')
[220  33 190 ... 150 214  99]

--------------------------------
('Радио', 'RADIOTAPOK')
[213  19 194 ... 189 217 135]

--------------------------------
('RADIO', 'Rammstein')
[218  19 195 ... 143 212 149]

--------------------------------
('YoureintheArmyNow', 'BollandBolland')
[220  48 197 ... 136 157 145]

--------------------------------
('TimeinaBottle', 'JimCroce')
[218  26 187 ... 190 214 105]


In [107]:
similar_pairs = compare_all_tokenized_files(tokenization_res, threshold=0.1)

In [108]:
print("\nSongs with significant similarity (> 0.5):")
for song1, song2, sim in similar_pairs:
    print(f"{song1} ↔ {song2}: similarity = {sim:.3f}")


Songs with significant similarity (> 0.5):
('InTheArmyNow', 'StatusQuo') ↔ ('Радио', 'RADIOTAPOK'): similarity = 0.794
('InTheArmyNow', 'StatusQuo') ↔ ('RADIO', 'Rammstein'): similarity = 0.793
('InTheArmyNow', 'StatusQuo') ↔ ('YoureintheArmyNow', 'BollandBolland'): similarity = 0.806
('InTheArmyNow', 'StatusQuo') ↔ ('TimeinaBottle', 'JimCroce'): similarity = 0.813
('Радио', 'RADIOTAPOK') ↔ ('RADIO', 'Rammstein'): similarity = 0.794
('Радио', 'RADIOTAPOK') ↔ ('YoureintheArmyNow', 'BollandBolland'): similarity = 0.806
('Радио', 'RADIOTAPOK') ↔ ('TimeinaBottle', 'JimCroce'): similarity = 0.803
('RADIO', 'Rammstein') ↔ ('YoureintheArmyNow', 'BollandBolland'): similarity = 0.806
('RADIO', 'Rammstein') ↔ ('TimeinaBottle', 'JimCroce'): similarity = 0.810
('YoureintheArmyNow', 'BollandBolland') ↔ ('TimeinaBottle', 'JimCroce'): similarity = 0.816


### Наброски

In [36]:
tokenizer = REMI()

midi_path_1 = os.path.join(MIDI_DIR, 'bb.mid')
midi_path_2 = os.path.join(MIDI_DIR, 'quo.mid')

tokens1 = tokenize_midi_file(tokenizer, midi_path_1)
tokens2 = tokenize_midi_file(tokenizer, midi_path_2)

similarity = compare_token_vectors(tokens1, tokens2)

print(f"Cosine similarity between MIDI tracks: {similarity:.4f}")

Cosine similarity between MIDI tracks: 0.7975


In [47]:
tokens_midilike_1 = tokenize_with_miditok(MIDILike, midi_path_1)
tokens_midilike_2 = tokenize_with_miditok(MIDILike, midi_path_2)

# TSD
tokens_tsd_1 = tokenize_with_miditok(TSD, midi_path_1)
tokens_tsd_2 = tokenize_with_miditok(TSD, midi_path_2)

In [46]:
similarity = compare_token_vectors(tokens_midilike_1, tokens_midilike_2)

print(f"Cosine similarity between MIDI tracks: {similarity:.4f}")

Cosine similarity between MIDI tracks: 0.8060


In [48]:
similarity = compare_token_vectors(tokens_tsd_1, tokens_tsd_2)

print(f"Cosine similarity between MIDI tracks: {similarity:.4f}")

Cosine similarity between MIDI tracks: 0.8184
