In [1]:
import torch
import torch.nn as nn

class VariationModule(nn.Module):
    def __init__(self):
        super(VariationModule, self).__init__()
        self.fc_timing = nn.Linear(10, 10)  # Adjust timing
        self.fc_velocity = nn.Linear(10, 10)  # Adjust velocity
        self.fc_timbre = nn.Linear(10, 10)  # Adjust timbre

    def forward(self, x):
        timing_variation = torch.tanh(self.fc_timing(x)) * 0.05  # Small timing shifts
        velocity_variation = torch.tanh(self.fc_velocity(x)) * 0.1  # Small velocity changes
        timbre_variation = torch.tanh(self.fc_timbre(x)) * 0.1  # Small timbral adjustments
        return timing_variation, velocity_variation, timbre_variation

# Integration with the main model
class MusicModel(nn.Module):
    def __init__(self):
        super(MusicModel, self).__init__()
        self.fc = nn.Linear(10, 10)
        self.variation_module = VariationModule()

    def forward(self, x):
        x = self.fc(x)
        timing_variation, velocity_variation, timbre_variation = self.variation_module(x)
        return x + timing_variation, velocity_variation, timbre_variation


In [2]:
class HumanPerformanceModel(nn.Module):
    def __init__(self):
        super(HumanPerformanceModel, self).__init__()
        self.fc = nn.Linear(10, 3)  # Output timing, velocity, timbre variations

    def forward(self, x):
        return self.fc(x)

performance_model = HumanPerformanceModel()
variations = performance_model(input_data)
timing_variation, velocity_variation, timbre_variation = variations.split(1, dim=-1)


NameError: name 'input_data' is not defined

In [3]:
def custom_loss(predicted, target, timing_variation, velocity_variation, timbre_variation):
    intonation_loss = torch.nn.functional.mse_loss(predicted, target)
    variation_penalty = torch.mean(timing_variation**2) + torch.mean(velocity_variation**2) + torch.mean(timbre_variation**2)
    return intonation_loss + 0.1 * variation_penalty  # Adjust weight as needed

output, timing_variation, velocity_variation, timbre_variation = model(input_data)
loss = custom_loss(output, target_data, timing_variation, velocity_variation, timbre_variation)


NameError: name 'model' is not defined

In [4]:
def post_process(output, timing_variation, velocity_variation, timbre_variation):
    timing_variation = torch.clamp(timing_variation, -0.05, 0.05)  # Limit timing shifts
    velocity_variation = torch.clamp(velocity_variation, -0.1, 0.1)  # Limit velocity changes
    timbre_variation = torch.clamp(timbre_variation, -0.1, 0.1)  # Limit timbral adjustments
    return output + timing_variation, velocity_variation, timbre_variation

final_output = post_process(output, timing_variation, velocity_variation, timbre_variation)


NameError: name 'output' is not defined

In [1]:
!pip install music21 pydub midi2audio
!apt-get install fluidsynth

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting midi2audio
  Downloading midi2audio-0.1.1-py2.py3-none-any.whl (8.7 kB)
Installing collected packages: pydub, midi2audio
Successfully installed midi2audio-0.1.1 pydub-0.25.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fluid-soundfont-gm libevdev2 libfluidsynth3 libgudev-1.0-0 libinput-bin libinput10
  libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a libqt5dbus5 libqt5gui5 libqt5network5
  libqt5svg5 libqt5widgets5 libwacom-bin libwacom-common libwacom9 libxcb-icccm4 libxcb-image0
  libxcb-keysyms1 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 qsynth qt5-gtk-platformtheme qttranslations5-l10n timgm6mb-soundfont
Suggested packages:
  fluid-soundfont-gs qt5-image-formats-plugins qtwayland5 jackd
The following NEW packages will be installed:
  flui

In [16]:
from music21 import converter, midi
from pydub import AudioSegment
from midi2audio import FluidSynth
import os
import music21
import librosa
from scipy.spatial.distance import cosine

def similarity_to_synth(mxl_file, soundfont_path, input_mp3_file, output_mp3='temp.mp3'):
    def mxl_to_violin_mp3(mxl_file, output_mp3, soundfont_path):
    # Step 1: Parse MXL to MIDI
        score = music21.converter.parse(mxl_file)
        midi_file = os.path.splitext(mxl_file)[0] + '.mid'
        score.write('midi', fp=midi_file)

        # Step 2: Synthesize MIDI to audio using FluidSynth
        fs = FluidSynth(soundfont_path)
        wav_file = os.path.splitext(mxl_file)[0] + '.wav'
        fs.midi_to_audio(midi_file, wav_file)

        # Step 3: Convert WAV to MP3
        audio = AudioSegment.from_wav(wav_file)
        audio.export(output_mp3, format='mp3')

        # Clean up temporary files
        os.remove(midi_file)
        os.remove(wav_file)

    def compare_intonation(human_mp3_file, synthesized_mp3_file):
        def extract_pitch_features(mp3_file):
            y, sr = librosa.load(mp3_file)
            # Extract Mel-frequency cepstral coefficients (MFCCs)
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            return mfcc
        # Extract pitch features (MFCCs) from both MP3 files
        mfcc_human = extract_pitch_features(human_mp3_file)
        mfcc_synthesized = extract_pitch_features(synthesized_mp3_file)

        # Flatten MFCC matrices to 1D arrays for cosine similarity calculation
        mfcc1 = mfcc_human.flatten()
        mfcc2 = mfcc_synthesized.flatten()

        # Compute cosine similarity between flattened MFCC arrays
        similarity = 1 - cosine(mfcc1, mfcc2)

        # Ensure similarity is between 0 and 1
        similarity = max(0, min(similarity, 1))

        return similarity

    # def compare_mp3_similarity(mp3_file1, mp3_file2):
    #     # Load and extract MFCCs from the first MP3 file (synthesized)
    #     y1, sr1 = librosa.load(mp3_file1)
    #     mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)

    #     # Load and extract MFCCs from the second MP3 file (human-recorded)
    #     y2, sr2 = librosa.load(mp3_file2)
    #     mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)

    #     # Ensure both arrays have the same shape by taking the minimum length
    #     min_frames = min(mfcc1.shape[1], mfcc2.shape[1])
    #     mfcc1 = mfcc1[:, :min_frames]
    #     mfcc2 = mfcc2[:, :min_frames]

    #     # Compute cosine similarity between the MFCCs
    #     similarity = 1 - cosine(mfcc1.flatten(), mfcc2.flatten())
    #     return similarity

    # def compare_mp3_similarity(file1, file2):
    #     # Load audio files
    #     y1, sr1 = librosa.load(file1)
    #     y2, sr2 = librosa.load(file2)

    #     # Extract features (MFCC)
    #     mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
    #     mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)

    #     # Compute cosine similarity between MFCCs
    #     similarity = 1 - cosine(mfcc1.flatten(), mfcc2.flatten())

    #     return similarity

    mxl_to_violin_mp3(mxl_file, output_mp3, soundfont_path)

    accuracy = compare_intonation(output_mp3, input_mp3_file)
    os.remove(output_mp3)
    return accuracy


In [17]:
mxl_file = '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Accuracy Training Data/Test.mxl'
soundfont_path = '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Violin Synth Soundfont/Quartz2.sf2'
input_mp3_file = '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Accuracy Training Data/Test.mp3'
print(similarity_to_synth(mxl_file, soundfont_path, input_mp3_file))
for i in range(10):
  input_mp3_file = f'/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human_{i}.mp3'
  print(similarity_to_synth(mxl_file, soundfont_path, input_mp3_file))

for i in range(10):
  input_mp3_file = f'/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/MIDI_{i}.mp3'
  print(similarity_to_synth(mxl_file, soundfont_path, input_mp3_file))

ValueError: operands could not be broadcast together with shapes (106480,) (119020,) 