In [3]:
import pickle
import random

import librosa
import numpy as np

# Load the pre-trained, tuned classifier from disk.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so this
# path must only ever point at a trusted artifact.
with open("model/their_tuned_model.pkl", 'rb') as file:
    pickle_model = pickle.load(file)

# Load the standard scaler that was used to train the model; score_prob_of_being
# applies it to the feature row before calling the model.
from joblib import load
scaler = load('model/std_scaler.bin')

# Instrument labels. score_prob_of_being uses a label's position in this list to
# index the model's predict_proba output.
# NOTE(review): this assumes the list order matches the model's class order — confirm.
instr_list = ["cel", "cla", "flu", "gac", "gel", "org", "pia", "sax", "tru", "vio", "voi"]

def score_prob_of_being(y, sr=44100, being=None, sim=False, reference=None):
    """Score an audio signal with the instrument classifier.

    The signal is summarised into a single feature row: the mean RMS energy,
    spectral centroid, spectral bandwidth, spectral roll-off and zero-crossing
    rate, followed by the mean of each row of the MFCC matrix. The row is
    scaled with the notebook-level ``scaler`` and passed to
    ``pickle_model.predict_proba``.

    Parameters
    ----------
    y : np.ndarray
        Audio time series.
    sr : int, optional
        Sampling rate of ``y``.
    being : str or None, optional
        A label from ``instr_list``. When given, only that class's probability
        is returned; when falsy, the full probability vector is returned.
    sim : bool, optional
        When true (and ``being`` is given), the root-mean-square difference
        between ``y`` and the reference signal is subtracted from the score.
    reference : np.ndarray or None, optional
        Signal that ``y`` is compared against when ``sim`` is true. When
        omitted, the notebook-level ``orig_y`` is used, which must then
        already be defined and be broadcast-compatible with ``y``.

    Returns
    -------
    np.ndarray or float
        The probability vector for the single sample when ``being`` is falsy,
        otherwise the (optionally penalised) probability of ``being``.

    Raises
    ------
    ValueError
        If ``being`` is not one of the labels in ``instr_list``.
    """
    # Frame-level descriptors, kept in the order the feature row expects.
    frame_features = (
        librosa.feature.rms(y=y),
        librosa.feature.spectral_centroid(y=y, sr=sr),
        librosa.feature.spectral_bandwidth(y=y, sr=sr),
        librosa.feature.spectral_rolloff(y=y, sr=sr),
        librosa.feature.zero_crossing_rate(y),
    )
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    # Build the row numerically instead of formatting the means into a string
    # and parsing them back; this only differs from the old round-trip at
    # floating-point rounding level.
    means = [np.mean(feature) for feature in frame_features]
    means.extend(np.mean(coefficient) for coefficient in mfcc)
    features = np.array([means], dtype=float)

    X = scaler.transform(features)
    probabilities = pickle_model.predict_proba(X)[0]

    if not being:
        return probabilities

    penalty = 0
    if sim:
        if reference is None:
            # Backward-compatible fallback to the notebook-level original signal.
            reference = orig_y
        # Root-mean-square difference between the candidate and the reference.
        penalty = np.sqrt(np.mean(np.power(y - reference, 2)))

    return probabilities[instr_list.index(being)] - penalty

In [5]:
songname = "dataset/IRMAS-TrainingData/flu/008__[flu][nod][cla]0393__1.wav"
true_class = "flu"      # label the baseline print below is scored against
tricked_class = "tru"   # label the search tries to make the model predict

orig_y, sr = librosa.load(songname, sr=44100)
# Round-trip through the real part of the STFT so the reference signal goes
# through the same transform chain that init_individual() and evaluate() apply.
orig_y = np.real(librosa.istft(np.real(librosa.stft(orig_y))))

# Use the named labels rather than repeating the literals, so changing
# tricked_class keeps this baseline consistent with what evaluate() optimises.
print("Probability of being what it actually is:", score_prob_of_being(orig_y, sr, true_class))
print("Probability of being what we want it to be:", score_prob_of_being(orig_y, sr, tricked_class))

Probability of being what it actually is: 0.8309308287903415
Probability of being what we want it to be: 0.014977228027226173


In [6]:
import numpy as np
from deap import base, creator, tools, algorithms

# Define the problem: a single-objective fitness with a positive weight, i.e.
# the score returned by evaluate() is maximized.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# Individuals are numpy arrays (the spectrogram) created with a `fitness`
# attribute of the FitnessMax type defined above.
creator.create("Individual", np.ndarray, fitness=creator.FitnessMax)

# Define the functions for initialization, mutation, crossover, and evaluation
def init_individual():
    """Build the starting spectrogram for one individual.

    Loads the file named by the notebook-level ``songname`` at 44100 Hz and
    returns the real part of its short-time Fourier transform.
    """
    waveform, _ = librosa.load(songname, sr=44100)
    spectrogram = librosa.stft(waveform)
    return np.real(spectrogram)

def mutate(individual, indpb):
    """Perturb ``individual`` in place with zero-mean Gaussian noise.

    ``indpb`` is used as the standard deviation of the noise added to every
    element. The same (modified) object is returned in a 1-tuple.
    """
    noise = np.random.normal(loc=0, scale=indpb, size=individual.shape)
    individual += noise
    return (individual,)

def crossover(parent1, parent2):
    """Blend two parents into one child by element-wise averaging.

    Neither parent is modified; the single child is returned in a 1-tuple.
    """
    summed = parent1 + parent2
    child = summed / 2
    return (child,)

def evaluate(individual):
    """Fitness of one individual, returned as a 1-tuple.

    The spectrogram is inverted back to audio and scored as the model's
    probability for ``tricked_class``, with the similarity penalty enabled.
    """
    audio = np.real(librosa.istft(individual))
    score = score_prob_of_being(audio, 44100, tricked_class, True)
    return (score,)

# Seed both random sources so a Restart & Run All reproduces the same run:
# DEAP's selection and variation draw from the stdlib `random` module, while
# mutate() draws from numpy's global generator.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Set up the DEAP framework
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, init_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# NOTE(review): DEAP's documentation warns that slice-swapping crossovers such
# as cxTwoPoint work on views when individuals are numpy arrays — confirm this
# is acceptable here or switch to a copying variant.
toolbox.register("mate", tools.cxTwoPoint)
# mutate() uses indpb as the standard deviation of the added Gaussian noise.
toolbox.register("mutate", mutate, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# Create an initial population
population_size = 25
population = toolbox.population(n=population_size)

# Run the evolution, logging the average and maximum fitness per generation
generations = 50
stats = tools.Statistics(key=lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("max", np.max)
population, logbook = algorithms.eaMuPlusLambda(
    population,
    toolbox,
    mu=population_size,
    lambda_=population_size * 2,
    cxpb=0.7,
    mutpb=0.2,
    ngen=generations,
    stats=stats,
    halloffame=None,
    verbose=True,
)

gen	nevals	avg      	max      
0  	25    	0.0149772	0.0149772
1  	41    	0.0471965	0.115838 
2  	48    	0.105606 	0.131907 
3  	45    	0.126377 	0.133736 
4  	40    	0.131634 	0.134289 
5  	43    	0.133821 	0.134437 
6  	46    	0.134424 	0.135135 
7  	45    	0.134641 	0.135135 
8  	49    	0.134829 	0.135023 
9  	45    	0.134877 	0.135173 
10 	45    	0.135079 	0.135559 
11 	49    	0.135397 	0.135979 
12 	44    	0.135564 	0.136139 
13 	46    	0.135809 	0.136215 
14 	42    	0.136072 	0.136574 
15 	45    	0.136268 	0.136574 
16 	43    	0.136491 	0.136746 
17 	48    	0.136593 	0.136746 
18 	45    	0.136634 	0.136746 
19 	44    	0.136666 	0.136711 
20 	45    	0.136701 	0.136752 
21 	46    	0.136721 	0.136752 
22 	47    	0.136734 	0.136752 
23 	43    	0.136747 	0.136752 
24 	46    	0.136753 	0.136767 
25 	42    	0.136755 	0.136767 
26 	46    	0.136757 	0.136767 
27 	42    	0.136762 	0.136775 
28 	45    	0.136765 	0.136777 
29 	48    	0.136768 	0.136777 
30 	43    	0.136771 	0.136777 
31 	45  

In [7]:
# Fittest member of the final population and its scalar fitness
ranked = tools.selBest(population, k=1)
best_individual = ranked[0]
best_fitness = best_individual.fitness.values[0]

print("Best Individual:", best_individual)
# Full class-probability vector for the evolved audio (no target label given)
best_probabilities = score_prob_of_being(np.real(librosa.istft(best_individual)), 44100)
print(best_probabilities)
print("Best Fitness:", best_fitness)

Best Individual: [[-3.68430078e-01 -1.77438602e-01 -1.18702985e-01 ... -3.25560861e-04
  -4.99743037e-02 -1.15106654e+00]
 [ 5.99424958e-01 -1.19272284e-01  7.03507140e-02 ... -1.79141447e-01
  -7.04798624e-02  1.01681912e+00]
 [-6.05305910e-01  3.22536319e-01 -1.56603888e-01 ...  1.52425855e-01
   2.50230640e-01 -3.95623684e-01]
 ...
 [ 6.75624087e-02  1.01171300e-01 -1.29502445e-01 ... -5.11268117e-02
   1.85664788e-01 -5.53653464e-02]
 [-8.23182911e-02  5.58651239e-02 -8.38580877e-02 ...  1.92378208e-01
   2.49678195e-02  1.00815549e-01]
 [-2.64724307e-02 -3.28566600e-03  2.81627998e-02 ...  4.53758836e-02
   5.55654876e-02 -6.00790046e-02]]
[0.03137257 0.12997455 0.14087493 0.02847595 0.09121208 0.11079457
 0.03844829 0.10446684 0.1382359  0.1231277  0.06301661]
Best Fitness: 0.1367767014494711


In [8]:
from IPython.lib.display import Audio
from IPython.display import display

# Reload the untouched recording and rebuild audio from the evolved spectrogram
orig, sr = librosa.load(songname, sr=44100)
print(orig, sr)
modi = librosa.istft(best_individual)

# Audio players: the original first, then the modified version
for clip in (orig, modi):
    display(Audio(data=clip, rate=sr))

[ 0.0753479   0.07347107  0.06999207 ... -0.00190735  0.00346375
  0.00808716] 44100


KeyboardInterrupt: 