<a href="https://colab.research.google.com/github/ShakhovaP/musical-chord-recognition/blob/main/create_dataset_mfcc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import librosa
import numpy as np
import os

In [None]:
from google.colab import files

In [None]:
# Ask the user to upload a file through Colab's upload helper.
# NOTE(review): the recorded cell output shows "alldata-mfcc40.json" being saved with
# empty contents (b''); presumably this cell is only needed to bring an existing
# dataset file into the session — confirm whether it is still required.
files.upload()

Saving alldata-mfcc40.json to alldata-mfcc40.json


{'alldata-mfcc40.json': b''}

In [None]:
from chord_transcript import chord_transcriptions,chord_classes

Mounted at /content/gdrive
ALL USED CHORDS

Number of chord classes:  908


A: 1764 ( 4.41% )
A#:7: 2 ( 0.01% )
A#:hdim7: 2 ( 0.01% )
A#:maj(*3): 32 ( 0.08% )
A#:maj(1,*3,*5): 2 ( 0.01% )
A#:min: 47 ( 0.12% )
A#:min(9): 3 ( 0.01% )
A#:min7: 8 ( 0.02% )
A#:min7(b5): 3 ( 0.01% )
A#:sus4: 1 ( 0.0% )
A#:sus4(b7): 2 ( 0.01% )
A/2: 10 ( 0.03% )
A/3: 79 ( 0.2% )
A/4: 2 ( 0.01% )
A/5: 31 ( 0.08% )
A/6: 1 ( 0.0% )
A/7: 2 ( 0.01% )
A/9: 2 ( 0.01% )
A/b6: 1 ( 0.0% )
A/b7: 14 ( 0.04% )
A:(1): 11 ( 0.03% )
A:(1,2,4): 1 ( 0.0% )
A:(1,2,4,5): 44 ( 0.11% )
A:(1,3,b5)/b5: 1 ( 0.0% )
A:(1,5): 9 ( 0.02% )
A:(1,b3,b5,6): 4 ( 0.01% )
A:7: 236 ( 0.59% )
A:7(#9): 7 ( 0.02% )
A:7(*5,13): 1 ( 0.0% )
A:7(13): 4 ( 0.01% )
A:7(b9): 3 ( 0.01% )
A:7/3: 11 ( 0.03% )
A:7/5: 6 ( 0.02% )
A:7/b7: 28 ( 0.07% )
A:9: 10 ( 0.03% )
A:9(11): 1 ( 0.0% )
A:9/b7: 2 ( 0.01% )
A:aug: 24 ( 0.06% )
A:aug/#5: 2 ( 0.01% )
A:dim: 3 ( 0.01% )
A:dim/b3: 1 ( 0.0% )
A:dim/b5: 2 ( 0.01% )
A:dim7: 12 ( 0.03% )
A:dim7/b3: 1 ( 0.0% )
A:hdim7: 

In [None]:
# Directory with the songs' audio files; passed to create_dataset as `audiodir`.
AUDIO_DIR = '/content/gdrive/MyDrive/data/data_wav'
# Alternative audio directory; in the visible cells it is only referenced by
# the commented-out test-dataset code.
AUDIO_TEST = '/content/gdrive/MyDrive/data/audio_test'
# File the generated dataset is written to by save_json.
JSON_PATH = '/content/alldata-mfcc40.json'
# Name of a single song file; in the visible cells it is only referenced by
# commented-out experiments.
filename = "05-Beat_It.wav"

In [None]:
# Sample rate (samples per second) requested when loading audio.
SAMPLE_RATE = 22050
# FFT window length, forwarded to the MFCC computation as `n_fft`.
N_FFT = 2048
# Hop length, forwarded to the MFCC computation as `hop_length`.
HOP_LENGTH = 512
# Duration of one frame in seconds; both the audio and the chord
# transcription are cut into frames of this size.
WINDOW_SIZE = 0.2


########## TRANSCRIPTIONS PROCESSING FUNCTIONS ##########
def calc_duration(transcription):
    """Returns the duration of a song according to its transcription

        :param transcription (list): rows of [ChordStartTime, ChordEndTime, ChordLabel], one per chord
        :return (float): ChordEndTime of the final row, i.e. the moment the last chord ends
    """
    final_chord = transcription[-1]
    chord_end_time = final_chord[1]
    return chord_end_time

def find_transcript(filename, transcriptions):
    """Finds the transcription that belongs to an audio file

        The lookup key is the audio file name with its extension replaced by ".lab".

        :param filename (string): name of the audio file, e.g. "song.wav"
        :param transcriptions (dict): mapping from "<song name>.lab" keys to transcriptions
        :return (list): transcription for the song
        :raises KeyError: if nothing is stored under the derived ".lab" key
    """
    # splitext removes whatever extension is present instead of assuming
    # a fixed 4-character ".wav" suffix
    stem, _extension = os.path.splitext(filename)
    return transcriptions[stem + ".lab"]

def framing(transcript, frame_size):
    """Converts a chord transcription into one chord label per fixed-size frame

        Time that is too short to fill a whole frame is carried over and added
        to the duration of the following chord.

        :param transcript (list): rows of [ChordStartTime, ChordEndTime, ChordLabel]
        :param frame_size (float): size of frame in seconds

        :return windowed_chords (list): list of chord labels, one entry per frame
    """
    frame_labels = []
    carry = 0
    for entry in transcript:
        # chord length plus the unused remainder left over by the previous chord
        span = (entry[1] - entry[0]) + carry
        label = entry[2]

        frame_count = int(span // frame_size)
        frame_labels.extend([label] * frame_count)
        carry = span % frame_size
    return frame_labels

# def get_classification_vector(chord, mapping):
#     vector = np.zeros(len(mapping))
#     # vector = [0 for _ in range(len(mapping))]
#     index = mapping.index(chord)
#     vector[index] = 1
#     return vector

def get_chord_class_number(chord, mapping):
    """Translates a chord label (string) into its integer class number

        :param chord (string): chord label
        :param mapping (list): list of all chord labels; the position of a label is its class number

        :return (int): index of the first occurrence of the chord label in mapping
    """
    class_number = mapping.index(chord)
    return class_number


############### AUDIO PROCESSING FUNCTIONS ###############
def load_audio(filename, sample_rate, transcript):
    """Loads an audio file, reading no further than the end of its transcription

        Prints the transcription duration and the duration of the loaded audio
        so that mismatches between the two are visible.

        :param filename (string or path-like): audio file to load
        :param sample_rate (int): sample rate requested from librosa when loading
        :param transcript (list): chord transcription of the song; its duration limits how much audio is read

        :return signal (np.array): array of samples
    """
    tr_duration = calc_duration(transcript)
    print('Transcript duration: ', tr_duration)
    signal, sr = librosa.load(filename, sr=sample_rate, duration=tr_duration)
    # librosa >= 0.10 accepts these arguments by keyword only; the keyword
    # form is also valid on older releases
    audio_duration = librosa.get_duration(y=signal, sr=sr)
    print('Audio duration: ', audio_duration)
    return signal

def cut_into_frames(signal, frame_duration, sample_rate):
    """Cuts an audio signal into equal-sized frames

        Samples at the end of the signal that do not fill a whole frame are dropped.

        :param signal (np.array): array of samples
        :param frame_duration (float): chosen duration of each frame (in seconds)
        :param sample_rate (int): number of samples per second in audio signal

        :return cut_signal (np.array): array of shape (number_of_frames, samples_per_frame);
                                       has zero rows when the signal is shorter than one frame
        :raises ValueError: if frame_duration * sample_rate rounds to less than one sample
    """
    # A frame must contain a whole number of samples; a fractional frame length
    # would make the truncated signal length indivisible by the frame count.
    frame_length = int(round(frame_duration * sample_rate))
    if frame_length <= 0:
        raise ValueError("frame_duration * sample_rate must be at least one sample")

    number_of_frames = len(signal) // frame_length
    cut_signal_length = number_of_frames * frame_length
    # Stating the column count explicitly (instead of -1) keeps the reshape
    # valid when there are zero frames.
    cut_signal = np.array(signal[:cut_signal_length]).reshape(number_of_frames, frame_length)
    return cut_signal

def calculate_MFCCs(cut_signal, sample_rate, n_fft, hop_length, n_mfcc=40):
    """Calculates MFCCs for each frame of input signal

        :param cut_signal (iterable of np.array): frames of the audio signal, each an array of samples
        :param sample_rate (int): sample rate of audiofile
        :param n_fft (int): FFT window length forwarded to librosa
        :param hop_length (int): hop length forwarded to librosa
        :param n_mfcc (int): number of MFCC coefficients per time step (default 40)

        :return mfccs (list): one entry per frame; each entry is the transposed MFCC matrix
                              of that frame converted to nested lists
    """
    mfccs = []
    for frame in cut_signal:
        # librosa >= 0.10 takes the signal by keyword only; `y=` is also
        # valid on older releases
        coefficients = librosa.feature.mfcc(y=frame,
                                            sr=sample_rate,
                                            n_fft=n_fft,
                                            hop_length=hop_length,
                                            n_mfcc=n_mfcc)
        # transpose before converting to plain lists (JSON-serialisable)
        mfccs.append(coefficients.T.tolist())
    return mfccs


# signal, sr = librosa.load(filename, sr=44100)
# cs = cut_into_frames(signal, 0.3, 44100)
# mfcc = calculate_MFCCs(cs, 44100, 2048, 512)
# print(type(mfcc[0][0]), len(mfcc[0][0]))
# print(chord_classes)
# print(list(chord_transcriptions.values())[0])



        



In [None]:
def create_dataset(transcripts, chord_classes, audiodir, sample_rate, window_size, n_fft, hop_length):
    """Builds the MFCC / chord-label dataset for all audio files in a directory

        Every regular file in `audiodir` is loaded, cut into frames of `window_size`
        seconds and converted to MFCCs; its transcription is cut into the same frames.
        The input `transcripts` are not modified, so the function can be called repeatedly.

        :param transcripts (dict): mapping from "<song name>.lab" keys to chord transcriptions
        :param chord_classes (list): list of all possible chord classes
        :param audiodir (string): name of the directory with songs audio files
        :param sample_rate (int): number of samples per second in audio signal
        :param window_size (float): duration of each frame (in which the input signal will be cut)
        :param n_fft (int): FFT window length forwarded to the MFCC computation
        :param hop_length (int): hop length forwarded to the MFCC computation

        :return dataset (dictionary): dictionary with fields
                                            "mapping" (the list of chord classes),
                                            "MFCCs" (the list of MFCC matrixes of each frame for each song),
                                            "chords" (the list of framed chords (not names of the chords but indexes from "mapping"-field)
                                                      for each song)
    """
    dataset = {
        "mapping": list(chord_classes),
        "MFCCs": [],
        "chords": [],
    }

    # scandir yields entries in arbitrary order and holds an OS handle:
    # close it via the context manager and sort for a reproducible dataset.
    with os.scandir(audiodir) as entries:
        audio_files = sorted((entry for entry in entries if entry.is_file()),
                             key=lambda entry: entry.name)

    for count, _file in enumerate(audio_files, start=1):
        filename = _file.name
        print(f'\n {count} Processing {filename}')

        transcript = find_transcript(filename, transcripts)
        signal = load_audio(_file.path, sample_rate, transcript)

        # Replace chord labels by class numbers in a copy; overwriting the rows of
        # the caller's transcription would make a second run fail on the
        # already-converted labels.
        numbered_transcript = [
            [row[0], row[1], get_chord_class_number(row[2], chord_classes)]
            for row in transcript
        ]

        w_transcript = framing(numbered_transcript, window_size)
        w_signal = cut_into_frames(signal, window_size, sample_rate)
        if len(w_transcript) != len(w_signal):
            print(len(w_transcript), '!=', len(w_signal),
                  'Number of transcript windows is not equal to the number of audio windows.')
            # keep exactly the frames that both sequences have in common
            common_length = min(len(w_transcript), len(w_signal))
            w_transcript = w_transcript[:common_length]
            w_signal = w_signal[:common_length]

        mfccs = calculate_MFCCs(w_signal, sample_rate, n_fft, hop_length)

        dataset["chords"] += w_transcript
        dataset["MFCCs"] += mfccs
    return dataset

# Build the dataset from every audio file in AUDIO_DIR, using the chord
# transcriptions and chord classes imported from chord_transcript and the
# audio parameters defined in the configuration cell.
chords_dataset = create_dataset(
    transcripts=chord_transcriptions,
    chord_classes=chord_classes,
    audiodir=AUDIO_DIR,
    sample_rate=SAMPLE_RATE,
    window_size=WINDOW_SIZE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH  
)

In [None]:
def save_json(json_path, data, indent=4):
    """Saves dataset into JSON-file

        An existing file at json_path is overwritten.

        :param json_path (string): name of the file where the data will be saved
        :param data (dictionary): data that will be saved; must be JSON-serialisable
        :param indent (int or None): indentation passed to json.dump; the default (4) gives
                                     pretty-printed output, None gives a much smaller
                                     single-line file, which helps with large nested lists

        :return: None
    """
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=indent)
    print('\nJSON saved!\n')


In [None]:
# Write the dataset built above to JSON_PATH.
save_json(json_path=JSON_PATH, data=chords_dataset)
# test_trans = dict({
#     "01_-_A_Hard_Day's_Night.lab": chord_transcriptions["01_-_A_Hard_Day's_Night.lab"],
#     "01_-_Come_Together.lab": chord_transcriptions["01_-_Come_Together.lab"],
# })

# test_dataset = create_dataset(
#     transcripts=test_trans,
#     chord_classes=chord_classes,
#     audiodir=AUDIO_TEST,
#     sample_rate=SAMPLE_RATE,
#     window_size=WINDOW_SIZE,
#     n_fft=N_FFT,
#     hop_length=HOP_LENGTH  
# )


JSON saved!

