In [1]:
import numpy as np
import librosa as lbrs
import joblib
import warnings

In [2]:
warnings.filterwarnings("ignore")

### 调式倾向量度和谱质心

In [3]:
def correlation(data, series, bias):
    if len(data) != len(series):
        raise IndexError('Lengths not equal')
    else:
        length = len(data)
        avg_x = sum(data) / length
        avg_y = sum(series) / length
        sum_prd_xy = 0
        sum_prd_x2 = 0
        sum_prd_y2 = 0
        for i in range(length):
            j = i + bias
            if j > length - 1:
                j -= length
            sum_prd_xy += (data[j] - avg_x) * (series[i] - avg_y)
            sum_prd_x2 += (data[j] - avg_x) ** 2
            sum_prd_y2 += (series[i] - avg_y) ** 2
        ans = sum_prd_xy / np.sqrt(sum_prd_x2 * sum_prd_y2)
    return ans

In [4]:
def next_max(data, temp, match):
    index = list(data).index(max(temp))
    match.append(index)
    temp.remove(max(temp))

In [5]:
def mode_and_centroid(filename):
    data, rate = lbrs.load(filename)
    n_div = 3
    cqt = np.abs(lbrs.cqt(data, sr=rate, n_bins=84*n_div, bins_per_octave=12*n_div))
    cqt_data = np.array([cqt[n_div * i] for i in range(84)])
    cqt_sum = [sum(cqt_data[i]) for i in range(84)]
    cqt_avg = sum([i * cqt_sum[i] for i in range(84)]) / sum(cqt_sum)
    octave = ['C', 'C#', 'D', 'Eb', 'E', 'F', 'F#', 'G', 'G#', 'A', 'Bb', 'B']
    notes = [octave[i % 12] + str(i // 12) for i in range(84)]
    centr = notes[round(cqt_avg)]
    length = len(cqt[0])
    # 计算每一帧的相同音名的音的振幅总和，结果为总帧数*12的矩阵
    cqt_1oct = []
    for time in range(length):
        cqt_1oct_1frm = np.array([
            sum([
                cqt_data[loc + 12 * octv][time] for octv in range(7)
            ]) for loc in range(12)
        ]) # 计算某一帧同音振幅和
        cqt_1oct.append(cqt_1oct_1frm)
    cqt_1oct = np.array(cqt_1oct)
    # 对每一帧进行筛查
    # 最少两个音即可嵌入一个确定的音阶
    # 到将一整个音阶序列嵌满/无法再嵌入下一个音为止
    N_series = [1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1]
    for time in range(length):
        data = list(cqt_1oct[time])
        temp = list(cqt_1oct[time])
        match_list = []
        next_max(data, temp, match_list)
        next_max(data, temp, match_list)
        for n in range(3, 8):
            inserted = False
            next_max(data, temp, match_list)
            for i in range(12):
                match_num = sum([N_series[match_list[i]] for i in range(n)])
                if match_num == n:
                    inserted = True
                    tonic_loc = i
                    break
            if not inserted:
                match_list.pop()
                break
        for i in range(12):
            if i not in match_list:
                cqt_1oct[time][i] = 0
            else:
                cqt_1oct[time][i] = 1
    # 将所有帧加起来
    sum_octave = [
        sum([
            cqt_1oct[time][loc] for time in range(length)
        ]) for loc in range(12)
    ]
    # 找出主/属音
    sum_sorted = sorted(sum_octave, reverse=True)
    index0 = sum_octave.index(sum_sorted[0])
    index1 = sum_octave.index(sum_sorted[1])
    if abs(index0 - index1) == 7:
        tonic = min(index0, index1)
        domin = max(index0, index1)
    elif abs(index0 - index1) == 5:
        tonic = max(index0, index1)
        domin = min(index0, index1)
    else:
        index10 = (index0 + 7) % 12
        index11 = (index0 + 5) % 12
        index1 = (index10, index11)[int(sum_octave[index10] < sum_octave[index11])]
        tonic, domin = ((index0, index1), (index1, index0))[int(index1 == index11)]
        warnings.warn('Cannot find the most possible dominant, the answer may be inaccurate.', Warning, 2)
    # 利用相关系数求最可能调式调性
    mode_list = [
        'Major',
        'minor (Natural)',
        'minor (Harmonic)'
    ]
    scale_list = []
    scale_list.append([2, 0, 1, 0, 1.5, 1, 0, 2, 0, 1.5, 0, 0.7])
    scale_list.append([2, 0, 1, 1.5, 0, 1, 0, 2, 1.5, 0, 0.7, 0])
    scale_list.append([2, 0, 1, 1.5, 0, 1, 0, 2, 1.5, 0, 0, 0.7])
    note_list = []
    note_list.append(['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B'])
    note_list.append(['C', 'C#', 'D', 'Eb', 'E', 'F', 'F#', 'G', 'G#', 'A', 'Bb', 'B'])
    note_list.append(['C', 'C#', 'D', 'Eb', 'E', 'F', 'F#', 'G', 'G#', 'A', 'Bb', 'B'])
    prbblty_list = [
        [
            correlation(sum_octave, scale_list[n], i)
            for i in range(12)
        ] for n in range(len(mode_list))
    ]
    max_list = [max(prbblty_list[n]) for n in range(len(mode_list))]
    max_mode = max_list.index(max(max_list))
    tonic_index = prbblty_list[max_mode].index(max(prbblty_list[max_mode]))
    if tonic_index not in [tonic, domin]:
        warnings.warn('Cannot judge the most possible tonic, the tonality may be inaccurate.', Warning, 2)
    output_list = [
        note_list[max_mode][tonic_index] + ' ' + mode_list[max_mode],
        max_list[0] - max(max_list[1], max_list[2]),
        centr,
        cqt_avg
    ]
    return output_list

### 节奏律动

In [6]:
def beat_and_tempo(filename):
    offset = 0
    duration = 100
    data, rate = lbrs.load(filename,offset=offset, duration=duration)
    onset_env = lbrs.onset.onset_strength(y=data, sr=rate)
    tempo = lbrs.beat.tempo(onset_envelope=onset_env, sr=rate)
    tempo1, beats = lbrs.beat.beat_track(y=data, sr=rate)
    if tempo[0]>135:
        tempo[0] = tempo[0]/2
    if tempo1 > 135:
        tempo1 = tempo1 / 2
    tempos = []
    tempos.append(tempo)
    tempos.append(tempo1)
    return tempos[0][0]
    # tempos[0]是一个只有一个数据的数组（效果较好）；tempos[1]是一个数据

### 神经网络

In [7]:
def forest(Mode, Sc, Tempo):
    arr = [Mode, Sc, Tempo]
    rfl = joblib.load('rfc.pkl')
    arr = np.array(arr)
    arr = arr.reshape(1, -1)
    result = rfl.predict(arr)
    return result

### 函数调用

In [8]:
def is_audio_file(filename):
    audio_file_extension = ['.mp3', '.wav', '.flac']
    ans = False
    for i in range(len(audio_file_extension)):
        ans = ans or (audio_file_extension[i] in filename)
    return ans

In [9]:
def unify(*args):
    return np.arctan(args) * 2 / np.pi

In [10]:
def EmotionQuadrant(filename):
    if not is_audio_file(filename):
        warnings.warn('Unsupported file extension', Warning, 2)
        return -1
    else:
        knm, mode, sc_note, sc = mode_and_centroid(filename)
        tempo = beat_and_tempo(filename)
        mode_n, sc_n, tempo_n = unify(mode, sc, tempo)
        quadrant = forest(mode_n, sc_n, tempo_n)[0]
        print(
            f'调性:\t\t{knm}\n'
            f'调式倾向:\t{mode}\n'
            f'谱质心:\t\t{sc_note}, {sc}\n'
            f'节奏律动:\t{tempo}\n'
            f'情绪象限:\t{quadrant}'
        )
        return

In [13]:
EmotionQuadrant('Example.wav')

调性:		A minor (Natural)
调式倾向:	-0.16367604898701837
谱质心:		G2, 30.802596007390697
节奏律动:	129.19921875
情绪象限:	3
