In [None]:
import time

'''
    语音前置处理,包含分帧、加窗、离散傅里叶变换,将这些合成一个步骤成为短时傅里叶变换+
    语音识别步骤:
    音频-->取样-->分帧-->加窗-->特征提取-->MFCC

'''
#频谱实时显示(动画)
import pyaudio
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal

## 1. Open the microphone and configure the capture parameters
mic = pyaudio.PyAudio()

FORMAT = pyaudio.paInt16  # 16-bit signed PCM samples
CHANNELS = 1  # mono
RATE = 48000  # sampling rate in Hz
INTERVAL = 0.32  # seconds of audio captured per read
CHUNK = int(RATE * INTERVAL)  # number of frames per read

stream = None
try:
    # Capture-only stream: nothing is ever written back to the device,
    # so no output channel is requested.
    stream = mic.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                      input=True, frames_per_buffer=CHUNK)

    ## 2. Live spectrogram display
    for i in range(100):  # redraw 100 times, then stop
        raw = stream.read(CHUNK, exception_on_overflow=False)
        # The stream delivers paInt16, so the bytes must be decoded as int16;
        # decoding them as int8 would split every sample into two bogus values.
        data = np.frombuffer(raw, dtype=np.int16).astype(np.float64)

        # fs must be the sampling rate (not the chunk size) so that the
        # frequency axis is in Hz and the time axis is in seconds.
        f, t, Sxx = signal.spectrogram(data, fs=RATE)
        # A tiny offset keeps log10 finite for bins with zero power.
        dBS = 10 * np.log10(Sxx + np.finfo(float).eps)
        plt.clf()

        # Axis labels
        plt.xlabel('Time[sec]')
        plt.ylabel('Frequency[Hz]')

        plt.pcolormesh(t, f, dBS)
        plt.pause(0.001)
finally:
    ## 3. Release the audio devices even if capture or plotting failed
    if stream is not None:
        stream.stop_stream()
        stream.close()
    mic.terminate()

In [None]:
'''
    音频前置处理: 利用Librosa函数库了解音频的前置处理程序
'''
import IPython
import pyaudio
import struct
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
import librosa
import soundfile as sf
import librosa.display
from IPython.display import Audio
import time

# 1. Load the audio file; librosa.load returns the samples and their sampling rate
wav_file = '../../sample-files/SAMPLE AUDIO FILES/WAV_1MG.wav'

data, sr = librosa.load(wav_file)
print(f'取样频率={sr},总样本数={data.shape}')

# 2. Plot the waveform
librosa.display.waveshow(data, sr=sr)
plt.show()

print("===========================频谱图=================================")
# 3. Mel spectrogram
## Mel-scaled power spectrogram
spec = librosa.feature.melspectrogram(y=data, sr=sr)
## Convert power to decibels, relative to the peak value
db_spec = librosa.power_to_db(spec, ref=np.max)
# Show the spectrogram
librosa.display.specshow(db_spec, y_axis='mel', fmax=8000, x_axis='s', sr=sr)
plt.colorbar()
plt.show()

print("===========================频谱图=================================")
## 4. Synthesize a test tone and save it to disk
# The tone gets its own rate variable so that `sr` keeps describing `data`,
# which is still used together with `sr` further down.
tone_sr = 22050  # sampling rate of the synthesized tone, in Hz
T = 5.0  # duration in seconds
t = np.linspace(0, T, int(T * tone_sr), endpoint=False)  # sample instants
x = 0.5 * np.sin(2 * np.pi * 220 * t)  # 220 Hz sine wave

# An Audio object is only rendered automatically when it is the last
# expression of a cell, so it has to be displayed explicitly here.
IPython.display.display(Audio(x, rate=tone_sr))
sf.write('../../sample-files/SAMPLE AUDIO FILES/WAV_1MG_save.wav', x, tone_sr, subtype='PCM_24')

## 5. Feature extraction, used as input for deep-learning models

## 6. Short-time Fourier transform: framing, windowing and the DFT in one step

### librosa.stft returns a complex matrix D indexed as D[frequency_bin, frame]
print("=====================开始打印短时傅里叶变换=====================")
D = librosa.stft(data)
print(D.shape, D.dtype)


In [None]:
## 7. MFCC: n_mfcc sets how many cepstral coefficients are returned per frame
##    (13 and 40 are the usual choices)
print("=====================开始打印MFCC=====================")
mfcc = librosa.feature.mfcc(
    y=data,
    sr=sr,
    n_mfcc=40,
)
print(mfcc.shape)

## 8. Log-mel spectrogram: mel power spectrogram converted to dB (peak = 0 dB)
melspec = librosa.feature.melspectrogram(
    y=data,
    sr=sr,
    n_fft=1024,
    hop_length=512,
    n_mels=128,
)
log_melspec = librosa.power_to_db(melspec, ref=np.max)
print(log_melspec.shape)

## 9. Built-in example recordings (other ids exist, e.g. 'brahms')
example_path = librosa.util.example('nutcracker')

## 10. Load the selected built-in recording
y, sr = librosa.load(example_path)
print(f'取样频率={sr},总样本数={y.shape}')

## 11. Play it back through the IPython audio widget
Audio(y, rate=sr, autoplay=True)


In [None]:
## 12. Pick a built-in recording by its id, load it and play it
example_path = librosa.util.example('brahms')
y, sr = librosa.load(example_path)

print(f'取样频率={sr},总样本数={y.shape}')

Audio(y, rate=sr, autoplay=True)

In [None]:
## 13. Audio processing and conversion
## 14. Resampling: derive a lower-rate signal from an existing (usually
##     higher-rate) recording
sr_new = 11000  # target sampling rate in Hz

# The result is stored under its own name instead of overwriting `y`:
# later cells pass `y` together with `sr`, so replacing `y` with the
# resampled signal would silently pair it with the wrong sampling rate
# (and re-running this cell would resample an already resampled signal).
y_resampled = librosa.resample(y, orig_sr=sr, target_sr=sr_new)
print(len(y_resampled), sr_new)
Audio(y_resampled, rate=sr_new, autoplay=True)


In [None]:
## 15. Split the signal into its harmonic and percussive parts
y_h, y_p = librosa.effects.hpss(y)

# Mel power spectrogram of each part
spec_h = librosa.feature.melspectrogram(y=y_h, sr=sr)
spec_p = librosa.feature.melspectrogram(y=y_p, sr=sr)

# Power -> decibels, each relative to its own peak
db_spec_h = librosa.power_to_db(spec_h, ref=np.max)
db_spec_p = librosa.power_to_db(spec_p, ref=np.max)

# Harmonic part in the upper panel, percussive part in the lower one
for row, db_spec_part in enumerate((db_spec_h, db_spec_p), start=1):
    plt.subplot(2, 1, row)
    librosa.display.specshow(db_spec_part, y_axis='mel', fmax=8000, x_axis='s', sr=sr)
    plt.colorbar()

plt.tight_layout()

In [None]:
## 16. Estimate the tempo (beats per minute) and the beat frame positions
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
print(tempo)

## 17. The harmonic and percussive parts can be played back separately
# The Audio object is not the last expression of the cell, so it must be
# displayed explicitly; otherwise no player is rendered at all.
IPython.display.display(IPython.display.Audio(y_h, rate=sr))

print("==============================")
# To listen to the percussive part instead, display an Audio object built from y_p.

In [None]:
## 18. Chromagram of the harmonic part (energy per pitch class / semitone)
chroma = librosa.feature.chroma_cqt(y=y_h, sr=sr)

plt.figure(figsize=(18, 5))
librosa.display.specshow(
    chroma,
    sr=sr,
    x_axis='time',
    y_axis='chroma',
    vmin=0,
    vmax=1,
)
plt.title('Chromagram')
plt.colorbar()

# NOTE(review): this figure is titled "Spectrogram" but draws the same chroma
# matrix again (without the fixed colour range) -- confirm what was intended.
plt.figure(figsize=(18, 5))
plt.title("Spectrogram")
librosa.display.specshow(chroma, sr=sr, x_axis='s', y_axis='chroma')

## 20. Decompose the magnitude spectrogram into components and their
##     activations (non-negative matrix factorization, conceptually similar to PCA)
D = librosa.stft(y)
S, phase = librosa.magphase(D)
componets, activations = librosa.decompose.decompose(S, n_components=8, sort=True)

## 21. Show the components next to their activations
components_db = librosa.amplitude_to_db(np.abs(componets), ref=np.max)

plt.figure(figsize=(12, 4))

# Left panel: spectral shape of every component
plt.subplot(1, 2, 1)
librosa.display.specshow(components_db, y_axis='log')
plt.xlabel('Componet')
plt.ylabel('Frequency')
plt.title('Components')

# Right panel: activation of every component over time
plt.subplot(1, 2, 2)
librosa.display.specshow(activations, x_axis='time')
plt.xlabel('Time')
plt.ylabel('Componet')
plt.title('Activations')

plt.tight_layout()

In [None]:
# 21. Rebuild the audio from all components and activations; the result
#     should sound like the original recording
D_k = np.dot(componets, activations)

# Re-attach the original phase before inverting the STFT
y_k = librosa.istft(D_k * phase)

# Playback
print('Full reconstruction')
IPython.display.Audio(data=y_k, rate=sr)


In [None]:
# 22. Rebuild the audio from the first component/activation pair only; the
#     result differs strongly from the original recording
k = 0
component_k = componets[:, k]
activation_k = activations[k]
D_k = np.outer(component_k, activation_k)

# Re-attach the original phase before inverting the STFT
y_k = librosa.istft(D_k * phase)

print('Component #{}'.format(k))
IPython.display.Audio(data=y_k, rate=sr)

In [None]:
#23. Pre-emphasis: 用途为高频加强。人类对高频信号不敏感,所以利用此技巧,补强音频里高频的部分
import matplotlib.pyplot as plt

# Load a 10-second excerpt starting 30 seconds into the file and apply
# the pre-emphasis filter to it
y, sr = librosa.load(wav_file, offset=30, duration=10)
y_filt = librosa.effects.preemphasis(y)

# dB magnitude spectrograms of the original and the filtered signal
stft_orig = librosa.stft(y)
stft_preemph = librosa.stft(y_filt)
S_orig = librosa.amplitude_to_db(np.abs(stft_orig), ref=np.max)
S_preemh = librosa.amplitude_to_db(np.abs(stft_preemph), ref=np.max)

# Upper panel: original signal
plt.subplot(2, 1, 1)
librosa.display.specshow(S_orig, y_axis='log', x_axis='time')
plt.title('Original signal')

# Lower panel: pre-emphasized signal, to compare the high-frequency content
plt.subplot(2, 1, 2)
librosa.display.specshow(S_preemh, y_axis='log', x_axis='time')
fig = plt.title('Pre-emphasized signal')

plt.tight_layout()

In [None]:
from sklearn.preprocessing import minmax_scale
import librosa
import matplotlib.pyplot as plt

# 24. Normalization: scale the features before feeding them to a machine-learning
#     model; besides helping accuracy this can speed up optimizer convergence

wav_file = '../../sample-files/SAMPLE AUDIO FILES/WAV_1MG.wav'
data, sr = librosa.load(wav_file, offset=30, duration=10)

# Upper panel: the waveform as loaded
plt.subplot(2, 1, 1)
librosa.display.waveshow(data, sr=sr, alpha=0.4)
plt.title('Original Waveform')

# Lower panel: the same samples after min-max scaling
data_scaled = minmax_scale(data)
plt.subplot(2, 1, 2)
fig = plt.plot(data_scaled, color='r')
plt.title('Normalized Waveform')

plt.tight_layout()
plt.show()


In [None]:
'''
    特征提取MFCC、Filter bank向量
'''
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc,logfbank

## 1. Read the WAV file
wav_file = '../../sample-files/SAMPLE AUDIO FILES/WAV_1MG.wav'
sr, data = wavfile.read(wav_file)

# wavfile.read yields an (n_samples, n_channels) array for multi-channel
# files; the feature extractors are fed a single 1-D signal, so average
# the channels down to mono first. Mono input is left untouched.
if data.ndim > 1:
    data = data.mean(axis=1)

## 2. Extract the MFCC and log filter-bank features
mfcc_features = mfcc(data, sr)
filter_bank_features = logfbank(data, sr)

## Report the feature matrix shapes
print('MFCC Shape:', mfcc_features.shape)
print('Filter Bank Shape:', filter_bank_features.shape)

## 3. Visualize both feature matrices
# The keyword is `extent` (not `extend`) and its upper y bound must be the
# number of rows, not a row of the array. origin='lower' puts row 0 at y=0
# so that the tick labels match the row indices.
plt.subplot(2, 1, 1)
mfcc_features = mfcc_features.T  # transposed before plotting so that columns run along x
plt.imshow(mfcc_features, cmap=plt.cm.jet,
           extent=[0, mfcc_features.shape[1], 0, mfcc_features.shape[0]],
           aspect='auto', origin='lower')
plt.title('MFCC')

plt.subplot(2, 1, 2)
filter_bank_features = filter_bank_features.T
plt.imshow(filter_bank_features, cmap=plt.cm.jet,
           extent=[0, filter_bank_features.shape[1], 0, filter_bank_features.shape[0]],
           aspect='auto', origin='lower')
plt.title('Filter Bank')
plt.tight_layout()
plt.show()