# In [10]:
"""
Provide features used in speaker separation.
Provides: MFCC, filter bank, log filter bank, RASTA-PLP

Author: Ren Yuan (Peter) Xue
"""

import numpy as np
import functions as func
from scipy.fftpack import dct

def mfcc(signal, sample_rate=None, pre_emph=0.97, window_len=0.025, hop_size=0.01,
         num_fft=None, freq_min=0, freq_max=None, num_mel_filter=26, norm=False):
    """
    Function mfcc calculates the MFCC feature from an audio signal.

    The log filter bank feature is computed with logfbank (without mean
    normalization) and a type-2 orthonormal DCT is applied along axis 1.

    @param signal: Audio signal.
    @param sample_rate: The sample rate of audio signal we are working with.
    @param pre_emph: Coefficient used in pre-emphasis filter. Default is 0.97.
    @param window_len: Time interval we are taking within frames. Default is 0.025.
    @param hop_size: Time step we are taking between frames. Default is 0.01.
    @param num_fft: Number of FFT points. Default is calculated using get_num_fft.
    @param freq_min: Lowest frequency band edge of Mel filters in Hz. Default is 0.
    @param freq_max: Highest frequency band edge of Mel filters in Hz. Default is sample rate / 2.
    @param num_mel_filter: Number of filter points in filter banks on Mel scale.
    @param norm: Whether or not perform mean normalization. Default is False.
                 When set, the mean over axis 0 is subtracted and 1e-8 is added.
    @returns: The MFCC feature from audio signal. On invalid input an error
              message string is returned instead (no exception is raised).
    """
    # Handle exceptions.
    # Invalid input is reported by returning a message string; this convention
    # is shared by the other feature functions in this file.
    if not sample_rate: # Check samplerate input validness.
        return 'Invalid input for sample_rate'
    if freq_max and freq_max > sample_rate / 2: # Check maximum frequency input validness.
        return 'Invalid input for freq_max'

    # Normalization (if requested) is applied to the cepstra below, so the
    # log filter bank itself is always requested un-normalized.
    filter_banks = logfbank(signal, sample_rate, pre_emph, window_len, hop_size,
                            num_fft, freq_min, freq_max, num_mel_filter, False)
    # NOTE(review): fbank builds its result as np.dot(filters, power_spectrum.T);
    # confirm that axis 1 of that array is the Mel-filter axis the DCT should run over.
    coefficients = dct(filter_banks, type=2, axis=1, norm='ortho')
    if not norm:
        return coefficients
    return coefficients - np.mean(coefficients, axis=0) + 1e-8
    
    
def fbank(signal, sample_rate=None, pre_emph=0.97, window_len=0.025, hop_size=0.01,
         num_fft=None, freq_min=0, freq_max=None, num_mel_filter=26, norm=False):
    """
    Function fbank calculates the filter bank feature from an audio signal.

    The signal is pre-emphasized, its power spectrum is computed, and the
    Mel filters are applied to it. Exact zeros in the result are replaced by
    machine epsilon so that a later logarithm stays finite.

    @param signal: Audio signal.
    @param sample_rate: The sample rate of audio signal we are working with.
    @param pre_emph: Coefficient used in pre-emphasis filter. Default is 0.97.
    @param window_len: Time interval we are taking within frames. Default is 0.025.
    @param hop_size: Time step we are taking between frames. Default is 0.01.
    @param num_fft: Number of FFT points. Default is calculated using get_num_fft.
    @param freq_min: Lowest frequency band edge of Mel filters in Hz. Default is 0.
    @param freq_max: Highest frequency band edge of Mel filters in Hz. Default is sample rate / 2.
    @param num_mel_filter: Number of filter points in filter banks on Mel scale.
    @param norm: Whether or not perform mean normalization. Default is False.
                 When set, the mean over axis 0 is subtracted and 1e-8 is added.
    @returns: The filter bank feature from audio signal. On invalid input an
              error message string is returned instead (no exception is raised).
    """
    # Input validation: problems are reported as message strings.
    if not sample_rate:
        return 'Invalid input for sample_rate'
    nyquist = sample_rate / 2
    if freq_max and freq_max > nyquist:
        return 'Invalid input for freq_max'

    # Fill in defaults for anything the caller left unset (or falsy).
    if not num_fft:
        num_fft = func.get_num_fft(sample_rate, window_len)
    if not freq_max:
        freq_max = int(np.floor(nyquist))

    # Pre-emphasis followed by the power spectrum of the emphasized signal.
    spectrum, _ = func.powspec(func.pre_emphasis(signal, pre_emph),
                               sample_rate, window_len, hop_size, num_fft)

    # Build the Mel filters and apply them to the power spectrum.
    mel_filters = func.get_filter(freq_min, freq_max, num_mel_filter, num_fft, sample_rate)
    energies = np.dot(mel_filters, spectrum.T)

    # Numerical stability: swap exact zeros for machine epsilon.
    tiny = np.finfo(float).eps
    energies = np.where(energies == 0, tiny, energies)

    if norm:
        return energies - np.mean(energies, axis=0) + 1e-8
    return energies


def logfbank(signal, sample_rate=None, pre_emph=0.97, window_len=0.025, hop_size=0.01,
         num_fft=None, freq_min=0, freq_max=None, num_mel_filter=26, norm=False):
    """
    Function logfbank calculates the log filter bank feature from an audio signal.

    The filter bank feature is obtained from fbank (without mean
    normalization) and its natural logarithm is taken.

    @param signal: Audio signal.
    @param sample_rate: The sample rate of audio signal we are working with.
    @param pre_emph: Coefficient used in pre-emphasis filter. Default is 0.97.
    @param window_len: Time interval we are taking within frames. Default is 0.025.
    @param hop_size: Time step we are taking between frames. Default is 0.01.
    @param num_fft: Number of FFT points. Default is calculated using get_num_fft.
    @param freq_min: Lowest frequency band edge of Mel filters in Hz. Default is 0.
    @param freq_max: Highest frequency band edge of Mel filters in Hz. Default is sample rate / 2.
    @param num_mel_filter: Number of filter points in filter banks on Mel scale.
    @param norm: Whether or not perform mean normalization. Default is False.
                 When set, the mean over axis 0 is subtracted and 1e-8 is added.
    @returns: The log filter bank feature from audio signal. On invalid input
              an error message string is returned instead (no exception is raised).
    """
    # Input validation: problems are reported as message strings.
    if not sample_rate:
        return 'Invalid input for sample_rate'
    if freq_max and freq_max > sample_rate / 2:
        return 'Invalid input for freq_max'

    # Resolve defaults here so fbank receives concrete values.
    num_fft = num_fft if num_fft else func.get_num_fft(sample_rate, window_len)
    freq_max = freq_max if freq_max else int(np.floor(sample_rate / 2))

    # Un-normalized filter bank energies, moved to the log domain.
    log_energies = np.log(
        fbank(signal, sample_rate, pre_emph, window_len, hop_size,
              num_fft, freq_min, freq_max, num_mel_filter, False)
    )

    if norm:
        return log_energies - np.mean(log_energies, axis=0) + 1e-8
    return log_energies


def rasta_plp(signal, window_len=0.025, hop_size=0.010, sample_rate=None, dorasta=True, model_order=8):
    """
    Function rasta_plp calculates the RASTA-PLP feature from an audio signal.

    @param signal: Audio signal.
    @param window_len: Time interval we are taking within frames. Default is 0.025.
    @param hop_size: Time step we are taking between frames. Default is 0.010.
    @param sample_rate: The sample rate of audio signal we are working with.
    @param dorasta: Perform RASTA filtering or not (any truthy value enables it). Default is True.
    @param model_order: Order of the model. Default is 8. A value <= 0 skips
                        the LPC smoothing and derives the cepstra directly
                        from the post-processed spectrum.
    @returns: A tuple (cepstra, spectra). On invalid input an error message
              string is returned instead (no exception is raised).
    """
    # Handle exceptions.
    if not sample_rate: # Check samplerate input validness.
        return 'Invalid input for sample_rate'

    # Initialize variables.
    num_fft = func.get_num_fft(sample_rate, window_len)
    power_spectrum, _ = func.powspec(signal, sample_rate, window_len, hop_size, num_fft)
    # audspec is fed the transposed power spectrum.
    power_spectrum = power_spectrum.T
    aspectrum = func.audspec(power_spectrum, sample_rate)
    num_bands = aspectrum.shape[0]

    if dorasta:
        log_aspectrum = np.log(aspectrum) # Put in log domain.
        ras_log_aspectrum = func.rasta_filter(log_aspectrum) # Next, do RASTA filtering.
        aspectrum = np.exp(ras_log_aspectrum) # Do inverse log.

    # Do final auditory compressions.
    post_spectrum, _ = func.postaud(aspectrum, sample_rate/2)

    if model_order > 0:
        # LPC analysis.
        lpcas = func.dolpc(post_spectrum, model_order)
        # Convert lpc to cepstra.
        cepstra = func.lpc2cep(lpcas, model_order + 1)
        # Convert lpc to spectra; the two extra return values are not needed here.
        spectra, _, _ = func.lpc2spec(lpcas, num_bands)
    else:
        # No LPC smoothing of spectrum: use the post-processed spectrum as is.
        spectra = post_spectrum
        cepstra = func.spec2cep(spectra)

    cepstra = func.lifter(cepstra, 0.6)
    return cepstra, spectra

# In [None]:
def main():
    """Print a fixed greeting to standard output."""
    greeting = 'hello world'
    print(greeting)


# Run main only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()