Imports

In [87]:
from __future__ import print_function
import os
import numpy as np
import sys
import subprocess
import tarfile
import xml.etree.ElementTree as ET
from IPython.display import display, Image
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from six.moves import range
from yaafelib import *


In [2]:
# Sample track (path relative to the notebook's working directory) used by the
# feature-extraction cells below.
test_track = 'data/genres/hiphop/hiphop.00049.au'


In [84]:
class IsmisFeatures(object):
    """Holds the ISMIS feature vector (features #1-#147) of one audio file.

    MPEG-7 low-level descriptors are produced by running the external
    ``MPEG7AudioEnc.jar`` encoder and parsing the XML it prints; MFCCs are
    computed with yaafe. ``temporal_centroid`` and the two
    ``spectral_centroid_*`` attributes are declared but not filled in by any
    method of this class yet.
    """

    # Prefix -> URI map used to resolve the XPath queries on the MPEG-7 output.
    # Kept on the class so every parsing method can reach it.
    MPEG7_NAMESPACES = {
        'mpeg7': 'urn:mpeg:mpeg7:schema:2001',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    }
    ENCODER_JAR = 'MPEG7AudioEnc.jar'
    ENCODER_CONFIG = 'mpeg7config.xml'
    # File (in the current working directory) that receives the encoder output.
    DESCRIPTION_FILE = 'desc.xml'

    def __init__(self, audiofile):
        """Remember the audio file path and reset every feature to "not computed".

        Args:
            audiofile: path of the audio file to analyse.
        """
        self.audiofile = audiofile

        self.temporal_centroid = None      #1: Temporal Centroid
        self.spectral_centroid_avg = None  #2: Spectral Centroid average value
        self.spectral_centroid_var = None  #3: Spectral Centroid variance
        self.ase_per_band_avg = []         #4-37: Audio Spectrum Envelope (ASE) average values in 34 frequency bands
        self.ase_avg = None                #38: ASE average value (averaged for all frequency bands)
        self.ase_per_band_var = []         #39-72: ASE variance values in 34 frequency bands
        self.ase_var_avg = None            #73: averaged ASE variance parameters
        self.centroid_avg = None           #74: Audio Spectrum Centroid - average
        self.centroid_var = None           #75: Audio Spectrum Centroid - variance
        self.spread_avg = None             #76: Audio Spectrum Spread - average
        self.spread_var = None             #77: Audio Spectrum Spread - variance
        self.sfm_per_band_avg = []         #78-101: Spectral Flatness Measure (SFM) average values for 24 frequency bands
        self.sfm_avg = None                #102: SFM average value (averaged for all frequency bands)
        self.sfm_per_band_var = []         #103-126: Spectral Flatness Measure (SFM) variance values for 24 frequency bands
        self.sfm_var_avg = None            #127: averaged SFM variance parameters
        self.mfcc = []                     #128-147: 20 first mel cepstral coefficients average values

    def extract_features(self):
        """Compute the MPEG-7 based features, then the MFCC features."""
        self.extract_mpeg7_features()
        self.extract_mfcc()

    def extract_mpeg7_features(self):
        """Run the MPEG-7 encoder on the audio file and store the derived statistics.

        The encoder's standard output is redirected into ``DESCRIPTION_FILE``,
        which is then parsed.

        Raises:
            subprocess.CalledProcessError: the encoder exited with a non-zero status.
            ValueError: an expected descriptor is missing from the encoder output.
        """
        # ``stdout`` must be a real file object (or descriptor); a file *name*
        # is not accepted by subprocess.
        with open(self.DESCRIPTION_FILE, 'wb') as description:
            subprocess.check_call(
                ['java', '-jar', self.ENCODER_JAR, self.audiofile, self.ENCODER_CONFIG],
                stdout=description)
        self._load_mpeg7_description(self.DESCRIPTION_FILE)

    def _load_mpeg7_description(self, path):
        """Parse the MPEG-7 XML at ``path`` and fill the ASE/centroid/spread/SFM attributes."""
        root = ET.parse(path).getroot()

        envelope_bands = self.parse_2d_xml_vector(
            self._find_descriptor(root, 'AudioSpectrumEnvelopeType'))
        self.ase_per_band_avg = [np.mean(band) for band in envelope_bands]
        self.ase_avg = np.mean(self.ase_per_band_avg)
        self.ase_per_band_var = [np.var(band) for band in envelope_bands]
        self.ase_var_avg = np.mean(self.ase_per_band_var)

        centroid_values = self.parse_xml_vector(
            self._find_descriptor(root, 'AudioSpectrumCentroidType'))
        self.centroid_avg = np.mean(centroid_values)
        self.centroid_var = np.var(centroid_values)

        spread_values = self.parse_xml_vector(
            self._find_descriptor(root, 'AudioSpectrumSpreadType'))
        self.spread_avg = np.mean(spread_values)
        self.spread_var = np.var(spread_values)

        flatness_bands = self.parse_2d_xml_vector(
            self._find_descriptor(root, 'AudioSpectrumFlatnessType'))
        self.sfm_per_band_avg = [np.mean(band) for band in flatness_bands]
        self.sfm_avg = np.mean(self.sfm_per_band_avg)
        self.sfm_per_band_var = [np.var(band) for band in flatness_bands]
        # Average of the SFM variances (not the ASE ones).
        self.sfm_var_avg = np.mean(self.sfm_per_band_var)

    def _find_descriptor(self, root, descriptor_type):
        """Return the first AudioDescriptor whose ``xsi:type`` equals ``descriptor_type``."""
        query = ".//mpeg7:AudioDescriptor[@xsi:type='%s']" % descriptor_type
        descriptor = root.find(query, self.MPEG7_NAMESPACES)
        if descriptor is None:
            raise ValueError('MPEG-7 description has no %s descriptor' % descriptor_type)
        return descriptor

    def _raw_text(self, element):
        """Return the text of the first ``mpeg7:Raw`` descendant ('' when it is empty)."""
        raw = element.find(".//mpeg7:Raw", self.MPEG7_NAMESPACES)
        if raw is None:
            raise ValueError('descriptor has no mpeg7:Raw element')
        return raw.text or ''

    def parse_xml_vector(self, element):
        """Parse a whitespace-separated ``Raw`` series into a list of floats.

        Args:
            element: XML element containing an ``mpeg7:Raw`` descendant.

        Returns:
            list of float, in document order.
        """
        # A real list (not a lazy ``map``) so numpy can reduce it on Python 3.
        return [float(token) for token in self._raw_text(element).split()]

    def parse_2d_xml_vector(self, element):
        """Parse a multi-line ``Raw`` matrix and return it transposed.

        Each non-blank text line is one row of whitespace-separated numbers.
        The rows are transposed so that each returned list holds one column
        (one frequency band across all rows).

        Args:
            element: XML element containing an ``mpeg7:Raw`` descendant.

        Returns:
            list of lists of float, one inner list per column.
        """
        # Blank lines (e.g. a newline right after <Raw>) are skipped: an empty
        # row would make zip() below return nothing at all.
        rows = [[float(token) for token in line.split()]
                for line in self._raw_text(element).splitlines()
                if line.strip()]
        return [list(band) for band in zip(*rows)]

    def extract_mfcc(self):
        """Compute 20 MFCCs with yaafe, store their averages and return the raw outputs.

        Returns:
            The mapping returned by ``Engine.readAllOutputs()``; the MFCC array
            is under the ``'mfcc'`` key.
        """
        fp = FeaturePlan(sample_rate=22050, normalize=1)
        fp.addFeature('mfcc: MFCC CepsNbCoeffs=20')
        df = fp.getDataFlow()
        engine = Engine()
        engine.load(df)
        afp = AudioFileProcessor()
        # NOTE(review): presumably this also makes yaafe write CSV output under
        # 'features' - confirm against the yaafe documentation.
        afp.setOutputFormat('csv', 'features', {'Precision': '8', 'Metadata': 'False'})
        afp.processFile(engine, self.audiofile)
        engine.flush()
        feats = engine.readAllOutputs()
        print(feats['mfcc'])
        # Assumes rows are frames and columns are coefficients (the recorded
        # output has shape (1, 20)) - averaging over rows gives 20 values.
        self.mfcc = np.mean(feats['mfcc'], axis=0).tolist()
        return feats

In [86]:
# Run the full extraction (MPEG-7 descriptors first, then MFCC) on the sample track.
ismisFeature = IsmisFeatures(test_track)
ismisFeature.extract_features()

#print(ismisFeature.ase_per_band_avg)

#print(vars(ismisFeature))

[[ 0.06149227  1.59239392  1.31847123  0.20475314  0.35536394 -0.16618184
   0.61475064  0.66483616  0.04542616  0.05377903 -0.09706389 -0.02961214
   0.02836009  0.07020499  0.08470754 -0.01626443  0.25405867  0.15249978
  -0.2412863  -0.25502605]]


In [81]:
def get_ismiss_feature_plan():
    """Create the yaafe feature plan used for the ISMIS feature set.

    Only the 20-coefficient MFCC feature is registered so far.

    Returns:
        The configured ``FeaturePlan`` (22050 Hz sample rate, normalize=1).
    """
    plan = FeaturePlan(sample_rate=22050, normalize=1)
    # Starting with the features that seem to be used most often.
    # TODO: #1 - Temporal Centroid [0]
    # TODO: #2 - Spectral Centroid (3?)
    plan.addFeature('mfcc: MFCC CepsNbCoeffs=20')
    return plan

def get_features(audio_file, feature_plan):
    """Run a yaafe feature plan over one audio file and return all outputs.

    Args:
        audio_file: path of the audio file to process.
        feature_plan: yaafe ``FeaturePlan`` describing the features to compute.

    Returns:
        Whatever ``Engine.readAllOutputs()`` yields after processing the file.

    Raises:
        Exception: ``audio_file`` does not exist.
    """
    # Guard clause: refuse to go on without an input file.
    if not os.path.exists(audio_file):
        raise Exception('File ' + audio_file + ' not found')
    print('Getting features from ' + audio_file)

    dataflow = feature_plan.getDataFlow()
    engine = Engine()
    engine.load(dataflow)

    processor = AudioFileProcessor()
    output_options = {'Precision': '8', 'Metadata': 'False'}
    processor.setOutputFormat('csv', 'features', output_options)
    processor.processFile(engine, audio_file)

    engine.flush()
    return engine.readAllOutputs()

# Build the MFCC-only feature plan and run it on the sample track.
fp = get_ismiss_feature_plan()
features = get_features(test_track, fp)

# Show the raw result, then each feature name with the shape of its output array.
print(features)
for i in features:
    print(i, features[i].shape)

Getting features from data/genres/hiphop/hiphop.00049.au
{'mfcc': array([[ 0.06149227,  1.59239392,  1.31847123,  0.20475314,  0.35536394,
        -0.16618184,  0.61475064,  0.66483616,  0.04542616,  0.05377903,
        -0.09706389, -0.02961214,  0.02836009,  0.07020499,  0.08470754,
        -0.01626443,  0.25405867,  0.15249978, -0.2412863 , -0.25502605]])}
mfcc (1, 20)


In [30]:
def get_avg_and_var_value(element, namespaces=None):
    """Return the mean and variance of the numbers in an element's ``mpeg7:Raw`` text.

    Args:
        element: XML element with an ``mpeg7:Raw`` descendant holding
            whitespace-separated numbers.
        namespaces: optional prefix -> URI map for the XPath lookup. Defaults
            to the MPEG-7 schema namespace, so the function does not rely on a
            global ``ns`` left behind by another cell.

    Returns:
        ``[avg, var]`` computed with ``np.mean`` and ``np.var``.

    Raises:
        ValueError: the element has no ``mpeg7:Raw`` descendant.
    """
    if namespaces is None:
        namespaces = {'mpeg7': 'urn:mpeg:mpeg7:schema:2001'}
    raw = element.find(".//mpeg7:Raw", namespaces)
    if raw is None:
        raise ValueError('element has no mpeg7:Raw descendant')
    # Materialise a list: on Python 3 ``map`` is lazy and numpy cannot reduce it.
    values = [float(token) for token in (raw.text or '').split()]
    avg = np.mean(values)
    var = np.var(values)
    return [avg, var]

def parse_2d_xml_vector(element, namespaces=None):
    """Parse a multi-line ``mpeg7:Raw`` matrix, print it and return it.

    Each non-blank line of the ``Raw`` text becomes one row of floats.

    Args:
        element: XML element with an ``mpeg7:Raw`` descendant.
        namespaces: optional prefix -> URI map for the XPath lookup. Defaults
            to the MPEG-7 schema namespace, so the function does not rely on a
            global ``ns`` left behind by another cell.

    Returns:
        list of lists of float, one inner list per text line (not transposed).

    Raises:
        ValueError: the element has no ``mpeg7:Raw`` descendant.
    """
    if namespaces is None:
        namespaces = {'mpeg7': 'urn:mpeg:mpeg7:schema:2001'}
    raw = element.find(".//mpeg7:Raw", namespaces)
    if raw is None:
        raise ValueError('element has no mpeg7:Raw descendant')
    # Real lists (not ``map`` objects) so the printout shows numbers on
    # Python 3; blank lines are skipped instead of producing empty rows.
    values = [[float(token) for token in line.split()]
              for line in (raw.text or '').splitlines()
              if line.strip()]
    print(values)
    return values

def get_mpeg7_features(description_path='desc.xml'):
    """Parse an MPEG-7 description file and print its Audio Spectrum Envelope values.

    Args:
        description_path: path of the MPEG-7 XML description to read.
            Defaults to ``'desc.xml'`` in the current working directory.

    Raises:
        ValueError: the description has no AudioSpectrumEnvelopeType descriptor.
    """
    ns = {'mpeg7' : 'urn:mpeg:mpeg7:schema:2001',
          'xsi' : 'http://www.w3.org/2001/XMLSchema-instance'}

    root = ET.parse(description_path).getroot()

    # Only the envelope descriptor is consumed at the moment; the centroid,
    # spread and flatness lookups were never used and have been dropped.
    audio_spectrum_envelope = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumEnvelopeType']", ns)
    if audio_spectrum_envelope is None:
        raise ValueError('No AudioSpectrumEnvelopeType descriptor in ' + description_path)

    # The helper prints the parsed values.
    parse_2d_xml_vector(audio_spectrum_envelope)


# Parse desc.xml and print the Audio Spectrum Envelope values.
get_mpeg7_features()

[[0.00013159691, 3.677493e-05, 1.4876396e-05, 1.4634449e-05, 1.5984611e-05, 1.4689602e-05, 2.2926948e-05, 4.5766603e-05, 3.232223e-05, 5.265978e-05, 1.5929498e-05, 0.00013207793, 3.124088e-05, 1.5872592e-05, 1.7165137e-05, 6.843315e-06, 3.9190936e-06, 6.5794725e-06, 3.5637502e-06, 3.0114128e-05, 9.253123e-05, 9.179908e-06, 9.214595e-06, 1.4721468e-05, 3.66119e-06, 1.000826e-05, 2.716277e-06, 2.7227363e-06, 3.3541664e-06, 1.1244866e-06, 4.2207193e-09, 0.0, 0.0, 0.0], [0.00016826161, 2.14326e-05, 3.6590063e-06, 1.0474137e-05, 1.9889994e-05, 1.3559657e-05, 1.2362689e-05, 4.5285396e-05, 2.5595149e-05, 3.2177886e-05, 4.0173072e-05, 4.5378834e-05, 3.1010073e-05, 2.6013367e-06, 3.341895e-05, 4.0911877e-06, 1.5387766e-06, 6.5155127e-06, 4.413986e-06, 2.8027973e-05, 0.00010784113, 7.582274e-06, 6.851435e-06, 1.461066e-05, 4.907482e-06, 6.184266e-06, 3.1935235e-06, 2.1006852e-06, 3.6703689e-06, 7.625084e-07, 4.212595e-09, 0.0, 0.0, 0.0], [9.1863825e-05, 2.2970933e-05, 1.1594859e-05, 1.7249668e-0