Imports

In [38]:
from __future__ import print_function
import os
import numpy as np
import sys
import subprocess
import tarfile
import xml.etree.ElementTree as ET
from IPython.display import display, Image
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from six.moves import range
from yaafelib import *

In [115]:
class IsmisFeatures:
    
    def __init__(self, audiofile):
        self.audiofile = audiofile
        
        self.temporal_centroid = None      #1: Temporal Centroid
        self.spectral_centroid = None      #2: Spectral Centroid average value
        self.ase_per_band_avg = []         #4-37: Audio Spectrum Envelope (ASE) average values in 34 frequency bands
        self.ase_avg = None                #38: ASE average value (averaged for all frequency bands)
        self.ase_per_band_var = []         #39-72: ASE variance values in 34 frequency bands
        self.ase_var_avg = None            #73: averaged ASE variance parameters
        self.centroid_avg = None           #74: Audio Spectrum Centroid – average
        self.centroid_var = None           #75: Audio Spectrum Centroid – variance
        self.spread_avg = None             #76: Audio Spectrum Spread – average
        self.spread_var = None             #77: Audio Spectrum Spread – variance
        self.sfm_per_band_avg = []         #78-101: Spectral Flatness Measure (SFM) average values for 24 frequency bands
        self.sfm_avg = None                #102: SFM average value (averaged for all frequency bands)
        self.sfm_per_band_var = []         #103-126: Spectral Flatness Measure (SFM) variance values for 24 frequency bands
        self.sfm_var_avg = None            #127: averaged SFM variance parameters
        self.mfcc = []                     #128-147: 20 first mel cepstral coefficients average values
        self.fv = []
        
    def export_features(self):
        self.fv.append(self.temporal_centroid)
        self.fv.append(self.spectral_centroid)
        self.fv.extend(self.ase_per_band_avg)
        self.fv.append(self.ase_avg)
        self.fv.extend(self.ase_per_band_var)
        self.fv.append(self.ase_var_avg)
        self.fv.append(self.centroid_avg)
        self.fv.append(self.centroid_var)
        self.fv.append(self.spread_avg)
        self.fv.append(self.spread_var)
        self.fv.extend(self.sfm_per_band_avg)
        self.fv.append(self.sfm_avg)
        self.fv.extend(self.sfm_per_band_var)
        self.fv.append(self.sfm_var_avg)
        self.fv.extend(self.mfcc.tolist()[0])
    
        
    def extract_features(self):
        if os.path.exists(self.audiofile):
            print('Getting features from ' + self.audiofile)
        else:
            raise Exception('File ' + self.audiofile + ' not found')
            
        self.extract_mpeg7_features()
        self.extract_mfcc()
        
    def extract_mpeg7_features(self):
        ns = {'xmlns' : 'urn:mpeg:mpeg7:schema:2001',
              'mpeg7' : 'urn:mpeg:mpeg7:schema:2001',
              'xsi' : 'http://www.w3.org/2001/XMLSchema-instance',
              'xsi:schemaLocation' : 'urn:mpeg:mpeg7:schema:2001 http://mpeg7audioenc.sourceforge.net/mpeg7audioenc.xsd'} 

        result = subprocess.check_output(['java', '-jar', 'MPEG7AudioEnc.jar', self.audiofile, 'mpeg7config.xml'])
        root = ET.fromstring(result)

        temporal_centroid_xml = root.find(".//mpeg7:AudioDescriptor[@xsi:type='TemporalCentroidType']", ns)
        spectral_cetroid_xml = root.find(".//mpeg7:AudioDescriptor[@xsi:type='SpectralCentroidType']", ns)
        audio_spectrum_centroid = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumCentroidType']", ns)
        audio_spectrum_spread = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumSpreadType']", ns)
        audio_spectrum_envelope = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumEnvelopeType']", ns)
        audio_spectrum_flatness = root.find(".//mpeg7:AudioDescriptor[@xsi:type='AudioSpectrumFlatnessType']", ns)

        self.temporal_centroid = self.parse_xml_scalar(temporal_centroid_xml, ns)
        self.spectral_centroid = self.parse_xml_scalar(spectral_cetroid_xml, ns)
        
        envelope_values = self.parse_2d_xml_vector(audio_spectrum_envelope, ns)          
        self.ase_per_band_avg = [np.mean(band) for band in envelope_values]
        self.ase_avg = np.mean(self.ase_per_band_avg)
        self.ase_per_band_var = [np.var(band) for band in envelope_values]
        self.ase_var_avg = np.mean(self.ase_per_band_var)
        
        centroid_values = self.parse_xml_vector(audio_spectrum_centroid, ns)
        self.centroid_avg = np.mean(centroid_values)
        self.centroid_var = np.var(centroid_values)
        
        spread_values = self.parse_xml_vector(audio_spectrum_spread, ns)
        self.spread_avg = np.mean(spread_values)
        self.spread_var = np.var(spread_values)
        
        flatness_values = self.parse_2d_xml_vector(audio_spectrum_flatness, ns)
        self.sfm_per_band_avg = [np.mean(band) for band in flatness_values]
        self.sfm_avg = np.mean(self.sfm_per_band_avg)
        self.sfm_per_band_var = [np.var(band) for band in flatness_values]
        self.sfm_var_avg = np.mean(self.ase_per_band_var)
    
    def parse_xml_scalar(self, element, ns):
        return float(element.find(".//mpeg7:Scalar", ns).text)
    
    def parse_xml_vector(self, element, ns):
        values_splitted = element.find(".//mpeg7:Raw", ns).text.split()
        return map((lambda x: float(x)), values_splitted)
    
    def parse_2d_xml_vector(self, element, ns):
        values_string = element.find(".//mpeg7:Raw", ns).text
        values_splitted = [s.strip().split() for s in values_string.splitlines()]
        values = [map((lambda x: float(x)), value) for value in values_splitted] #cast to float
        transposed_values = list(map(list, zip(*values))) #transpose matrix to have 1 long vector per 1 band
        return transposed_values 
    
    def extract_mfcc(self):
        fp = FeaturePlan(sample_rate=22050, normalize=1)
        fp.addFeature('mfcc: MFCC CepsNbCoeffs=20')
        df = fp.getDataFlow()
        engine = Engine()
        engine.load(df)
        afp = AudioFileProcessor()
        afp.setOutputFormat('csv', 'features', {'Precision': '8', 'Metadata': 'False'})
        afp.processFile(engine, self.audiofile)
        engine.flush()
        feats = engine.readAllOutputs()
        self.mfcc = feats['mfcc']

In [117]:
root = os.path.join(os.getcwd(),'genres')

genres = os.listdir(root)
mappings = dict(enumerate(genres))
mappings_rev = {v: k for k, v in mappings.items()}
dataset_xs = []
dataset_y = []

for folder in os.listdir(root):
    data_folder = os.path.join(root, folder)
    filenames = os.listdir(data_folder)
    for filename in filenames:
        path=os.path.join(data_folder, filename)
        genre = mappings_rev[folder]
        ismisFeature = IsmisFeatures(path)
        ismisFeature.extract_features()
        ismisFeature.export_features()
        dataset_xs.append(ismisFeature.fv)
        dataset_y.append(genre)

Getting features from /notebooks/genres/classical/classical.00015.au
Getting features from /notebooks/genres/classical/classical.00061.au
Getting features from /notebooks/genres/classical/classical.00012.au
Getting features from /notebooks/genres/classical/classical.00050.au
Getting features from /notebooks/genres/classical/classical.00098.au
Getting features from /notebooks/genres/classical/classical.00086.au
Getting features from /notebooks/genres/classical/classical.00002.au
Getting features from /notebooks/genres/classical/classical.00099.au
Getting features from /notebooks/genres/classical/classical.00054.au
Getting features from /notebooks/genres/classical/classical.00090.au
Getting features from /notebooks/genres/classical/classical.00060.au
Getting features from /notebooks/genres/classical/classical.00091.au
Getting features from /notebooks/genres/classical/classical.00006.au
Getting features from /notebooks/genres/classical/classical.00077.au
Getting features from /notebooks/g

In [132]:
print(dataset_y)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 

In [129]:
def evens(dataset):
    return dataset[::2]
def odds(dataset):
    return dataset[1::2]

train_dataset_xs = evens(dataset_xs)
train_dataset_y = evens(dataset_y)
test_dataset_xs = odds(dataset_xs)
test_dataset_y = odds(dataset_y)

def randomize(dataset_xs, dataset_y):
    permutation = np.random.permutation(len(dataset_y))
    shuffled_dataset_xs = np.asarray(dataset_xs)[permutation]
    shuffled_dataset_y = np.asarray(dataset_y)[permutation]
    return shuffled_dataset_xs, shuffled_dataset_y

train_dataset_xs, train_dataset_y = randomize(train_dataset_xs,train_dataset_y)
test_dataset_xs, test_dataset_y = randomize(test_dataset_xs, test_dataset_y)

In [121]:
try:
    f = open('data/gztan.pickle', 'wb')
    save = {
        'train_dataset_xs': train_dataset_xs,
        'train_dataset_y': train_dataset_y,
        'test_dataset_xs': test_dataset_xs,
        'test_dataset_y': test_dataset_y,
        'mappings': mappings
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise