In [77]:
import numpy as np
import IPython
import scipy.io.wavfile
import scipy.signal as sig
import matplotlib.pyplot as plt
import numpy.random as rng
import os
import librosa
from sklearn.model_selection import train_test_split 
from sklearn.mixture import GaussianMixture 

## Part 1

In [105]:
# STFT code from lab 1
from numpy.fft import rfft, irfft, fftfreq
def foward_transform(input_sound, dft_size, hop_size, zero_pad, window):
    input_len = len(input_sound)
    idx = 0
    segments = []
    while idx+dft_size < input_len:
        s = input_sound[idx:idx+dft_size]
        segments.append(np.multiply(s, window))
        idx += hop_size
    else:
        s = np.append(input_sound[idx:-1], np.zeros(idx+dft_size-input_len+1))
        segments.append(np.multiply(s, window))
    frames = np.array(segments)
    
    dft_frames = []
    for idx, x in enumerate(frames):
        dft_frames.append(rfft(x, dft_size + zero_pad))
    dft_frames = np.array(dft_frames, dtype=complex)
    return dft_frames 

def inverse_transform(input_sound, dft_size, hop_size, zero_pad, window):
    leng = hop_size*(len(input_sound)-1)+dft_size+zero_pad
    output = np.zeros(leng, dtype=complex)
    for idx, row in enumerate(input_sound):
        frame = np.multiply(irfft(row, dft_size+zero_pad)[:dft_size], window)
        output[idx*hop_size:idx*hop_size+dft_size] += frame
    return output

def stft( input_sound, dft_size, hop_size, zero_pad, window):
    if input_sound.ndim == 1 and np.isreal(input_sound).all():
        return foward_transform(input_sound, dft_size, hop_size, zero_pad, window)
    else:
        return inverse_transform(input_sound, dft_size, hop_size, zero_pad, window)
    
def plot_stft( input_sound, stft, dft_size, hop_size, zero_pad, frate, title=""):
    output = np.absolute(stft)**0.3
    time = np.linspace(0, output.shape[0] * hop_size / frate,output.shape[0])
    freq = np.linspace(0, frate/2, int((dft_size+zero_pad)/2 + 1))
    plt.pcolormesh(time, freq, output.T)
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (HZ)")
    plt.title(title)
    plt.show()
    
def plot_sound( input_sound, fs, title='input_sound'):
    plt.plot(np.linspace(0, len(input_sound)/fs, len(input_sound)), input_sound)
    plt.title(title)
    plt.xlabel('Time (s)')
    plt.show()
    
def process(input_sound):
    dft_size = 1024
    Hann_window = np.hanning(dft_size)
    hop_size = 128
    zero_pad = 0
    stft_ = stft(input_sound, dft_size, hop_size, zero_pad, Hann_window)
    return np.absolute(stft_)**0.3
    

In [106]:
frate = 0
speech_file = []
for filename in os.listdir("./data/SpeechMusic/speech"):
    frate, file = scipy.io.wavfile.read("./data/SpeechMusic/speech/"+filename)
    speech_file.append(process(file))
    
music_file = []
for filename in os.listdir("./data/SpeechMusic/music"):
    frate, file = scipy.io.wavfile.read("./data/SpeechMusic/music/"+filename)
    music_file.append(process(file))

In [132]:
speech_train, speech_test, music_train, music_test = train_test_split( speech_file, music_file, train_size=50, random_state=32)



In [133]:
speech_train = np.concatenate(np.array(speech_train), axis=0)
music_train = np.concatenate(np.array(music_train), axis=0)

In [156]:
gm_speech = GaussianMixture(5, "diag")
gm_speech = gm_speech.fit(speech_train)
gm_music = GaussianMixture(5, "diag")
gm_music = gm_music.fit(music_train)

In [157]:
correct = 0
for speech in speech_test:
    correct += 1 if gm_speech.score(speech) > gm_music.score(speech) else 0
for music in music_test:
    correct += 1 if gm_speech.score(music) < gm_music.score(music) else 0
accuracy = correct/(len(speech_test)+len(music_test))

In [158]:
print("Correct:",correct,"Test Size:",(len(speech_test)+len(music_test)),"Accuracy:",accuracy)

Correct: 19 Test Size: 20 Accuracy: 0.95


I have ran the code several times and the accuracy varies a lot from 0.65 to 0.95 due to different train/test sets. I set a random_state in test_train_split to fix the data set in order to better optimize the model. On avrage the accuracy would be around 0.85~0.9 or so.

## Part 2

In [232]:
def get_data(dir_name):
    frate = 1293
    arr_file = []
    for filename in os.listdir("./data/genres/"+dir_name):
        file, frate = librosa.core.load("./data/genres/"+dir_name+"/"+filename)
        file = np.array(file)
        mfcc = np.array(librosa.feature.mfcc(file, frate, n_mfcc=60))
        arr_file.append(mfcc)
    train, test = train_test_split(arr_file, test_size=0.5, random_state=35)
    train = np.concatenate(train, axis=1)
    return train, test, frate

Preprocess the train and test data

In [233]:
frate = 0
classical_train, classical_test, frate = get_data('classical')
disco_train, disco_test, frate = get_data('disco')
metal_train, metal_test, frate = get_data('metal')
pop_train, pop_test, frate = get_data('pop')
reggae_train, reggae_test, frate = get_data('reggae')

Train a GaussianMixture classifier for each class

In [234]:
gm_classical = GaussianMixture(10, "diag")
gm_classical = gm_classical.fit(classical_train.T)
gm_disco = GaussianMixture(10, "diag")
gm_disco = gm_disco.fit(disco_train.T)
gm_metal = GaussianMixture(10, "diag")
gm_metal = gm_metal.fit(metal_train.T)
gm_pop = GaussianMixture(10, "diag")
gm_pop = gm_pop.fit(pop_train.T)
gm_reggae = GaussianMixture(10, "diag")
gm_reggae = gm_reggae.fit(reggae_train.T)

evalutate our model

In [235]:
def evaluate(test_arr ,gm_arr):
    correct = 0
    for idx, class_ in enumerate(test_arr):
        for file in class_:
            scores = [gm.score(file.T) for gm in gm_arr]
            if np.argmax(scores) == idx:
                correct += 1
    return correct/ np.sum([len(x) for x in test_arr])

In [236]:
print("Accuracay: ",evaluate([classical_test, disco_test, metal_test, pop_test, reggae_test],[gm_classical, gm_disco, gm_metal, gm_pop, gm_reggae]))

Accuracay:  0.832


## Part 3

In [217]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

### Improving part 1

For Part1 I improved the accuracy from 0.9 to 0.95 by using more guassian components.

To use other classifiers, instead of training a classifier for each class, I am using one single classifier that can directly predict different classes. I appended the training data and provided the corresponding labels.

In [204]:
X = np.append(speech_train, music_train, axis=0)
y = np.append(np.zeros(speech_train.shape[0]), np.ones(music_train.shape[0]), axis=0)
# X is the combined train data
# y is the array of labels: 0 means speech, 1 means music

In [222]:
def part3_eval(speech_clf, speech_test, music_test):
    speech_clf = speech_clf.fit(X, y)
    correct = 0
    for speech in speech_test:
        res = speech_clf.predict(speech)
        if np.count_nonzero(res) < 0.5*len(res):
            correct += 1
    for music in music_test:
        res = speech_clf.predict(music)
        if np.count_nonzero(res) > 0.5*len(res):
            correct += 1
    accuracy = correct/(len(speech_test)+len(music_test))
    print("Correct:",correct,"Test Size:",(len(speech_test)+len(music_test)),"Accuracy:",accuracy)

MLP Classifier

In [224]:
speech_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,))
part3_eval(speech_clf, speech_test, music_test)

Correct: 19 Test Size: 20 Accuracy: 0.95


DecisionTreeClassifier

In [225]:
speech_clf = DecisionTreeClassifier(random_state=0)
part3_eval(speech_clf, speech_test, music_test)

Correct: 18 Test Size: 20 Accuracy: 0.9


GradientBoostingClassifier

In [226]:
speech_clf = GradientBoostingClassifier()
part3_eval(speech_clf, speech_test, music_test)

Correct: 18 Test Size: 20 Accuracy: 0.9


I tried using different classifiers like DecisionTreeClassifier, GradientBoostingClassifier, RandomForestClassifier  but they did not seem to have better performance since the performance was already very good. (And the test dataset is too small)

### Improving part 2 

I found out that the accuracy would improve a lot when I use more components in the GuassianMixture. Therefore I set the component number to 10. This has improved the accuracy from 0.75 to 0.816. Furthermore, I used more components (80) in mfcc and this improved the accuracy to 0.832.

I chose not to use other classifiers for part 2 because there are too many train data and it takes too long to run the code.