# TRANSFERT LEARNING USING THE SOUNDNET MODEL ON THE GTZAN DATABASE

### Import

In [1]:
import soundfile, torch
import torchaudio
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt

import os
import glob

import pandas as pd
import numpy as np

## Model
### Here we are defining the model and loading the pre-trained soundnet model

In [2]:
from pytorch_model import SoundNet8_pytorch
from utils import vector_to_scenes,vector_to_obj

## define the soundnet model
model = SoundNet8_pytorch()

## Load the weights of the pretrained model
model.load_state_dict(torch.load('sound8.pth'))

<All keys matched successfully>

### We define the all the data paths

In [4]:
genre_dict = {"blues":0,"classical":1,"country":2,"disco":3,"hiphop":4,"jazz":5,"metal":6,"pop":7,"reggae":8,"rock":9}

# Get all the audio file path
df = pd.read_csv("features_30_sec.csv")
list_filename = list(df.filename)

# Split the list in sublists by genre
list_filepath = []
for i in range(10):
    list_filepath.append(list_filename[100*i: 100*(i+1)])

# Complete all the paths
list_audiofile_path = []

for genre, key in genre_dict.items():
    sublist = []
    
    for filepath in list_filepath[key]:
        if filepath != "\genre_original\blues\blues_00054.wav":
            path = os.path.join(r".\genres_original", genre , filepath )
            sublist.append(path)
        
    list_audiofile_path.append(sublist)
    
    

We want to isolate the fifth layer features and then re-classify them with a classifier method

After testing we find that the dimension of the fifth layer features are (1,256,81,1)

## Functions 

In [5]:
# Collect the features wanted from the soundnet modelling of the given audiofile_path

def get_soundnet_features(audiofile_path, n_feature):
    waveform, sr = torchaudio.load(audiofile_path)
    # Reshape the data to the format expected by the model (Batch, 1, time, 1)
    waveform = waveform.view(1,1,-1,1)

    # Extract the features we want
    features = model.extract_feat(waveform)  #features 7 and 8 are object_pred and scene_pred

    # Get the good features
    feature = features[n_feature]

    return feature

In [6]:
# Extract all the fifth layer features and so, create the dataset we will perform the classification on

def create_dataset(list_of_path, genre_dict, n_feature):
    X = []
    Y = []

    for genre, key in genre_dict.items():
        for path in list_of_path[key]:
            if path != '.\\genres_original\\jazz\\jazz.00054.wav':
                X.append(np.array(get_soundnet_features(path, n_feature-1)))
                Y.append(genre)

    return X, Y

In [7]:
x_5,y_5 = create_dataset(list_audiofile_path, genre_dict, 5)

In [8]:
#df_x_5 = pd.DataFrame(x_5)
#df_x_5.to_csv("x_5.csv")

#df_y_5 = pd.DataFrame(y_5)
#df_y_5.to_csv("y_5.csv")

  values = np.array([convert(v) for v in values])


ValueError: could not broadcast input array from shape (256,81,1) into shape (1,256)

In [31]:
x = x_5
y = y_5
X = []
Y = []

for idx, xi in enumerate(x):
    if xi.shape == x[0].shape:
        X.append(xi)
        Y.append(y[idx])


X = np.array(X)
X = X.reshape(X.shape[0], -1)

Y = np.array(Y)



x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)


In [None]:
# traintestsplit le dataset fait
# chosir quelle couche je prend de soundet pour le transfert learning fait
# train un nouveau modèle simple (uen simple classification sur scikit-learn)
# comparer les résultats

## CLASSIFICATION 
### We will try different types of classifiers in order to get the best results for the classification and then compare it with the results of these classifiers without this transfert learning process

#### UPDATE : AFTER TESTING SOME CLASSIFIER, WE CHOOSE TO USE THE SVM AS PRECONISED IN THE WORKPAPER

In [32]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

svm = SVC(decision_function_shape="ovo")

def classification(model, x_train, x_test, y_train, y_test):

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(confusion_matrix(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))



In [33]:
# SVM (poly)
model = SVC(C = 2, kernel = "poly")
classification(model, x_train, x_test, y_train, y_test)

[[13  7  2  4  1  0  3  1 10  9]
 [ 1 29  1  0  0  1  1  1  1 10]
 [ 6 11  6  3  0  0  4  0  5 13]
 [10  5  2  8  1  0  5  5  9  8]
 [11  2  1  4  2  1  4  2 12  4]
 [ 5 25  4  0  0  9  2  1  5  5]
 [ 5 16  5  0  1  0  8  2  4 11]
 [ 1  2  2  0  2  1  3 19  4 18]
 [ 8  1  2  5  0  0  2  2 20  5]
 [ 5  8  4  2  0  1  6  2  4 12]]
0.2581967213114754


## ANOTHER TRY WITH THE FEATURES FROM THE ANTEPENULTIMATE LAYER (7)

In [29]:
## define the soundnet model
model = SoundNet8_pytorch()

## Load the weights of the pretrained model
model.load_state_dict(torch.load('sound8.pth'))

x_7,y_7 = create_dataset(list_audiofile_path, genre_dict, 7)

975
(975, 21504)
(975,)
[[ 4  1  9  3  3  0 10  0  6  1]
 [ 1  4 17  0  1  0 11  0  0  0]
 [ 1  0 16  2  2  0  8  0  2  1]
 [ 0  0  6  5  7  0  9  0  7  1]
 [ 1  1  2  1 13  0  7  0  3  0]
 [ 2  6 13  3  1  2  8  0  3  0]
 [ 0  0  9  1  3  0 14  0  1  1]
 [ 2  2 13  8  2  0 11  2  1  1]
 [ 4  1  7  7  6  0  4  0  5  3]
 [ 1  2 11  4  2  0  7  0  2  1]]
0.19298245614035087


In [34]:
x = x_7
y = y_7

X = []
Y = []

for idx, xi in enumerate(x):
    if xi.shape == x[0].shape:
        X.append(xi)
        Y.append(y[idx])


print(len(X))

X = np.array(X)
X = X.reshape(X.shape[0], -1)
print(X.shape)
Y = np.array(Y)
print(Y.shape)


x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

# SVM (poly)
model = SVC(C = 2, kernel = "poly")
classification(model, x_train, x_test, y_train, y_test)

975
(975, 21504)
(975,)
[[14  6  0  8  0  1  0  2  9  7]
 [11 14  0  1  0  1  1  5  0 11]
 [ 9  6  2  3  0  1  1  8  4 17]
 [ 7  3  0 12  0  0  1 10  5 10]
 [ 8  3  0 20  1  1  2  2  8  8]
 [12  8  0  7  0  6  0  3  4  7]
 [11  4  1  2  0  0  2  3  4 25]
 [ 7  6  1  9  0  0  0 16  3  7]
 [10  2  0 19  0  0  1  1 11  6]
 [ 6 11  0  4  0  0  3  8  5 10]]
0.18032786885245902
