# Projet SAM -> Classifieur de genre musical

## Late Fusion (Text, Audio, Image)

In [3]:
import os
import pandas as pd
import numpy as np
import sklearn
import cv2
import imutils
from skimage.feature import hog

### Codify Labels

In [4]:
def get_label_list():
    df = pd.read_csv('./msdi/'+'labels.csv', header=None)
    label_list = list(df.iloc[:, 0])
    labels = {label_list[i]:i for i in range (0,len(label_list))} # Dictionary Etiquetes Possibles
    return labels

In [5]:
labels = get_label_list()
print(labels)

{'Blues': 0, 'Country': 1, 'Electronic': 2, 'Folk': 3, 'Jazz': 4, 'Latin': 5, 'Metal': 6, 'New Age': 7, 'Pop': 8, 'Punk': 9, 'Rap': 10, 'Reggae': 11, 'RnB': 12, 'Rock': 13, 'World': 14}


In [6]:
def read_through(entries,labels):
    X = []
    Y = []
    for i in range(len(entries)):
        
        entry = entries.iloc[i,:]
        x = entry[4:len(entry)]
        
        y = labels[entry["genre"]]
        
        X.append(x)
        Y.append(y)
        
    return X, Y

## Load Text Data

In [7]:
text_data = pd.read_csv('./msdi/mxm/joint_mxm.csv')

In [8]:
train_text = text_data[text_data['set'] == "train"]
validation_text = text_data[text_data['set'] == "val"]
test_text = text_data[text_data['set'] == "test"]

In [11]:
x_train_text, y_train_text = read_through(train_text,labels)
x_test_text, y_test_text = read_through(test_text,labels)
x_validation_text, y_validation_text = read_through(validation_text,labels)

In [12]:
x_train_text = np.asarray(x_train_text, dtype=np.float32)
x_validation_text = np.asarray(x_validation_text, dtype=np.float32)
x_test_text = np.asarray(x_test_text, dtype=np.float32)

In [15]:
len(x_train_text)

9260

In [16]:
len(y_train_text)

9260

### Classify Text and fetch predicted labels with KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
neigh = KNeighborsClassifier(n_neighbors=15)
neigh.fit(x_train_text, y_train_text)

KNeighborsClassifier(n_neighbors=15)

#### Predicted Labels by KNN (for late fusion)

In [19]:
text_accu = neigh.score(x_test_text,y_test_text)

In [20]:
text_accu

0.3021164021164021

In [21]:
text_pred = neigh.predict(x_test_text)

In [22]:
len(text_pred)

1890

## Load Image Data

In [23]:
img_data = pd.read_csv('./msdi/mxm/joint_hog_f.csv')

In [24]:
train_img = img_data[img_data['set'] == "train"]
validation_img = img_data[img_data['set'] == "val"]
test_img = img_data[img_data['set'] == "test"]

In [25]:
x_train_img, y_train_img = read_through(train_img,labels)
x_test_img, y_test_img = read_through(test_img,labels)
x_validation_img, y_validation = read_through(validation_img,labels)

In [26]:
x_train_img = np.asarray(x_train_img, dtype=np.float32)
x_validation_img = np.asarray(x_validation_img, dtype=np.float32)
x_test_img = np.asarray(x_test_img, dtype=np.float32)

In [29]:
len(x_train_img)

9260

In [30]:
len(y_train_img)

9260

### Classify Images and fetch predicted labels with Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
clf = LogisticRegression(random_state=0)
clf.fit(x_train_img, y_train_img)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

#### Predicted Labels by Logistic Regression (for late fusion)

In [37]:
img_accu = clf.score(x_test_img,y_test_img)

In [38]:
img_accu

0.15925925925925927

In [39]:
img_pred = neigh.predict(x_test_text)

In [40]:
len(img_pred)

1890

## Load Audio Data

In [41]:
audio_data = pd.read_csv('./msdi/mxm/joint_mfcc.csv')

In [42]:
train_audio = audio_data[audio_data['set'] == "train"]
validation_audio = audio_data[audio_data['set'] == "val"]
test_audio = audio_data[audio_data['set'] == "test"]

In [67]:
x_train_audio, y_train_audio = read_through(train_audio,labels)
x_test_audio, y_test_audio = read_through(test_audio,labels)
x_validation_audio, y_validation_audio = read_through(validation_audio,labels)

In [None]:
x_train_audio = np.array(x_train_audio)
x_validation_audio = np.array(x_validation_audio)
x_test_audio = np.array(x_test_audio)

In [None]:
y_train_audio = np.array(y_train_audio)
y_validation_audio = np.array(y_validation_audio)
y_test_audio = np.array(y_test_audio)

### CNN

In [62]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

In [63]:
model_cnn = keras.Sequential([

        # input layer
        keras.layers.Flatten(input_shape=(6000,)),

        # 1st dense layer
        keras.layers.Dense(512, activation='relu'),

        # 2nd dense layer
        keras.layers.Dense(256, activation='relu'),

        # 3rd dense layer
        keras.layers.Dense(64, activation='relu'),

        # output layer
        keras.layers.Dense(15, activation='softmax')
    ])

In [64]:
optimiser = Adam(learning_rate=0.0001)
model_cnn.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
history = model_cnn.fit(x_train_audio, y_train_audio, validation_data=(x_validation_audio, y_validation_audio), batch_size=32, epochs=20)

In [None]:
y_prob = model.predict(x) 
y_classes = y_prob.argmax(axis=-1)

## Late Fusion (text_pred, img_pred, audio_pred)

In [38]:
predictions = [text_pred, img_pred,audio_pred]
accus = [text_accu, img_accu,audio_accu]

In [39]:
predictions[0][0]

0

In [47]:
accus

[9, 4, 3]

In [50]:
fusion_pred = []
for i in range(len(img_pred)):
    y = [predictions[j][i] for j in range(len(predictions))]
    set_y = set(y)
    if (len(set_y) == len(y)):
        y = y[accus.index(max(accus))]
    else:
        y = max(set_y, key=y.count)
    fusion_pred.append(y)
    

In [51]:
fusion_pred

[0, 1, 2, 6, 7]

### Late Fusion Acuracy

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_text, fusion_pred) # y_test_text , y_test_img and y_test_audio are actually the same