In [57]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from csv import reader

data = pd.read_csv('./Data/dataset.csv', delimiter='\t')

# On supprime champs non pertinents
data = data.drop(['duration', 'artist_terms_freq', 'artist_id', 'artist_terms_weight', 'danceability', 'energy', 'path', 'file', 'similar_artists', 'title', 'year', 'latitude', 'longitude', 'midi_name'], axis=1)

print(data.shape)

# On supprime les données sans hotttnesss ou avec hotttnesss = 0 (non calculée)
data = data.dropna(axis=0, how='any')
data = data[data['hotttnesss'] > 0]

(31034, 7)


In [59]:
print(data.head())
print(data.shape)

                                        artist_terms  key  mode  loudness  \
2  [b'hard rock' b'heavy metal' b'blues-rock' b'c...    7     1    -5.271   
5  [b'folk rock' b'singer-songwriter' b'rock' b'a...    2     1   -15.164   
6  [b'soft rock' b'blues-rock' b'pop rock' b'coun...    9     1    -8.531   
7  [b'outlaw country' b'country rock' b'southern ...    9     1    -6.291   
9  [b'hip hop' b'rap' b'funk' b'r&b' b'pop' b'sou...   11     0    -4.882   

     tempo  artist_familiarity  hotttnesss  
2  150.062            0.707200    0.684136  
5  103.905            0.775320    0.830423  
6  180.149            0.643183    0.767728  
7  185.061            0.138188    0.215080  
9  105.206            0.845602    0.624425  
(13151, 7)


In [60]:
# ajout de genres principaux

data['rock'] = data['artist_terms'].apply(lambda x: int('rock' in x))
data['pop'] = data['artist_terms'].apply(lambda x: int('pop' in x))
data['rap'] = data['artist_terms'].apply(lambda x: int('rap' in x))
data['country'] = data['artist_terms'].apply(lambda x: int('country' in x))
data['classical'] = data['artist_terms'].apply(lambda x: int('classical' in x))
data['jazz'] = data['artist_terms'].apply(lambda x: int('jazz' in x))

In [61]:
# on donne le label '1' aux chansons dont la hotttnesss est supérieure ou égale à un seuil
threshold = 0.5
data['hit_song'] = data['hotttnesss'].apply(lambda x: int(x >= threshold))
data['non_hit_song'] = data['hotttnesss'].apply(lambda x: int(x < threshold))

In [62]:
# on laisse de côté le champ "artist_terms" qui est non numérique, et hotttnesss
data = data.drop(['artist_terms', 'hotttnesss'], axis=1)

In [63]:
print(data.head())

   key  mode  loudness    tempo  artist_familiarity  rock  pop  rap  country  \
2    7     1    -5.271  150.062            0.707200     1    1    0        1   
5    2     1   -15.164  103.905            0.775320     1    1    0        0   
6    9     1    -8.531  180.149            0.643183     1    1    0        1   
7    9     1    -6.291  185.061            0.138188     1    1    0        1   
9   11     0    -4.882  105.206            0.845602     0    1    1        0   

   classical  jazz  hit_song  non_hit_song  
2          0     1         1             0  
5          0     0         1             0  
6          0     1         1             0  
7          0     1         0             1  
9          0     0         1             0  


In [64]:
# Dimensions of dataset
n = data.shape[0]
p = data.shape[1]

In [65]:
# on convertit data en array numpy
data = data.values

# on change toutes les valeurs en positif
data = np.absolute(data)

In [66]:
# on divise chaque colonne par le max de cettte colonne pour que chaque entrée soit comprise entre 0 et 1
data = data*1./np.max(data, axis=0)

In [67]:
# données d'entraînement, de test et de validation
train_start = 0
train_end = int(np.floor(0.7*n))
test_start = train_end
test_end = int(np.floor(0.9*n))
validation_start = test_end
validation_end = n

data_train = data[np.arange(train_start, train_end), :]
data_test = data[np.arange(test_start, test_end), :]
data_validation = data[np.arange(validation_start, validation_end), :]

[[0.63636364 1.         0.11914288 ... 1.         1.         0.        ]
 [0.18181818 1.         0.34275898 ... 0.         1.         0.        ]
 [0.81818182 1.         0.19283018 ... 1.         1.         0.        ]
 ...
 [0.81818182 1.         0.29477182 ... 0.         1.         0.        ]
 [0.09090909 1.         0.49377274 ... 0.         0.         1.        ]
 [0.36363636 1.         0.10682399 ... 1.         1.         0.        ]]


In [68]:
# on définit x et y
X_train = data_train[:, :-2]
y_train = data_train[:, -2:]
X_test = data_test[:, :-2]
y_test = data_test[:, -2:]
X_validation = data_test[:, :-2]
y_validation = data_test[:, -2:]

In [69]:
# nombre de paramètres pour entrée X
n_params = X_train.shape[1]
# nombre de catégories (ici deux)
n_classes = 2
#learning rate
learning_rate = 0.005

# neurones
n_neurons_1 = 2048
n_neurons_2 = 1024

# placeholder
X = tf.placeholder(dtype=tf.float32, shape=[None, n_params])
Y = tf.placeholder(dtype=tf.float32, shape=[None, n_classes])


In [71]:
# initialisation des poids et des biais
sigma = 1
weight_initializer = tf.variance_scaling_initializer(mode="fan_avg", distribution="uniform", scale=sigma)
bias_initializer = tf.zeros_initializer()

# poids de la couche cachée
W_hidden_1 = tf.Variable(weight_initializer([n_params, n_neurons_1]))
bias_hidden_1 = tf.Variable(bias_initializer([n_neurons_1]))
W_hidden_2 = tf.Variable(weight_initializer([n_neurons_1, n_neurons_2]))
bias_hidden_2 = tf.Variable(bias_initializer([n_neurons_2]))

# poids en sortie
W_out = tf.Variable(weight_initializer([n_neurons_2, n_classes]))
bias_out = tf.Variable(bias_initializer([1]))

# couches cachées
hidden_1 = tf.nn.relu(tf.add(tf.matmul(X, W_hidden_1), bias_hidden_1))
hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, W_hidden_2), bias_hidden_2))

# couche de sortie
out = tf.add(tf.matmul(hidden_2, W_out), bias_out)

# fonction de cout
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=Y))

# fonction d'optimisation (descente de gradient)
train_op = tf.train.GradientDescentOptimizer(0.005).minimize(cost)




In [72]:
batch_size = 16
training_epochs = 50

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(training_epochs):
        
        # on fait une permutation sur les données d'entraînement
        shuffle_indices = np.random.permutation(np.arange(len(y_train)))
        X_train = X_train[shuffle_indices]
        y_train = y_train[shuffle_indices]
    
        avg_cost = 0.
        total_batch = len(y_train) // batch_size
        # Boucle sur tous les batchs
        for i in range(total_batch):
            start = i * batch_size
            batch_x = X_train[start:start + batch_size]
            batch_y = y_train[start:start + batch_size]

            _, c = sess.run([train_op, cost], feed_dict={X: batch_x, Y: batch_y})
            
            # on calcule le cout moyen
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % 5 == 0:
            print("Epoch:", '%03d' % (epoch+1), "cost={:.6f}".format(avg_cost))
    print("Optimisation terminée !")
    
    # Test model
    pred = tf.nn.softmax(out)  # Apply softmax to logits
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({X: X_validation, Y: y_validation}))

Epoch: 0001 cost=0.693914577
Epoch: 0006 cost=0.679955676
Epoch: 0011 cost=0.671332818
Epoch: 0016 cost=0.663313736
Epoch: 0021 cost=0.654937893
Epoch: 0026 cost=0.646566366
Epoch: 0031 cost=0.638229202
Epoch: 0036 cost=0.631207055
Epoch: 0041 cost=0.624564221
Epoch: 0046 cost=0.620619909
Epoch: 0051 cost=0.616999960
Epoch: 0056 cost=0.614886055
Epoch: 0061 cost=0.613879448
Epoch: 0066 cost=0.613134370
Epoch: 0071 cost=0.611953735
Epoch: 0076 cost=0.611899098
Epoch: 0081 cost=0.611043771
Epoch: 0086 cost=0.610667638
Epoch: 0091 cost=0.609911061
Epoch: 0096 cost=0.610361455
Optimization Finished!
Accuracy: 0.66501904
