# Lyrics to genres multi-label classification

In [134]:
import re

import gensim.downloader as api
import keras.backend as tfb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import KeyedVectors
from keras.layers import LSTM, Bidirectional, SpatialDropout1D
from keras.utils.generic_utils import get_custom_objects
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPool1D, MaxPooling1D, Activation, Dropout, Conv1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [170]:
POS_WEIGHT = 10  # multiplier for positive targets, needs to be tuned

def weighted_binary_crossentropy(target, output):
    """
    Weighted binary crossentropy between an output tensor
    and a target tensor. POS_WEIGHT is used as a multiplier
    for the positive targets.

    Combination of the following functions:
    * keras.losses.binary_crossentropy
    * keras.backend.tensorflow_backend.binary_crossentropy
    * tf.nn.weighted_cross_entropy_with_logits

    target: ground-truth labels, cast to the dtype of `output`
    output: predicted probabilities; they are clipped to
            [epsilon, 1 - epsilon] and converted back to logits here

    Returns the element-wise weighted loss averaged over the last axis.
    """
    # transform back to logits
    # tf.convert_to_tensor is the public way to build the epsilon constant;
    # the backend's `_to_tensor` is a private helper.
    _epsilon = tf.convert_to_tensor(tfb.epsilon(), dtype=output.dtype.base_dtype)
    # clipping keeps the logit transform away from log(0) and division by zero
    output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
    output = tf.math.log(output / (1 - output))
    # compute weighted loss
    # labels and logits must share a dtype, so follow the logits instead of
    # hard-coding float32
    target = tf.cast(target, output.dtype)
    loss = tf.nn.weighted_cross_entropy_with_logits(labels=target,
                                                    logits=output,
                                                    pos_weight=POS_WEIGHT)
    return tf.reduce_mean(loss, axis=-1)


# Register the loss under its name so it can be requested as a string,
# e.g. loss='weighted_binary_crossentropy' in model.compile().
get_custom_objects().update({"weighted_binary_crossentropy": weighted_binary_crossentropy})

def f1_score(y_true, y_logit):
    """
    Batch-level F1 score computed with Keras backend ops.

    y_true: true value
    y_logit: predicted value; it is clipped to [0, 1] and rounded, so it is
             treated as a score where values above 0.5 count as positive

    Returns the harmonic mean of precision and recall, with the backend
    epsilon added to every denominator to avoid division by zero.
    """
    eps = tfb.epsilon()

    # counts of hits, actual positives and predicted positives over the batch
    hit_count = tfb.sum(tfb.round(tfb.clip(y_true * y_logit, 0, 1)))
    actual_count = tfb.sum(tfb.round(tfb.clip(y_true, 0, 1)))
    flagged_count = tfb.sum(tfb.round(tfb.clip(y_logit, 0, 1)))

    recall = hit_count / (actual_count + eps)
    precision = hit_count / (flagged_count + eps)

    return (2 * precision * recall) / (precision + recall + eps)


def set_labels(x):
    """
    Turn a comma-separated genre string into a multi-hot label vector.

    x: genres joined by "," (no whitespace stripping is applied)

    Returns an integer numpy array with one position per entry of the
    module-level `unique_genres`; position k is 1 when the k-th genre in
    sorted order appears in `x`.

    Raises ValueError when `x` contains a genre that is not in
    `unique_genres`.
    """
    # Sort before indexing: iterating a set of strings can yield a different
    # order in every interpreter session, which would silently change the
    # meaning of each label column between runs.
    genre_to_index = {genre: idx for idx, genre in enumerate(sorted(unique_genres))}
    labels = np.zeros(len(genre_to_index), dtype=int)

    for item in x.split(","):
        if item not in genre_to_index:
            raise ValueError(f"{item!r} is not in unique_genres")
        labels[genre_to_index[item]] = 1

    return labels

def csv_to_train_val_data(path:str, split_rate = 0.15, random_state=None):
    """
    Read a labelled lyrics CSV and return padded token sequences plus
    multi-hot label arrays for a train/test split.

    path: CSV file with at least the columns "lyrics" and "genre"
    split_rate: fraction of rows held out as the test set
    random_state: forwarded to train_test_split; pass an int to make the
                  shuffled split reproducible (default None keeps the
                  previous unseeded behaviour)

    Reads the module-level `max_words` and `maxlen`, and calls `set_labels`
    (which in turn reads `unique_genres`), so those must be defined first.

    Returns (x_train, x_test, y_train, y_test); the y arrays are float32.
    """
    # read data
    data = pd.read_csv(path)

    # fix data
    data["genre"] = data["genre"].apply(lambda x: x.replace("\n"," "))
    data["labels"] = data["genre"].apply(set_labels)

    # data splitting
    x_train, x_test, y_train, y_test = train_test_split(
        data["lyrics"], data["labels"],
        test_size=split_rate, shuffle=True, random_state=random_state)

    # each cell of "labels" holds one vector; stack them into a 2-D array
    y_train = np.stack(y_train.values).astype('float32')
    y_test = np.stack(y_test.values).astype('float32')

    # tokenize data
    # NOTE(review): the vocabulary is fitted on train AND test text, so the
    # held-out lyrics influence the word index. Left unchanged because the
    # embedding matrix is built from a tokenizer defined outside this
    # function — confirm both stay aligned before fitting on x_train only.
    tokenizer = Tokenizer(lower=True, num_words=max_words)
    tokenizer.fit_on_texts(list(x_train) + list(x_test))

    x_train = tokenizer.texts_to_sequences(x_train)
    x_test = tokenizer.texts_to_sequences(x_test)

    x_train = pad_sequences(x_train, maxlen=maxlen)
    x_test = pad_sequences(x_test, maxlen=maxlen)

    print(f'X train shape: {x_train.shape}')
    print(f'X test shape: {x_test.shape}')
    print(f'Y train shape: {y_train.shape}')
    print(f'Y test shape: {y_test.shape}')


    return x_train, x_test, y_train, y_test

def visualise_history(hist, measures=('loss', 'accuracy', 'f1_score')):
    """
    Plot training and validation curves, one subplot per measure.

    hist: object with a `history` mapping that holds, for every measure,
          both `<measure>` and `val_<measure>` series (as returned by
          model.fit in this notebook)
    measures: names of the measures to plot; defaults to the three metrics
              this function always plotted
    """
    # squeeze=False always yields a 2-D grid, so a single measure works too
    fig, grid = plt.subplots(len(measures), figsize=(12, 12), squeeze=False)
    axes = [row[0] for row in grid]

    for ax, measure in zip(axes, measures):
        ax.plot(hist.history[measure], label='train')
        ax.plot(hist.history['val_' + measure], label='validation')
        # title the panel with the measure itself; the legend tells the
        # two curves apart
        ax.set_title(measure)
        ax.set_ylabel(measure)
        ax.legend()

    # the epoch axis is labelled on the bottom panel only
    axes[-1].set_xlabel('epochs')
    fig.tight_layout()
    plt.show()
    

def run_model(model_name, batch_size=64, epochs=5, optimizer='adam', loss='weighted_binary_crossentropy', verbose=1):
    """
    Compile and train one of the models stored in `models_dict`.

    model_name: key into the module-level `models_dict`
    batch_size, epochs, verbose: forwarded to model.fit
    optimizer, loss: forwarded to model.compile

    Trains on the module-level x_train / y_train and validates on
    x_test / y_test. Tracks accuracy, f1_score and AUC.

    Returns (history, model). The model is the very object stored in
    `models_dict`, not a copy.
    """
    net = models_dict[model_name]

    tracked_metrics = ['accuracy', f1_score, tf.keras.metrics.AUC()]
    net.compile(optimizer=optimizer, loss=loss, metrics=tracked_metrics)

    history = net.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(x_test, y_test),
        verbose=verbose,
    )

    return history, net
        

def create_confusion_matrix(model, threshold=0.75):
    """
    Print the per-label confusion matrices of `model` on the test split.

    model: object with a predict() method; it is called on the
           module-level x_test
    threshold: a score strictly greater than this counts as a positive
               prediction

    Compares against the module-level y_test and prints the result of
    sklearn's multilabel_confusion_matrix. Returns nothing.
    """
    y_prob = model.predict(x_test)

    # One comparison instead of two in-place assignments: the scores are not
    # overwritten, and the outcome no longer depends on the written 1s
    # themselves staying above the threshold.
    y_pred = (y_prob > threshold).astype(y_prob.dtype)

    print(multilabel_confusion_matrix(y_test, y_pred))



### STARTING PARAMETERS

In [93]:
max_words = 30_000  # passed as num_words to the Tokenizer in csv_to_train_val_data
maxlen = 200  # passed as maxlen to pad_sequences and as input_length to the Embedding layers
output_dim = 64  # NOTE(review): not referenced by any cell shown here — confirm it is still needed
embedding_dim = 300  # width of weight_matrix and output_dim of the Embedding layers

### FASTTEXT LOAD

In [11]:
# Load the word vectors from a word2vec-format text file (binary=False).
# NOTE(review): the file name suggests 300-dimensional vectors, matching embedding_dim — confirm.
fasttext = KeyedVectors.load_word2vec_format(r'wiki-news-300d-1M.vec', binary=False, encoding='utf8')


### CREATE UNIQUE GENRES

In [100]:
# NOTE(review): `data` is not defined at notebook scope by any cell shown
# here (csv_to_train_val_data keeps its DataFrame local) — confirm it is
# loaded before this cell on a fresh kernel.

# Collect every genre that occurs in the comma-separated `genre` column.
# The result is sorted so the class order is the same in every session;
# iterating a bare set of strings can change order between kernel restarts,
# which would change what each output column of a saved model means.
unique_genres = sorted({item for row in data.genre for item in row.split(",")})

num_classes = len(unique_genres)

### EMBEDDING

In [85]:
# NOTE(review): `tokenizer` is not defined at notebook scope by any cell
# shown here — confirm where it comes from on a fresh kernel.

# One row per word index, plus row 0, which the loop below never fills.
vocab_size = 1 + len(tokenizer.word_index)

weight_matrix = np.zeros(shape=(vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = fasttext[word]
    except KeyError:
        # word has no pretrained vector: fall back to a random row
        weight_matrix[i] = np.random.uniform(-5, 5, embedding_dim)
    else:
        weight_matrix[i] = embedding_vector

### LOAD DATA

In [103]:
# Build the train/test arrays from the cleaned, labelled CSV, holding out 15% of the rows as the test set.
x_train, x_test, y_train, y_test = csv_to_train_val_data(path=r'data_cleaned/final_cleaned_labeled2.csv', split_rate=0.15)

X train shape: (70624, 200)
X test shape: (12464, 200)
Y train shape: (70624, 15)
Y test shape: (12464, 15)


### DECLARE MODELS IN DICTIONARY

In [179]:
def _frozen_embedding():
    """Embedding layer initialised from `weight_matrix` and excluded from training."""
    return Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen,
                     weights=[weight_matrix], trainable=False)


def _regularised_conv():
    """128-filter, width-3 ReLU convolution with the 'l1_l2' kernel regulariser."""
    return Conv1D(filters=128, kernel_size=3, activation='relu', kernel_regularizer='l1_l2')


def _bilstm_block():
    """Sequence-returning bidirectional LSTM whose two directions are concatenated."""
    recurrent = LSTM(units=256, return_sequences=True, activation='tanh', recurrent_activation='sigmoid',
                     recurrent_dropout=0.0, dropout=0.5, kernel_initializer='glorot_uniform')
    return Bidirectional(recurrent, merge_mode='concat')


# Every model is instantiated as soon as this cell runs; run_model looks
# them up by key. Both end in a sigmoid over num_classes outputs.
models_dict = {
    # two convolutions over the frozen embeddings, then a dense head
    "cnn1_emb": Sequential([
        _frozen_embedding(),
        _regularised_conv(),
        MaxPooling1D(),
        Dropout(0.3),
        _regularised_conv(),
        GlobalMaxPool1D(),
        Dense(100, kernel_regularizer='l1_l2'),
        Dropout(0.5),
        Activation('relu'),
        Dense(num_classes, name="output"),
        Activation('sigmoid'),
    ]),
    # two stacked bidirectional LSTMs, max-pooled over the sequence axis
    "lstm1_emb": Sequential([
        _frozen_embedding(),
        SpatialDropout1D(0.3),
        _bilstm_block(),
        _bilstm_block(),
        Dropout(0.3),
        GlobalMaxPool1D(),
        Dense(num_classes),
        Activation('sigmoid'),
    ]),
}

### RUNNING THE NETS

In [181]:
# Train the bidirectional-LSTM model with the weighted loss.
hist, curr_model = run_model('lstm1_emb', batch_size=32, epochs=30, 
                            optimizer='adam', loss='weighted_binary_crossentropy', verbose=1)
# evaluate() needs the targets as well as the inputs to compute the loss and metrics
curr_model.evaluate(x_test, y_test)
visualise_history(hist)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30

KeyboardInterrupt: 

### Save/Load

In [None]:
# Write the most recently trained model to models/test.h5.
# NOTE(review): presumably the models/ directory has to exist already — confirm.
curr_model.save(r'models/test.h5')

In [None]:
# Pass the notebook's custom loss and metric explicitly so a model compiled
# with them can be deserialised; f1_score in particular is never registered
# through get_custom_objects().
# NOTE(review): assumes this file was saved from a model compiled by run_model — confirm.
new_model = tf.keras.models.load_model(
    r'models/model_cnn_fasttext.h5',
    custom_objects={
        "weighted_binary_crossentropy": weighted_binary_crossentropy,
        "f1_score": f1_score,
    },
)

### LSTM