Extract Data From Dataset

In [13]:
import pandas as pd
# Read the training and test splits from the dataset directory.
train_csv_path = '../dataset/train.csv'
test_csv_path = '../dataset/test.csv'
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

Clean Data and Preprocessing

In [14]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [15]:
# Preprocessing: clean the lyrics column, encode artist names as integer class
# ids, and convert the lyrics into fixed-length integer sequences.
MAX_WORDS = 10000           # vocabulary cap passed to the Tokenizer
MAX_SEQUENCE_LENGTH = 500   # length every token sequence is padded/truncated to

# Drop rows with missing lyrics BEFORE casting to str. Casting first can turn
# NaN into the literal string 'nan', after which dropna() finds nothing to
# remove and the placeholder text would be tokenized as if it were lyrics.
train_data = (
    train_data
    .dropna(subset=['Lyrics'])
    .assign(Lyrics=lambda frame: frame['Lyrics'].astype(str))
)
test_data = (
    test_data
    .dropna(subset=['Lyrics'])
    .assign(Lyrics=lambda frame: frame['Lyrics'].astype(str))
)

# Encode artist labels; the encoder is fitted on the training artists only.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['Artist'])
y_train = label_encoder.transform(train_data['Artist'])

# Keep only test rows whose artist was seen during fitting, because
# LabelEncoder.transform() raises on labels it has not been fitted on.
seen_artist_mask = test_data['Artist'].isin(label_encoder.classes_)
test_data_filtered = test_data[seen_artist_mask]
y_test = label_encoder.transform(test_data_filtered['Artist'])

# Tokenize the lyrics. The vocabulary is learned from the training lyrics only
# and then applied unchanged to the filtered test lyrics.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_data['Lyrics'])
train_sequences = tokenizer.texts_to_sequences(train_data['Lyrics'])
test_sequences = tokenizer.texts_to_sequences(test_data_filtered['Lyrics'])

# Pad/truncate every sequence to the same length so they stack into 2-D arrays.
X_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [16]:
# Sanity check: the feature matrix and the label vector should report the
# same number of samples.
for split_array in (X_train, y_train):
    print(len(split_array))

290183
290183


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Artist classifier: token embedding -> two stacked bidirectional LSTMs with
# dropout after each -> softmax layer with one unit per encoded artist.
num_artist_classes = len(label_encoder.classes_)

model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=500),
    # The first recurrent layer returns the full sequence so that the second
    # recurrent layer receives one vector per time step.
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(num_artist_classes, activation='softmax'),
])

# The sparse loss variant takes integer class ids as targets (no one-hot
# encoding of the labels is required).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 128)          1280000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 500, 128)         98816     
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 500, 128)          0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 11152)            

In [18]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Hold out part of the training data for validation so that early stopping and
# checkpoint selection are not tuned on the test set; X_test / y_test remain
# untouched for a final evaluation. The split is seeded for reproducibility and
# is not stratified, since stratification fails for classes with one sample.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Stop once val_loss has not improved for 3 epochs and roll the in-memory
# model back to the weights of its best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Save the complete model rather than only its weights, so that
# 'best_model.h5' can later be restored with tf.keras.models.load_model();
# a weights-only HDF5 file has no model config and cannot be loaded that way.
model_checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
)

# epochs=100 is only an upper bound; early stopping normally ends training sooner.
history = model.fit(
    X_fit, y_fit,
    epochs=100,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, model_checkpoint],
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


In [20]:
# Write the in-memory model to disk as a backup copy.
backup_model_path = 'backup_model.h5'
model.save(backup_model_path)

In [None]:
# Restore the checkpointed model from disk.
# NOTE(review): load_model() needs a file that stores the full model; a
# weights-only checkpoint has to be loaded with Model.load_weights() instead.
# Verify how 'best_model.h5' was written before relying on this cell.
best_model_path = 'best_model.h5'
find_model = tf.keras.models.load_model(best_model_path)

In [9]:
import pickle
# Persist the fitted tokenizer and label encoder so the same text-to-sequence
# mapping and class-id mapping can be reloaded later.
artifacts_to_save = (
    ('tokenizer.pickle', tokenizer),
    ('label_encoder.pickle', label_encoder),
)
for artifact_path, artifact in artifacts_to_save:
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file, protocol=pickle.HIGHEST_PROTOCOL)