In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense


In [None]:
# Load the data
data_path = "/home/24694266/DataScience344/Project/RNNModels/Filterd.csv"
data = pd.read_csv(data_path)

# Preprocessing settings. VOCAB_SIZE and MAX_SEQUENCE_LENGTH have to agree with
# the model cell (Embedding input_dim = max_words = 1000, sequence length 30),
# and NUM_CLASSES with the width of the model's softmax output layer (6).
SAMPLE_SIZE = 50000
VOCAB_SIZE = 1000
MAX_SEQUENCE_LENGTH = 30
NUM_CLASSES = 6
RANDOM_STATE = 42

# Rows without lyrics or without a genre cannot be used: the Tokenizer expects
# strings (a NaN float makes fit_on_texts fail) and a missing genre is not a
# usable label.
data_valid = data.dropna(subset=['Lyrics_Processed', 'genre'])
if data_valid.empty:
    raise ValueError(f"No usable rows (lyrics + genre) found in {data_path}")

# 1. Randomly sample up to SAMPLE_SIZE observations from the data.
#    DataFrame.sample raises ValueError when n exceeds the number of rows
#    (sampling without replacement), so cap n at the available row count.
data_sample = data_valid.sample(n=min(SAMPLE_SIZE, len(data_valid)), random_state=RANDOM_STATE)

# 2. Apply the preprocessing steps to this subset
lyrics = data_sample['Lyrics_Processed'].astype(str)  # guard against non-string cells

# num_words caps the token ids emitted by texts_to_sequences below VOCAB_SIZE;
# rarer words are mapped to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(lyrics)
sequences = tokenizer.texts_to_sequences(lyrics)
# Every sequence is cut / zero-padded at the end to exactly MAX_SEQUENCE_LENGTH ids.
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, truncating='post', padding='post')

# Encode genre names as integer ids, then as one-hot rows of width NUM_CLASSES.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(data_sample['genre'])
if len(label_encoder.classes_) > NUM_CLASSES:
    # Fail with a readable message instead of an indexing error inside to_categorical.
    raise ValueError(
        f"Found {len(label_encoder.classes_)} genres {list(label_encoder.classes_)}, "
        f"but the model is configured for {NUM_CLASSES} classes"
    )
labels_one_hot = to_categorical(labels_encoded, num_classes=NUM_CLASSES)
labels = labels_one_hot

# Hold out 20% of the sample as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=RANDOM_STATE
)


In [None]:
# Model hyper-parameters. max_words and sequence_length must match the
# preprocessing cell (Tokenizer num_words=1000, pad_sequences maxlen=30);
# num_classes must match the width of the one-hot labels (6).
max_words = 1000
embedding_dim = 16
sequence_length = 30
num_classes = 6
max_epochs = 1200      # upper bound only; early stopping normally ends training sooner
patience = 10          # epochs without val_loss improvement before stopping (tunable)

# Seed TensorFlow so weight initialisation and dropout masks are repeatable
# from run to run (GPU kernels may still introduce small non-determinism).
tf.random.set_seed(42)

model = Sequential()

# Explicit input: a batch of integer token-id sequences of fixed length.
# This replaces Embedding's `input_length` argument, which is deprecated in
# Keras 3, while still building the model up front so summary() works.
model.add(tf.keras.Input(shape=(sequence_length,)))

# Embedding layer
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim))
model.add(Dropout(0.5))

# First LSTM layer with dropout and kernel regularization.
# return_sequences=True so the second LSTM receives the full sequence.
# NOTE: recurrent_dropout > 0 prevents Keras from using the fused cuDNN LSTM
# kernel, so GPU training is noticeably slower with this setting.
model.add(LSTM(64, return_sequences=True, dropout=0.4, recurrent_dropout=0.4, kernel_regularizer=l2(0.01)))
model.add(Dropout(0.4))

# Second LSTM layer with dropout and kernel regularization
model.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4, kernel_regularizer=l2(0.01)))
model.add(Dropout(0.4))

# Dense layer with kernel regularization
model.add(Dense(32, activation='tanh', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))

# Output layer: one softmax probability per genre
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Stop once validation loss stops improving and roll back to the best weights,
# instead of always running all max_epochs epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

# Validation uses the last 10% of the training split rather than the test
# split, so the test data stays untouched by early stopping / model selection.
history = model.fit(
    X_train,
    y_train,
    epochs=max_epochs,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)

# Single, final evaluation on the held-out test split.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} - test accuracy: {test_accuracy:.4f}")


In [None]:
# Draw the training curves side by side: accuracy on the left, loss on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One entry per panel:
# (train history key, validation history key, train label, validation label, title)
panels = [
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy', 'Accuracy Over Epochs'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss', 'Loss Over Epochs'),
]

for ax, (train_key, val_key, train_label, val_label, panel_title) in zip(axes, panels):
    # history.history maps each metric name to its list of per-epoch values.
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.legend()
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()