In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import numpy as np

In [None]:
# Location of the filtered songs CSV (absolute path on the author's machine).
data_path = "/home/24694266/DataScience344/Project/RNNModels/Filterd.csv"

# Read the whole file into `data`; later cells sample from this frame.
data = pd.read_csv(filepath_or_buffer=data_path)

In [6]:
# Draw a reproducible sample of up to 50,000 songs. The size is capped at the
# number of available rows so a smaller file does not make `sample` raise.
n_samples = min(50000, len(data))
data_sample = data.sample(n=n_samples, random_state=42)

# Numeric audio features that are standardised below. 'time_signature' is
# handled separately (one-hot encoded), so it is not part of this list.
columns_to_scale = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                    'duration_ms']

# Extract features and target variable
X = data_sample[columns_to_scale + ['time_signature']]
y = data_sample['genre']

# Convert genre names to integer class labels
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
num_classes = len(encoder.classes_)

# One-hot encode the 'time_signature' column
time_signature_dummies = pd.get_dummies(X['time_signature'], prefix='time_signature')
X = pd.concat([X.drop('time_signature', axis=1), time_signature_dummies], axis=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Take explicit copies before assigning into the frames: writing into the
# objects returned by the split triggers pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise the numeric features. The scaler is fitted on the training set
# only and then applied unchanged to the test set.
scaler = StandardScaler()
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Impute missing values. Both sets are filled with the *training* column means
# so that no statistic computed on the test set is used for preprocessing.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Convert labels to one-hot encoded format. `num_classes` is passed explicitly
# so train and test matrices have the same width even if one split happens to
# lack the highest-numbered class.
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)


# Mixture model:

# Prepare the data

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Tokenize and pad the lyrics
maxlen = 30  # every lyric sequence is padded/truncated to this many tokens
tokenizer = Tokenizer()

# Load the mixture-model dataset and draw a reproducible sample of up to 1,000
# songs (capped at the number of rows so a smaller file does not raise).
mixture_data = pd.read_csv("DataMixtureModel.csv")
data_sample = mixture_data.sample(n=min(1000, len(mixture_data)), random_state=42)
train_data, test_data = train_test_split(data_sample, test_size=0.2, random_state=42)

# The vocabulary is built from the training lyrics only.
tokenizer.fit_on_texts(train_data['Lyrics_Processed'])

train_sequences = tokenizer.texts_to_sequences(train_data['Lyrics_Processed'])
test_sequences = tokenizer.texts_to_sequences(test_data['Lyrics_Processed'])

padded_sequences_train = pad_sequences(train_sequences, maxlen=maxlen, padding='post', truncating='post')
padded_sequences_test = pad_sequences(test_sequences, maxlen=maxlen, padding='post', truncating='post')

# One-hot encode genres: names -> integer labels -> one-hot rows.
# Both encoders are fitted on the training split and re-used for the test split.
label_encoder = LabelEncoder()
integer_encoded_train = label_encoder.fit_transform(train_data['genre'])
integer_encoded_test = label_encoder.transform(test_data['genre'])

# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# name in 1.4; try the current spelling first and fall back for old versions.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    encoder = OneHotEncoder(sparse=False)

# OneHotEncoder expects a 2-D (n_samples, 1) column, not a flat vector.
integer_encoded_train = integer_encoded_train.reshape(-1, 1)
integer_encoded_test = integer_encoded_test.reshape(-1, 1)

y_train = encoder.fit_transform(integer_encoded_train)
y_test = encoder.transform(integer_encoded_test)

# Scale features
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'duration_ms', 'time_signature']

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train_features = scaler.fit_transform(train_data[features]).astype(np.float32)
X_test_features = scaler.transform(test_data[features]).astype(np.float32)


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, concatenate

from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2

# Define max words and embedding dimension
max_words = len(tokenizer.word_index) + 1  # vocabulary size plus one extra index
embedding_dim = 30

# `y_train` is a one-hot NumPy array, so its column count is the number of
# genres. (A NumPy array has no `.unique()` method.)
num_classes = y_train.shape[1]

# LSTM branch for the lyrics.
# The sequence length is fixed by the Input shape; the deprecated
# `input_length` argument of Embedding is therefore not passed.
lyrics_input = Input(shape=(maxlen,), name='lyrics_input')
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim)(lyrics_input)
embedding_dropout = Dropout(0.5)(embedding_layer)
lstm_1 = LSTM(64, return_sequences=True, dropout=0.4, recurrent_dropout=0.4, kernel_regularizer=l2(0.01))(embedding_dropout)
lstm_dropout_1 = Dropout(0.4)(lstm_1)
lstm_2 = LSTM(64, dropout=0.4, recurrent_dropout=0.4, kernel_regularizer=l2(0.01))(lstm_dropout_1)
lstm_dropout_2 = Dropout(0.4)(lstm_2)
dense_lyrics = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(lstm_dropout_2)
dropout_lyrics = Dropout(0.5)(dense_lyrics)

# Feed-forward branch for the numerical features
features_input = Input(shape=(X_train_features.shape[1],), name='features_input')
dense_features_1 = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(features_input)
dropout_features_1 = Dropout(0.5)(dense_features_1)
dense_features_2 = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(dropout_features_1)
dropout_features_2 = Dropout(0.5)(dense_features_2)

# Merge the outputs of the two branches
merged = concatenate([dropout_lyrics, dropout_features_2])

# Final softmax layer: one unit per genre
output = Dense(num_classes, activation='softmax')(merged)

# Compile the model
model = Model(inputs=[lyrics_input, features_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# The model has two inputs, so it must be fed the padded lyric sequences and
# the scaled feature matrix, in the same order as `inputs=` above. The held-out
# split is passed the same way as validation data.
history = model.fit(
    [padded_sequences_train, X_train_features],
    y_train,
    epochs=500,
    batch_size=64,
    validation_data=([padded_sequences_test, X_test_features], y_test),
    verbose=1,
)


In [None]:
import matplotlib.pyplot as plt

# Visualise the training history: accuracy in the left panel, loss in the right.
fig, axes = plt.subplots(1, 2, figsize=(12,5))

# One entry per panel: (train key, train label, validation key, validation label, title)
panels = [
    ('accuracy', 'Train Accuracy', 'val_accuracy', 'Validation Accuracy', 'Accuracy Over Epochs'),
    ('loss', 'Train Loss', 'val_loss', 'Validation Loss', 'Loss Over Epochs'),
]

for ax, (train_key, train_label, val_key, val_label, title) in zip(axes, panels):
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.legend()
    ax.set_title(title)

fig.tight_layout()
plt.show()