In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [1]:
pwd

'C:\\Users\\Karthiek Duggirala'

In [3]:
df = pd.read_csv('C:\\Users\\Karthiek Duggirala\\Downloads\\preprocessed_data.csv')

In [4]:
# split the data into training and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# encode the genre labels using LabelEncoder
le = LabelEncoder()
train_labels = le.fit_transform(train_data['genre'])
test_labels = le.transform(test_data['genre'])

In [6]:
# tokenize the lyrics data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['lyrics'])


In [7]:
# convert the tokenized lyrics data to sequences
train_sequences = tokenizer.texts_to_sequences(train_data['lyrics'])
test_sequences = tokenizer.texts_to_sequences(test_data['lyrics'])

In [8]:
# pad the sequences to ensure uniform length
max_len = 200
train_data = pad_sequences(train_sequences, maxlen=max_len)
test_data = pad_sequences(test_sequences, maxlen=max_len)

In [9]:
# create word embeddings using GloVe
embedding_dim = 100
embedding_dict = {}
with open("C:\\Users\\Karthiek Duggirala\\Downloads\\glove.6B\\glove.6B.100d.txt", 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = coefs

In [10]:
# create the embedding matrix for the training data
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [24]:
# create the CNN model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

In [25]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
# train the model
model.fit(train_data, pd.get_dummies(train_labels).values, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a9dd91e550>

In [27]:
loss, accuracy = model.evaluate(test_data, pd.get_dummies(test_labels).values, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 66.387725


In [15]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

In [16]:
# train the model
model.fit(train_data, pd.get_dummies(train_labels).values, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a9dc72d3d0>

In [17]:
loss, accuracy = model.evaluate(test_data, pd.get_dummies(test_labels).values, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 19.245926


In [18]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
# train the model
model.fit(train_data, pd.get_dummies(train_labels).values, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a9dd9e2910>

In [20]:
loss, accuracy = model.evaluate(test_data, pd.get_dummies(test_labels).values, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 65.186733
