In [None]:
import numpy as np
import pandas as pd
import keras
import os
import urllib.request
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [1]:
# Show the working directory that the relative file paths below resolve against.
# os.getcwd() is plain Python, so it does not depend on IPython automagic being
# enabled (the bare `pwd` form only works as the sole line of a cell).
os.getcwd()

'C:\\Users\\Karthiek Duggirala\\Downloads'

In [None]:
# Function to download GloVe embeddings file if it's not already present
def download_glove(url='http://nlp.stanford.edu/data/glove.6B.zip',
                   filename='glove.6B.100d.txt'):
    """Fetch one GloVe vectors file into the current working directory.

    Does nothing when ``filename`` already exists. Otherwise the archive at
    ``url`` is downloaded to ``glove.6B.zip`` in the working directory, the
    single member ``filename`` is extracted next to it, and the archive is
    deleted again.

    The extraction and clean-up use the standard library instead of the
    ``!unzip`` / ``!rm`` shell escapes, so the function also works on
    machines without those Unix tools (e.g. Windows) and outside IPython.

    Args:
        url: Location of the zip archive to download.
        filename: Name of the archive member to extract; also the name of
            the file whose presence short-circuits the download.

    Raises:
        urllib.error.URLError: If the archive cannot be downloaded.
        KeyError: If ``filename`` is not a member of the archive.
        zipfile.BadZipFile: If the downloaded file is not a valid zip.
    """
    import zipfile  # local import: only needed when a download actually happens

    if os.path.isfile(filename):
        return

    print('Downloading GloVe embeddings...')
    archive_path = 'glove.6B.zip'
    urllib.request.urlretrieve(url, archive_path)
    try:
        with zipfile.ZipFile(archive_path) as archive:
            archive.extract(filename)
    finally:
        # The archive is large and only one member is needed, so always remove it.
        os.remove(archive_path)

In [None]:
# Download the GloVe embeddings if they are not already present
download_glove()  # returns immediately when glove.6B.100d.txt already exists in the working directory

Downloading GloVe embeddings...


In [None]:
# Read the preprocessed dataset; header=None makes pandas label the columns 0, 1, 2, ...
# and treat every line of the file, including the first, as a data row.
df = pd.read_csv('preprocessed_data.csv', header=None)

In [None]:
# Inspect the column labels (integers, because the file was read with header=None).
df.columns

Int64Index([0, 1, 2], dtype='int64')

In [None]:
# Replace the index with a fresh 0..n-1 range and discard the old one (drop=True).
df = df.reset_index(drop=True)

In [None]:
# Remove every row that has a missing value in any column.
# NOTE(review): the preview in the next cell starts at index 1, so row 0 is
# removed here — presumably the CSV's own header line, which header=None read
# as data; confirm against the file.
df = df.dropna()

In [None]:
# Preview the first five rows that survived the dropna step.
df.head()

Unnamed: 0,0,1,2
1,0.0,summer high school when we first met we would ...,pop
2,1.0,yeah yeah yeah i can feel phoenix inside me i ...,pop
3,2.0,told them your dream they started i guess you ...,pop
4,3.0,if i lost it today would you stay could my lov...,pop
5,4.0,nice leg daisy duke make man go that is the wa...,pop


In [None]:
# Column 0 only holds a running row number (0.0, 1.0, 2.0, ... in the preview), so discard it.
df = df.drop(columns=[0])

In [None]:
# Preview again: only the lyrics text (column 1) and the genre label (column 2) remain.
df.head()

Unnamed: 0,1,2
1,summer high school when we first met we would ...,pop
2,yeah yeah yeah i can feel phoenix inside me i ...,pop
3,told them your dream they started i guess you ...,pop
4,if i lost it today would you stay could my lov...,pop
5,nice leg daisy duke make man go that is the wa...,pop


In [None]:
# Give the two remaining integer-labelled columns descriptive names.
df = df.rename({1: "lyrics", 2: "genre"}, axis="columns")

In [None]:
# Pull the model input (lyrics text) and the target (genre name) out of the
# frame as two separate arrays, selecting each column by its label.
lyrics = df["lyrics"].values
genre = df["genre"].values

In [None]:
# Encode every genre name as an integer class id.
# A name that is not a key of genre_map raises KeyError.
genre_map = {'pop': 0, 'hiphop': 1, 'rock': 2}
genre_ids = [genre_map[name] for name in genre]
genre = np.array(genre_ids)

In [None]:
# Expand the integer class ids into one-hot rows, one column per genre in genre_map.
genre = keras.utils.to_categorical(genre, num_classes=len(genre_map))

In [None]:
# Hold out 20% of the samples for the final test; the fixed random_state makes
# the split repeatable between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    lyrics,
    genre,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the vocabulary from the training texts only (num_words=10000 caps how
# many of the most frequent words are kept when encoding), then encode both
# splits as lists of integer word indices with that same tokenizer.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

In [None]:
# Bring every encoded lyric to exactly 200 tokens so the splits become
# rectangular arrays: shorter sequences are padded, longer ones truncated.
train_data, test_data = (
    pad_sequences(sequences, maxlen=200)
    for sequences in (train_sequences, test_sequences)
)

In [None]:
# Load the pre-trained GloVe embeddings into a {word: float32 vector} lookup.
# Each line of the file is a word followed by its vector components.
#
# The path is relative so it points at the file download_glove() extracts into
# the working directory (the former '/content/...' path only existed on Colab).
# The encoding is given explicitly because the platform default (for example
# cp1252 on Windows) is not guaranteed to decode the file.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Build the (vocabulary size + 1) x 100 weight table for the embedding layer.
# Row r receives the GloVe vector of the word whose tokenizer index is r;
# words that GloVe does not know keep their all-zero row.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
# Assemble the classifier as a single layer stack:
#   1. an embedding layer initialised from embedding_matrix and kept frozen
#      (trainable=False), fed 200-token inputs,
#   2. one 128-unit LSTM with input and recurrent dropout of 0.4,
#   3. a softmax layer producing one probability per genre (3 classes).
model = Sequential([
    Embedding(
        len(word_index) + 1,
        100,
        weights=[embedding_matrix],
        input_length=200,
        trainable=False,
    ),
    LSTM(128, dropout=0.4, recurrent_dropout=0.4),
    Dense(3, activation='softmax'),
])
# Categorical cross-entropy pairs with the one-hot encoded labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit for 10 epochs in batches of 128, keeping 20% of the training data aside
# for validation. The returned History object is the cell's displayed result.
model.fit(
    train_data,
    train_labels,
    validation_split=0.2,
    epochs=10,
    batch_size=128,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2de84e2ee0>

In [None]:
# Score the trained model on the held-out test split and report both numbers
# (the compiled loss and the single 'accuracy' metric, in that order).
loss, accuracy = model.evaluate(test_data, test_labels)
for caption, value in (('Test loss:', loss), ('Test accuracy:', accuracy)):
    print(caption, value)


Test loss: 0.690294623374939
Test accuracy: 0.6828606724739075
