In [1]:
import os
import re
import string
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import f1_score

In [11]:
import pandas as pd
import tqdm

# Read test.csv from the notebook's working directory into a DataFrame.
TEST_CSV_PATH = './test.csv'
df = pd.read_csv(TEST_CSV_PATH)

# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Song,Song year,Artist,Genre,Lyrics,Track_id
0,craftsmanship,2005,buck-65,Hip-Hop,Most folks spend their days daydreaming of fin...,8294
1,come-on-out,2012,the-elwins,Indie,Take your cold hands and put them on my face\n...,21621
2,riot,2013,bullet-for-my-valentine,Metal,Are you ready it's time for war\nWe'll break d...,3301
3,that-s-what-girls-do,2007,dream-street,Pop,You ask me why I change the color of my hair\n...,2773
4,believe-in-a-dollar,2012,cassidy,Hip-Hop,Do you believe in magic in a young girl's hear...,16797


In [12]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re

# Preprocessing
def prepocessing(lyrics, remove_stopwords=False, stops=set(stopwords.words('english'))):
    """Normalise one raw lyric string into lowercase, space-separated words.

    The text is passed through BeautifulSoup (html5lib parser) to drop any
    markup, every character outside a-z / A-Z is replaced by a space, the
    result is lowercased and split on whitespace, and the words are joined
    back with single spaces.

    Args:
        lyrics: Raw lyric text to clean.
        remove_stopwords: When True, words contained in ``stops`` are dropped.
        stops: Collection of lowercase words to remove. The default is the
            NLTK English stop-word list, built once when the function is
            defined; it is only read here, never modified.

    Returns:
        The cleaned lyric as a single string.
    """
    # Parse the argument itself rather than a same-named notebook global, so
    # the function works no matter what the caller's variable is called.
    lyric_text = BeautifulSoup(lyrics, "html5lib").get_text()
    lyric_text = re.sub("[^a-zA-Z]", " ", lyric_text)
    lyric_words = lyric_text.lower().split()

    if remove_stopwords:
        lyric_words = [w for w in lyric_words if w not in stops]

    return ' '.join(lyric_words)

In [13]:
# Clean every entry of the Lyrics column with prepocessing(), showing a tqdm
# progress bar. NOTE(review): processed_lyrics is not read by any later cell
# visible in this notebook; the model input below uses the raw Lyrics column.
processed_lyrics = []
lyrics_column = df['Lyrics'].values
for lyric in tqdm.tqdm(lyrics_column):
    processed_lyrics.append(prepocessing(lyric))

100%|████████████████████████████████████████████████████████████████████████████| 7935/7935 [00:04<00:00, 1859.62it/s]


In [14]:
import numpy as np
from sklearn.model_selection import train_test_split

# Model input: the raw lyric strings.
x = df['Lyrics'].values

# Integer-encode the genre. pd.factorize numbers the genres in order of first
# appearance in this DataFrame.
# NOTE(review): that numbering only agrees with the labels the model was
# trained on if the training data produced the same order; confirm.
genre_codes = pd.factorize(df['Genre'])[0]
y = np.array(pd.Categorical(genre_codes))

# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Load the saved Keras model stored under ./word2vec; a later cell takes its
# 'w2v_embedding' layer from it.
WORD2VEC_MODEL_PATH = 'word2vec'
word2vec = tf.keras.models.load_model(WORD2VEC_MODEL_PATH)

In [10]:
# Input-pipeline and vocabulary sizes.
BUFFER_SIZE = 10000
BATCH_SIZE = 1024
VOCAB_SIZE = 1000

# Shuffled, batched, prefetched dataset of (lyric text, genre code) pairs.
# Despite its name it is built from all of x and y, not from X_train / y_train.
train_ds = tf.data.Dataset.from_tensor_slices((x, y))
train_ds = train_ds.shuffle(BUFFER_SIZE)
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

# Text-to-token-id layer whose vocabulary (at most VOCAB_SIZE tokens) is learnt
# from the text half of the dataset above.
# NOTE(review): if the weights loaded later do not restore the vocabulary, the
# token ids will come from this data rather than from training; confirm.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
text_only_ds = train_ds.map(lambda text, label: text)
encoder.adapt(text_only_ds)

# Layers are created in the same order in which they are stacked.
embedding_layer = word2vec.get_layer('w2v_embedding')
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
hidden_layer = tf.keras.layers.Dense(64, activation='relu')
# 10 softmax outputs, presumably one per genre class; verify against the data.
output_layer = tf.keras.layers.Dense(10, activation='softmax')

model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    output_layer,
])

In [15]:
# Restore the model's weights from the checkpoint under ./epoch50; the
# returned load-status object is the cell's displayed output.
WEIGHTS_PATH = './epoch50/variables/variables'
model.load_weights(WEIGHTS_PATH)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x199dc656f88>

In [32]:
# Run the model on every lyric in x; the softmax output layer yields one row
# of class scores per lyric.
predict = model.predict(x=x)

In [33]:
# Replace each row of class scores by the index of its largest entry.
# Note: this overwrites `predict`, so re-running the cell on the already
# reduced 1-D result fails (there is no axis 1 any more).
predict = predict.argmax(axis=1)

print(predict)

[0 0 1 ... 8 0 2]


In [34]:
# Rebuild the integer genre labels; codes follow the order in which each genre
# first appears in df.
# NOTE(review): confirm that this order matches the class indices the model
# outputs, otherwise the comparison with `predict` is meaningless.
genre_codes = pd.factorize(df['Genre'])[0]
y = pd.Categorical(genre_codes)

print(np.array(y))

[0 1 2 ... 0 6 6]


In [42]:
# Micro-averaged F1 of the predicted class ids against the integer genre labels.
f1 = f1_score(
    y_true=np.array(y),
    y_pred=predict,
    average='micro',
)

In [43]:
# Display the score.
# NOTE(review): the recorded value (~0.03) is very low for a 10-way softmax.
# In the frame shown further down, Metal rows are predicted as 1 but encoded
# as 2, and the Pop row is predicted as 2 but encoded as 3, which looks like
# a label-numbering mismatch rather than a weak model; confirm the mapping.
f1

0.030749842470069313

In [44]:
# Attach the predicted class ids to df as a new 'Predict' column (mutates df in place).
df['Predict'] = predict

In [46]:
# Attach the factorized genre codes to df as 'IntGenre' (mutates df in place),
# so they can be read side by side with 'Predict'.
df['IntGenre'] = y

In [47]:
# Display df so 'Genre', 'Predict' and 'IntGenre' can be compared row by row.
df

Unnamed: 0,Song,Song year,Artist,Genre,Lyrics,Track_id,Predict,IntGenre
0,craftsmanship,2005,buck-65,Hip-Hop,Most folks spend their days daydreaming of fin...,8294,0,0
1,come-on-out,2012,the-elwins,Indie,Take your cold hands and put them on my face\n...,21621,0,1
2,riot,2013,bullet-for-my-valentine,Metal,Are you ready it's time for war\nWe'll break d...,3301,1,2
3,that-s-what-girls-do,2007,dream-street,Pop,You ask me why I change the color of my hair\n...,2773,2,3
4,believe-in-a-dollar,2012,cassidy,Hip-Hop,Do you believe in magic in a young girl's hear...,16797,0,0
...,...,...,...,...,...,...,...,...
7930,too-little-too-late,2006,amanda-marshall,Rock,Tuesday night - 7:30\nI hear a voice on the te...,23453,0,6
7931,berserker,2007,aurora-borealis,Metal,Elite forces cloaked in fur un sensitive to pa...,2724,1,2
7932,natural-born-killaz,2010,dr-dre,Hip-Hop,[Dr. Dre]\nJourney with me\nInto the mind of a...,24147,8,0
7933,wide-awake,2011,chris-cornell,Rock,You can a look a hurricane right in the eye.\n...,4150,0,6
