In [1]:
import os
import re
import string
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import f1_score

In [2]:
import pandas as pd
import tqdm

df = pd.read_csv('./test.csv')

df.head(5)

Unnamed: 0,Song,Song year,Artist,Genre,Lyrics,Track_id
0,craftsmanship,2005,buck-65,Hip-Hop,Most folks spend their days daydreaming of fin...,8294
1,come-on-out,2012,the-elwins,Indie,Take your cold hands and put them on my face\n...,21621
2,riot,2013,bullet-for-my-valentine,Metal,Are you ready it's time for war\nWe'll break d...,3301
3,that-s-what-girls-do,2007,dream-street,Pop,You ask me why I change the color of my hair\n...,2773
4,believe-in-a-dollar,2012,cassidy,Hip-Hop,Do you believe in magic in a young girl's hear...,16797


In [3]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re

# Preprocessing
def prepocessing(lyrics, remove_stopwords=False, stops=set(stopwords.words('english'))):
    lyric_text = BeautifulSoup(lyric, "html5lib").get_text()
    lyric_text = re.sub("[^a-zA-Z]", " ", lyric_text)
    lyric_text = lyric_text.lower()
    
    lyric_words = lyric_text.split()
    if remove_stopwords:
        lyric_words = lyric_text.split()
        lyric_words = [w for w in lyric_words if not w in stops]
        
    lyric_text = ' '.join(lyric_words)
    return lyric_text

In [4]:
processed_lyrics = []
for lyric in tqdm.tqdm(df['Lyrics'].values):
    processed = prepocessing(lyric)
    processed_lyrics.append(processed)

100%|████████████████████████████████████████████████████████████████████████████| 7935/7935 [00:03<00:00, 2449.07it/s]


In [5]:
num_data = []

train_df = pd.read_csv('./train_filtered.csv')
total = len(train_df.values)

for genre in train_df['Genre'].unique():
    print(genre)
    num_data.append(len(train_df[train_df['Genre'] == genre]))

print(num_data)

class_weight = [
    1 - num_data[0] / total,
    1 - num_data[1] / total,
    1 - num_data[2] / total,
    1 - num_data[3] / total,
    1 - num_data[4] / total,
    1 - num_data[5] / total,
    1 - num_data[6] / total,
    1 - num_data[7] / total,
    1 - num_data[8] / total,
    1 - num_data[9] / total
]

class_code = {
    'Rock': 0,
    'Metal': 1,
    'Pop': 2,
    'Indie': 3,
    'Folk': 4,
    'Electronic': 5,
    'R&B': 6,
    'Jazz': 7,
    'Hip-Hop': 8,
    'Country': 9,
}

Rock
Metal
Pop
Indie
Folk
Electronic
R&B
Jazz
Hip-Hop
Country
[107019, 19098, 86219, 7240, 8165, 2002, 2763, 13314, 2238, 1890]


In [6]:
import numpy as np
from sklearn.model_selection import train_test_split

y = []
for genre in df['Genre']:
    y.append(class_code[genre])

x = processed_lyrics
y = np.array(y)

In [7]:
word2vec = tf.keras.models.load_model('word2vec_exp2')

In [8]:
BUFFER_SIZE = 10000
BATCH_SIZE = 1024

train_ds = tf.data.Dataset.from_tensor_slices((x, y))
train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_ds.map(lambda text, label: text))   

model = tf.keras.Sequential([
    encoder,
    word2vec.get_layer('w2v_embedding'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [9]:
model.load_weights('./exp2-2-3/variables/variables')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2212038d348>

In [9]:
model = tf.keras.models.load_model('./exp2-2-2', custom_objects={'LSTMCell': tf.keras.layers.LSTM})

In [10]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=['accuracy']
)

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 w2v_embedding (Embedding)   (None, None, 300)         1228800   
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        186880    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                        

In [12]:
predict = model.predict(x)

In [13]:
model.evaluate(np.array(x), y, batch_size=32)



[3.90234112739563, 0.09918084740638733]

In [None]:
weighted_predict = predict * class_weight

In [None]:
print(weighted_predict)

In [None]:
w_predict = (np.argmax(weighted_predict, axis=1))

print(w_predict)

In [14]:
f1 = f1_score(np.array(y), np.argmax(predict, axis=1), average='micro')

In [15]:
f1

0.09918084436042848

In [13]:
df['Predict'] = np.argmax(predict, axis=1)

In [14]:
df['IntGenre'] = y

In [15]:
df

Unnamed: 0,Song,Song year,Artist,Genre,Lyrics,Track_id,Predict,IntGenre
0,craftsmanship,2005,buck-65,Hip-Hop,Most folks spend their days daydreaming of fin...,8294,1,8
1,come-on-out,2012,the-elwins,Indie,Take your cold hands and put them on my face\n...,21621,1,3
2,riot,2013,bullet-for-my-valentine,Metal,Are you ready it's time for war\nWe'll break d...,3301,1,1
3,that-s-what-girls-do,2007,dream-street,Pop,You ask me why I change the color of my hair\n...,2773,1,2
4,believe-in-a-dollar,2012,cassidy,Hip-Hop,Do you believe in magic in a young girl's hear...,16797,1,8
...,...,...,...,...,...,...,...,...
7930,too-little-too-late,2006,amanda-marshall,Rock,Tuesday night - 7:30\nI hear a voice on the te...,23453,1,0
7931,berserker,2007,aurora-borealis,Metal,Elite forces cloaked in fur un sensitive to pa...,2724,1,1
7932,natural-born-killaz,2010,dr-dre,Hip-Hop,[Dr. Dre]\nJourney with me\nInto the mind of a...,24147,1,8
7933,wide-awake,2011,chris-cornell,Rock,You can a look a hurricane right in the eye.\n...,4150,1,0


In [16]:
df[df['Predict'] == 0]

Unnamed: 0,Song,Song year,Artist,Genre,Lyrics,Track_id,Predict,IntGenre


In [29]:
pop = df[df['Genre'] == 'Metal']

In [30]:
pop

Unnamed: 0,Song,Song year,Artist,Genre,Lyrics,Track_id,Predict,IntGenre
2,riot,2013,bullet-for-my-valentine,Metal,Are you ready it's time for war\nWe'll break d...,3301,2,1
10,the-other-side,2007,carnal-forge,Metal,Broken hopes in a world of lies\nIt makes me b...,26075,2,1
14,impure-massacre-of-bloody-souls,2007,fleshcrawl,Metal,Buried deep into the tomb\nStill alive with bl...,7650,0,1
21,do-you-dream-of-angels,2007,balance-of-power,Metal,Now your sleeping now your still\nSoftly tell ...,11989,0,1
25,after-forever,2007,biohazard,Metal,Yo this is biohazard from brooklyn new york dr...,11518,0,1
...,...,...,...,...,...,...,...,...
7915,carving-the-way,2007,delight,Metal,"Wings of butterfly inside\nMy spirit revives, ...",3440,0,1
7923,scattered-remains-splattered-brains,2006,cannibal-corpse,Metal,Prepare to witness a place of gore\nOf legal d...,17099,2,1
7925,universe,2006,evereve,Metal,Give me the sleep I need to forget\nMy pain an...,4248,0,1
7928,carbonized-eyesockets,2006,carcass,Metal,"The pungent aroma Of hot, bubbling, molten gri...",15109,0,1


In [31]:
pop[pop['Predict'] == pop['IntGenre']]

Unnamed: 0,Song,Song year,Artist,Genre,Lyrics,Track_id,Predict,IntGenre
616,slit-your-guts,2007,cryptopsy,Metal,"Pardon, please, the narrow\nConfinement of you...",20365,1,1
748,intruders,2007,exciter,Metal,"Fighters, igniters\nA storm of intruders\nDefe...",7132,1,1
855,cavalry-call,2011,beekeeper,Metal,so expected\nim in mourning\nwill i forget\nca...,11489,1,1
1418,warmaster,2006,bolt-thrower,Metal,"Throughout all time\nWithin the past, present ...",14534,1,1
1592,smash-or-be-smashed,2007,earth-crisis,Metal,"Swords beaten into rust, but not by all.\nLull...",9990,1,1
1626,house-of-shame,2006,genitortures,Metal,House of Shame.\nWelcome to the House of Shame...,16235,1,1
1635,burnin-leather,2006,bathory,Metal,It's something you can't fake it's all within\...,2658,1,1
1842,odium,2007,cadaver,Metal,Their souls omit from their necks\nHung up our...,1647,1,1
2061,the-storm,2007,aurora-borealis,Metal,Fell the wind blow across the moon lit sky\nSe...,1366,1,1
2270,i-will-destroy-the-wisdom-of-the-wise,2012,a-bullet-for-pretty-boy,Metal,[Intro] x2\nI cant look at what weve done\nAnd...,19357,1,1


# Model evalutate

In [83]:
train_processed_lyrics = []
for lyric in tqdm.tqdm(train_df['Lyrics'][:50000].values):
    train_processed = prepocessing(lyric)
    train_processed_lyrics.append(processed)

100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:20<00:00, 2385.65it/s]


In [84]:
train_y = []
for genre in train_df['Genre'][:50000]:
    train_y.append(class_code[genre])

train_x = train_processed_lyrics
train_y = np.array(train_y)

In [85]:
model.load_weights('./epoch50/variables/variables')

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=['accuracy']
)

model.evaluate(np.array(train_x), train_y)



[1.7845633029937744, 0.1628199964761734]

In [12]:
from sklearn.metrics import classification_report

y_pred = model.predict(x, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(y, y_pred_bool))

              precision    recall  f1-score   support

           0       0.50      0.00      0.00      1410
           1       0.10      0.88      0.18       810
           2       1.00      0.00      0.00      1110
           3       0.03      0.00      0.00       510
           4       0.00      0.00      0.00       495
           5       0.13      0.08      0.10       660
           6       0.05      0.04      0.05       510
           7       0.00      0.00      0.00       660
           8       0.07      0.00      0.00       960
           9       0.10      0.00      0.00       810

    accuracy                           0.10      7935
   macro avg       0.20      0.10      0.03      7935
weighted avg       0.27      0.10      0.03      7935



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
df = pd.read_csv('./preprocessed_train_data.csv')

processed_lyrics = []
for lyric in tqdm.tqdm(df['Lyrics'].values):
    processed = prepocessing(lyric)
    processed_lyrics.append(processed)
    
y = []
for genre in df['Genre']:
    y.append(class_code[genre])

x = processed_lyrics
y = np.array(y)

100%|████████████████████████████████████████████████████████████████████████| 249943/249943 [01:46<00:00, 2347.73it/s]


In [11]:
from sklearn.metrics import classification_report

y_pred = model.predict(x, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(y, y_pred_bool))

              precision    recall  f1-score   support

           0       0.18      0.12      0.14      1410
           1       0.10      0.84      0.17       810
           2       0.25      0.00      0.00      1110
           3       0.00      0.00      0.00       510
           4       0.00      0.00      0.00       495
           5       1.00      0.00      0.00       660
           6       0.00      0.00      0.00       510
           7       0.00      0.00      0.00       660
           8       0.00      0.00      0.00       960
           9       0.00      0.00      0.00       810

    accuracy                           0.11      7935
   macro avg       0.15      0.10      0.03      7935
weighted avg       0.16      0.11      0.04      7935



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
