### Music Genre Classification - Text Classification - NLP Project


#### Import Libraries

In [1]:
import pandas as pd
import numpy as np

# NLP preprocessing libraries
import string
import nltk

import time 
# only to calculate runtime

# count word frequencies to vectors and tf-idf transformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# learning models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# train and test data splitting
from sklearn.model_selection import train_test_split

#### Import Dataset

In [2]:
# Load the raw lyrics table and the artist/genre table from the working directory.
lyrics = pd.read_csv('lyrics-data.csv')
genres = pd.read_csv('artists-data.csv')

print(lyrics.shape, genres.shape)

# Discard lyric rows that have a missing value in any column.
lyrics = lyrics.dropna()

# Keep only the join key and the single column of interest from each table.
lyrics = lyrics[["ALink", "Lyric"]]
genres = genres[["Link", "Genre"]]

# Attach a genre to every lyric by matching the artist link of both tables.
new_data = pd.merge(genres, lyrics, left_on="Link", right_on="ALink")
new_data = new_data[["Genre", "Lyric"]]

# Work on a 30% random subsample. The fixed seed makes "Restart & Run All"
# draw exactly the same rows every time, so downstream results are reproducible.
new_data = new_data.sample(frac = 0.3, random_state = 42).reset_index(drop = True)

display(new_data)

(209522, 5) (3242, 6)


Unnamed: 0,Genre,Lyric
0,Hip Hop,Hey... Yeah. This goes out to the (you know). ...
1,Rock,Intro. |------------|-------------------------...
2,Rock,"Every day is so wonderful. And suddenly, it's ..."
3,Rock,"Hey hey come in they come. O.k. boys, take one..."
4,Rock,"Always, send your love to me softly.. Sweetly,..."
...,...,...
67194,Hip Hop,Scrawberry. Scrawberry. Scrawberry. Scrawberry...
67195,Pop,"I think I could like you, I already do.. Feeli..."
67196,Rock,Gosto dela. E vou continuar falando nela. Pens...
67197,Rock,There's two kinds of people that I just can't ...


#### Preprocessing

In [3]:
# Preprocessing: normalise the raw lyric text, keeping each stage in its own column.

# convert text to lowercase
new_data["Lyric_lower"] = new_data["Lyric"].str.lower()

# remove special characters, punctuations
# regex=True must be passed explicitly: since pandas 2.0 `Series.str.replace`
# defaults to regex=False and would look for the pattern as a literal string,
# leaving the text unchanged.
# NOTE(review): the character class keeps only ASCII letters and whitespace, so
# accented letters (present in the Portuguese lyrics) are stripped as well.
new_data["Lyric_no_spec"] = new_data["Lyric_lower"].str.replace(r'[^a-zA-Z\s]+', '', regex=True)

display(new_data)

Unnamed: 0,Genre,Lyric,Lyric_lower,Lyric_no_spec
0,Hip Hop,Hey... Yeah. This goes out to the (you know). ...,hey... yeah. this goes out to the (you know). ...,hey yeah this goes out to the you know verse ...
1,Rock,Intro. |------------|-------------------------...,intro. |------------|-------------------------...,intro xxxxxbf wbar xxxx wine is fine...
2,Rock,"Every day is so wonderful. And suddenly, it's ...","every day is so wonderful. and suddenly, it's ...",every day is so wonderful and suddenly its har...
3,Rock,"Hey hey come in they come. O.k. boys, take one...","hey hey come in they come. o.k. boys, take one...",hey hey come in they come ok boys take one dan...
4,Rock,"Always, send your love to me softly.. Sweetly,...","always, send your love to me softly.. sweetly,...",always send your love to me softly sweetly tur...
...,...,...,...,...
67194,Hip Hop,Scrawberry. Scrawberry. Scrawberry. Scrawberry...,scrawberry. scrawberry. scrawberry. scrawberry...,scrawberry scrawberry scrawberry scrawberry th...
67195,Pop,"I think I could like you, I already do.. Feeli...","i think i could like you, i already do.. feeli...",i think i could like you i already do feelings...
67196,Rock,Gosto dela. E vou continuar falando nela. Pens...,gosto dela. e vou continuar falando nela. pens...,gosto dela e vou continuar falando nela penso ...
67197,Rock,There's two kinds of people that I just can't ...,there's two kinds of people that i just can't ...,theres two kinds of people that i just cant st...


In [4]:
# remove stop words
from nltk.corpus import stopwords

# Stored as a set so the per-token membership test below is fast.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` without the tokens that appear in `stop_words`.

    The input is coerced with str(), split on whitespace, and the surviving
    tokens are joined back together with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in stop_words)
    return " ".join(kept_tokens)

new_data["Lyric_no_stop"] = new_data["Lyric_no_spec"].apply(remove_stopwords)
display(new_data)

Unnamed: 0,Genre,Lyric,Lyric_lower,Lyric_no_spec,Lyric_no_stop
0,Hip Hop,Hey... Yeah. This goes out to the (you know). ...,hey... yeah. this goes out to the (you know). ...,hey yeah this goes out to the you know verse ...,hey yeah goes know verse hey light skinned gir...
1,Rock,Intro. |------------|-------------------------...,intro. |------------|-------------------------...,intro xxxxxbf wbar xxxx wine is fine...,intro xxxxxbf wbar xxxx wine fine whiskeys qui...
2,Rock,"Every day is so wonderful. And suddenly, it's ...","every day is so wonderful. and suddenly, it's ...",every day is so wonderful and suddenly its har...,every day wonderful suddenly hard breathe get ...
3,Rock,"Hey hey come in they come. O.k. boys, take one...","hey hey come in they come. o.k. boys, take one...",hey hey come in they come ok boys take one dan...,hey hey come come ok boys take one danny junio...
4,Rock,"Always, send your love to me softly.. Sweetly,...","always, send your love to me softly.. sweetly,...",always send your love to me softly sweetly tur...,always send love softly sweetly turn pages alw...
...,...,...,...,...,...
67194,Hip Hop,Scrawberry. Scrawberry. Scrawberry. Scrawberry...,scrawberry. scrawberry. scrawberry. scrawberry...,scrawberry scrawberry scrawberry scrawberry th...,scrawberry scrawberry scrawberry scrawberry ca...
67195,Pop,"I think I could like you, I already do.. Feeli...","i think i could like you, i already do.. feeli...",i think i could like you i already do feelings...,think could like already feelings grow go away...
67196,Rock,Gosto dela. E vou continuar falando nela. Pens...,gosto dela. e vou continuar falando nela. pens...,gosto dela e vou continuar falando nela penso ...,gosto dela e vou continuar falando nela penso ...
67197,Rock,There's two kinds of people that I just can't ...,there's two kinds of people that i just can't ...,theres two kinds of people that i just cant st...,theres two kinds people cant stand evil hearte...


In [5]:
start_time = time.time()
# Stemming
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')
def stemming(text):
    """Return `text` with every whitespace-separated token replaced by its stem.

    Tokens are stemmed with the module-level English Snowball `stemmer` and
    re-joined with single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

new_data["cleaned_lyrics"] = new_data["Lyric_no_stop"].apply(stemming)
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))
display(new_data)

Runtime: 189.5874729156494 seconds


Unnamed: 0,Genre,Lyric,Lyric_lower,Lyric_no_spec,Lyric_no_stop,cleaned_lyrics
0,Hip Hop,Hey... Yeah. This goes out to the (you know). ...,hey... yeah. this goes out to the (you know). ...,hey yeah this goes out to the you know verse ...,hey yeah goes know verse hey light skinned gir...,hey yeah goe know vers hey light skin girl cur...
1,Rock,Intro. |------------|-------------------------...,intro. |------------|-------------------------...,intro xxxxxbf wbar xxxx wine is fine...,intro xxxxxbf wbar xxxx wine fine whiskeys qui...,intro xxxxxbf wbar xxxx wine fine whiskey quic...
2,Rock,"Every day is so wonderful. And suddenly, it's ...","every day is so wonderful. and suddenly, it's ...",every day is so wonderful and suddenly its har...,every day wonderful suddenly hard breathe get ...,everi day wonder sudden hard breath get insecu...
3,Rock,"Hey hey come in they come. O.k. boys, take one...","hey hey come in they come. o.k. boys, take one...",hey hey come in they come ok boys take one dan...,hey hey come come ok boys take one danny junio...,hey hey come come ok boy take one danni junior...
4,Rock,"Always, send your love to me softly.. Sweetly,...","always, send your love to me softly.. sweetly,...",always send your love to me softly sweetly tur...,always send love softly sweetly turn pages alw...,alway send love soft sweet turn page alway nig...
...,...,...,...,...,...,...
67194,Hip Hop,Scrawberry. Scrawberry. Scrawberry. Scrawberry...,scrawberry. scrawberry. scrawberry. scrawberry...,scrawberry scrawberry scrawberry scrawberry th...,scrawberry scrawberry scrawberry scrawberry ca...,scrawberri scrawberri scrawberri scrawberri ca...
67195,Pop,"I think I could like you, I already do.. Feeli...","i think i could like you, i already do.. feeli...",i think i could like you i already do feelings...,think could like already feelings grow go away...,think could like alreadi feel grow go away you...
67196,Rock,Gosto dela. E vou continuar falando nela. Pens...,gosto dela. e vou continuar falando nela. pens...,gosto dela e vou continuar falando nela penso ...,gosto dela e vou continuar falando nela penso ...,gosto dela e vou continuar falando nela penso ...
67197,Rock,There's two kinds of people that I just can't ...,there's two kinds of people that i just can't ...,theres two kinds of people that i just cant st...,theres two kinds people cant stand evil hearte...,there two kind peopl cant stand evil heart wom...


In [6]:
# Keep only the fully cleaned text and its label, exposing the text column as "Lyrics".
new_data = new_data[["cleaned_lyrics", "Genre"]].rename(columns={"cleaned_lyrics": "Lyrics"})

display(new_data)

Unnamed: 0,Lyrics,Genre
0,hey yeah goe know vers hey light skin girl cur...,Hip Hop
1,intro xxxxxbf wbar xxxx wine fine whiskey quic...,Rock
2,everi day wonder sudden hard breath get insecu...,Rock
3,hey hey come come ok boy take one danni junior...,Rock
4,alway send love soft sweet turn page alway nig...,Rock
...,...,...
67194,scrawberri scrawberri scrawberri scrawberri ca...,Hip Hop
67195,think could like alreadi feel grow go away you...,Pop
67196,gosto dela e vou continuar falando nela penso ...,Rock
67197,there two kind peopl cant stand evil heart wom...,Rock


#### Model Learning

In [7]:
# Hold out 20% of the samples for evaluation. The fixed random_state makes the
# split identical on every run, so the scores reported below can be reproduced.
feature_train, feature_test, target_train, target_test = train_test_split(
    new_data["Lyrics"], new_data["Genre"], test_size = 0.2, random_state = 42
)
print(feature_train.shape)
print(target_train.shape)
print(feature_test.shape)
print(target_test.shape)

(53759,)
(53759,)
(13440,)
(13440,)


#### Create bag of words with CountVectorizer

In [8]:
start_time = time.time()

# Learn the vocabulary from the training lyrics only.
count_vector = CountVectorizer()
count_vector.fit(feature_train.values)

# Encode both splits as token-count matrices over that vocabulary.
train_words, test_words = (
    count_vector.transform(part.values) for part in (feature_train, feature_test)
)

print(train_words.shape, test_words.shape, sep="\n")
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))

(53759, 121834)
(13440, 121834)
Runtime: 21.622997999191284 seconds


In [9]:
start_time = time.time()

# Fit the TF-IDF weighting on the training count matrix only.
transformer = TfidfTransformer()
transformer.fit(train_words)

# Re-weight both matrices. The count matrices are overwritten under the same
# names, so re-running this cell alone would transform already-weighted values.
train_words, test_words = (
    transformer.transform(counts) for counts in (train_words, test_words)
)

print(train_words.shape, test_words.shape, sep="\n")
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))

(53759, 121834)
(13440, 121834)
Runtime: 0.4307522773742676 seconds


### Training Model

#### Logistic Regression

In [10]:
start_time = time.time()
# Train a logistic-regression classifier on the TF-IDF features, then label the test split.
log = LogisticRegression(solver="lbfgs", max_iter=400, multi_class="auto")
log.fit(train_words, target_train.values)
log_pred = log.predict(test_words)
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))

Runtime: 104.34198713302612 seconds


#### LinearSVC

In [11]:
start_time = time.time()
# Train a linear support-vector classifier on the TF-IDF features, then label the test split.
svc = LinearSVC(max_iter=1000)
svc.fit(train_words, target_train.values)
svc_pred = svc.predict(test_words)
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))

Runtime: 6.802098512649536 seconds


#### SGDClassifier

In [12]:
start_time = time.time()
# Train an SGD-optimised linear classifier on the TF-IDF features, then label the test split.
sgd = SGDClassifier(max_iter=1000)
sgd.fit(train_words, target_train.values)
sgd_pred = sgd.predict(test_words)
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))

Runtime: 2.083800792694092 seconds


#### Multinomial Naive Bayes

In [13]:
start_time = time.time()
# Train a multinomial naive Bayes classifier on the TF-IDF features, then label the test split.
nb = MultinomialNB()
nb.fit(train_words, target_train.values)
nb_pred = nb.predict(test_words)
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))

Runtime: 0.31381845474243164 seconds


#### DecisionTreeClassifier

In [14]:
start_time = time.time()
# Train a decision-tree classifier on the TF-IDF features, then label the test split.
tree = DecisionTreeClassifier()
tree.fit(train_words, target_train.values)
tree_pred = tree.predict(test_words)
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))

Runtime: 137.36014318466187 seconds


#### Calculate evaluation with metrics

In [16]:
# (heading, predictions) pairs, listed in the order the models were trained.
# Every heading after the first starts with a newline so the reports are
# separated by a blank line in the output.
model_reports = [
    ("Logistic Regression:", log_pred),
    ("\nLinearSVC:", svc_pred),
    ("\nSGDClassifier:", sgd_pred),
    ("\nMultinomialNB:", nb_pred),
    ("\nDecisionTreeClassifier:", tree_pred),
]

# Ground-truth labels of the held-out split, shared by every report.
y_true = target_test.values

# For each model print its accuracy, per-class report and confusion matrix.
for heading, y_pred in model_reports:
    print(heading)
    print(accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

Logistic Regression:
0.6330357142857143
              precision    recall  f1-score   support

Funk Carioca       0.62      0.43      0.51       480
     Hip Hop       0.73      0.53      0.61      1809
         Pop       0.54      0.45      0.49      3704
        Rock       0.66      0.78      0.71      4483
       Samba       0.65      0.46      0.54       887
   Sertanejo       0.64      0.85      0.73      2077

    accuracy                           0.63     13440
   macro avg       0.64      0.58      0.60     13440
weighted avg       0.63      0.63      0.62     13440

[[ 207   25   66   30   22  130]
 [  37  963  468  248   16   77]
 [  46  256 1661 1363   74  304]
 [   0   62  712 3490   51  168]
 [  10   12   70   71  412  312]
 [  32    6  120   83   61 1775]]

LinearSVC:
0.640625
              precision    recall  f1-score   support

Funk Carioca       0.61      0.50      0.55       480
     Hip Hop       0.71      0.59      0.64      1809
         Pop       0.53      0.47 

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

Funk Carioca       0.00      0.00      0.00       480
     Hip Hop       0.82      0.26      0.40      1809
         Pop       0.60      0.13      0.22      3704
        Rock       0.55      0.80      0.65      4483
       Samba       0.94      0.03      0.06       887
   Sertanejo       0.37      0.99      0.54      2077

    accuracy                           0.49     13440
   macro avg       0.55      0.37      0.31     13440
weighted avg       0.58      0.49      0.42     13440

[[   0    1   14    8    1  456]
 [   0  471  199  759    0  380]
 [   0   98  485 2189    1  931]
 [   0    5   79 3567    0  832]
 [   0    0    9    4   29  845]
 [   0    0   19   10    0 2048]]

DecisionTreeClassifier:
0.5275297619047619
              precision    recall  f1-score   support

Funk Carioca       0.36      0.38      0.37       480
     Hip Hop       0.56      0.57      0.56      1809
         Pop       0.42      0.42      0.42      37

> Judging by the accuracy results above, LinearSVC and LogisticRegression are the two best-performing methods for this text-classification task.

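Running the vectorizer, the TF-IDF transformer and the model as separate manual steps, as above, makes it easy to leak data (for example by accidentally fitting CountVectorizer or TfidfTransformer on the test set).<br>
Solution: an all-in-one approach with average speed: use a Pipeline to chain CountVectorizer, TfidfTransformer and the learning model.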

In [17]:
from sklearn.pipeline import Pipeline

start_time = time.time()

# Estimator used as the final pipeline step. To try another model, replace it
# with one of:
#     MultinomialNB()
#     LogisticRegression(solver = "lbfgs", multi_class = "auto", max_iter = 500)
#     SGDClassifier()
#     DecisionTreeClassifier()
classifier = LinearSVC()

# Accuracy ranking of the models tried above, as recorded by the author (best first):
#     1st. LinearSVC
#     2nd. LogisticRegression
#     3rd. SGDClassifier
#     4th. DecisionTreeClassifier
#     5th. MultinomialNB

# Chain counting, TF-IDF weighting and the classifier into a single estimator.
pipeline = Pipeline(steps=[
    ('count_vector', CountVectorizer()),
    ('tfidf_trans', TfidfTransformer()),
    ('model', classifier),
])

# Fit every step on the raw training lyrics, then predict the held-out lyrics.
pipeline.fit(feature_train.values, target_train.values)

predicted = pipeline.predict(feature_test.values)
elapsed = time.time() - start_time
print("Runtime: {} seconds".format(elapsed))

Runtime: 19.21915364265442 seconds


#### Calculate evaluation with metrics

In [18]:
# Print accuracy, the per-class report and the confusion matrix for the pipeline's predictions.
for metric in (accuracy_score, classification_report, confusion_matrix):
    print(metric(target_test, predicted))

0.640625
              precision    recall  f1-score   support

Funk Carioca       0.61      0.50      0.55       480
     Hip Hop       0.71      0.59      0.64      1809
         Pop       0.53      0.47      0.50      3704
        Rock       0.67      0.76      0.71      4483
       Samba       0.63      0.54      0.58       887
   Sertanejo       0.69      0.82      0.75      2077

    accuracy                           0.64     13440
   macro avg       0.64      0.61      0.62     13440
weighted avg       0.64      0.64      0.63     13440

[[ 241   33   64   22   18  102]
 [  35 1060  433  205   25   51]
 [  58  300 1725 1289   89  243]
 [   5   87  820 3393   60  118]
 [  12    8   72   61  480  254]
 [  47   10  134   88   87 1711]]


### Music Genre Classification - Text Classification - NLP 
