### Genre Classification from Lyrics

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from matplotlib import pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
DATA_DIR="data/"
from sklearn.preprocessing import LabelEncoder
import time

In [2]:
with open(DATA_DIR+"lyrics_genre_MSD_1000chunks.pickle","rb") as f:
    data=pickle.load(f)
data=data[['tempo', 'artist_familiarity', 'mode', 'loudness', 'artist_hotttnesss', 'key_confidence', 'end_of_fade_in', 'start_of_fade_out', 'duration', 'time_signature', 'key', 'song_hotttnesss','genre']]
data = data[(data.genre != 'Not Available') & (data.genre != 'Other')]
print(data.shape)
data.head()

(46213, 13)


Unnamed: 0,tempo,artist_familiarity,mode,loudness,artist_hotttnesss,key_confidence,end_of_fade_in,start_of_fade_out,duration,time_signature,key,song_hotttnesss,genre
0,90.008,0.602216,1.0,-11.738,0.349761,0.283,0.136,234.516,258.63791,4.0,10.0,0.516346,Rock
2,137.122,0.835272,1.0,-6.338,0.741724,0.859,0.218,195.802,204.72118,4.0,2.0,0.826612,Rock
3,165.035,0.653299,1.0,-4.635,0.482431,0.571,0.0,206.861,232.98567,4.0,0.0,0.7028,Metal
8,135.065,0.532842,1.0,-6.664,0.407992,0.474,0.356,184.477,190.98077,4.0,9.0,0.604205,Pop
12,163.886,0.834539,0.0,-9.071,0.477311,0.511,0.061,152.868,152.86812,4.0,1.0,0.651125,Hip-Hop


In [3]:
data = pd.DataFrame()
with open('data/lyrics_genre_MSD_1000chunks.pickle', 'rb') as dataFile:
    data = pickle.load(dataFile)
data = data[['genre', 'tempo', 'artist_familiarity', 'mode', 'loudness', 'artist_hotttnesss', 'key_confidence', 'end_of_fade_in', 'start_of_fade_out', 'duration', 'time_signature', 'key', 'song_hotttnesss']]

print('Original data', data.shape)
arr=np.load("data/feature_vectors.npy")
dataf=pd.DataFrame(arr,index=None,columns=None)
print("Lyrics doc2vec data", dataf.shape)

Original data (49658, 13)
Lyrics doc2vec data (49658, 300)


In [4]:
le = LabelEncoder()
data['genre'] = le.fit_transform(data['genre'])        

In [5]:
df=pd.concat([data.reset_index(drop=True),dataf.reset_index(drop=True)],axis=1)
df=df.dropna()
y_data = df[['genre']].values.flatten()
df = df.drop(columns=["genre"])

In [6]:
dmatrix = df.values
dmatrix.shape
df=df.dropna()
df.shape

(34294, 312)

In [7]:
cX = (dmatrix - np.mean(dmatrix, axis=0))/np.std(dmatrix, axis=0)
x_train, x_test, y_train, y_test = train_test_split(cX, y_data, test_size=0.6, random_state=42)

In [8]:
classes = np.unique(y_data)
classes

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

### Classification Models

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier

In [14]:
classifier = LogisticRegression(class_weight="balanced", solver="lbfgs", multi_class='multinomial', max_iter=1000)
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 0.25647007363126045
Test Score: 0.18919181610536034


In [15]:
classifier = KNeighborsClassifier()
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 0.5494641685499745
Test Score: 0.38834621178986245


In [16]:
classifier = SVC(class_weight="balanced", gamma="scale")
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 0.5232922650725377
Test Score: 0.2928026437284347


In [None]:
classifier = GradientBoostingClassifier(n_estimators=100)
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

In [17]:
classifier = tree.DecisionTreeClassifier(class_weight="balanced")
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 1.0
Test Score: 0.293726004762599


In [19]:
classifier = RandomForestClassifier(class_weight="balanced", n_estimators=100)
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 1.0
Test Score: 0.4767944792729747


In [20]:
classifier = MLPClassifier(alpha = 1)
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 0.754683968797842
Test Score: 0.46279826991300965




In [21]:
classifier = GaussianNB()
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 0.20952103229569147
Test Score: 0.2075132429411479


In [22]:
classifier = AdaBoostClassifier()
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 0.49019464897572357
Test Score: 0.4794187685279681


In [23]:
classifier = QuadraticDiscriminantAnalysis()
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)



Train Score: 0.7804184588466866
Test Score: 0.41837974437478737


In [None]:
classifier = GaussianProcessClassifier()
classifier.fit(x_train, y_train)
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)