In [59]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from collections import defaultdict
import pprint
import re
from gensim import models
from scipy.sparse import lil_matrix, hstack, csr_matrix
import gensim.downloader as api

# Data Preprocessing
In this section, we preprocess the data and transform raw text data to matrix form. Then, all data is divided into training set and test set. After that, a dictionary is built upon training set.

In [60]:
def specific_preprocess(doc):
    return simple_preprocess(doc,min_len=1)

def remove_specific_words(s):
    s = re.sub(r"\bLyrics\[.+\]"," ",s)
    s = re.sub(r"\[.+\]"," ",s)
    return s

df = pd.read_csv("data/billboard_lyrics_genres.csv")

df["lyrics"] = df["lyrics"].map(remove_specific_words)
df["lyrics"] = df["lyrics"].map(remove_stopwords)
df["lyrics"] = df["lyrics"].map(specific_preprocess)

Then delete the songs that are not English

In [61]:
def isEnglish(w):
    return w.encode("utf-8").isalpha()

def isListEnglish(L):
    return all(map(isEnglish,L))

df["isEnglish"] = df["lyrics"].map(isListEnglish)
df = df.loc[df["isEnglish"],:]

Similarly, perform the same procedure to genre

In [62]:
def remove_pun(s):
    s = re.sub(r"\[\'"," ",s)
    s = re.sub(r"\'\]"," ",s)
    s = re.sub(r"\'"," ",s)
    s = re.sub(r"\[\]"," ",s)
    s = re.sub(r"\,"," ",s)
    s = s.split()
    s = [token.lower() for token in s]
    return s


df["genre"] = df["genre"].map(remove_pun)

In [63]:
freq_gen = defaultdict(int)
for text in df["genre"]:
    for token in text:
        freq_gen[token] += 1

processed_corpus_gen = [[token for token in text if freq_gen[token]>20] for text in df.loc[:,"genre"]]
dict_gen = corpora.Dictionary(processed_corpus_gen)
freq_wanted = {k: v for k,v in freq_gen.items() if v > 100}
pprint.pprint(freq_wanted)

{'alternative': 146,
 'and': 157,
 'country': 416,
 'dance-pop': 155,
 'disco': 147,
 'folk': 141,
 'funk': 170,
 'hard': 102,
 'hip': 432,
 'hop': 374,
 'new': 168,
 'pop': 1381,
 'r&b': 665,
 'rap': 114,
 'rock': 1606,
 'roll': 111,
 'soft': 322,
 'soul': 476,
 'wave': 109}


In this way, we can sort out the genre we want is alternative, country, dance, disco, folk, funk, hip-hop, new wave, pop, r&b, rap, rock, soul (soft stands for soft rock)

In [64]:
gen_des = ["alternative","country","dance","disco","folk","funk","hip","new","pop","r&b","rap","rock","soul"]
gen_des = sorted(gen_des)

# Compute number of columns from maximum word ID in the training data
num_cols = len(gen_des)
dat_gen = lil_matrix((len(df), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    for word in row["genre"]:
        for k in range(len(gen_des)):
            if re.search(gen_des[k],word):
                dat_gen[i,k] = 1
            else:
                dat_gen[i,k] = 0
df[df["genre"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_gen = pd.DataFrame.sparse.from_spmatrix(dat_gen)

Then, we should tag the data for classification.

In [65]:
df["label"] = np.zeros(df.shape[0])

bins = [1970,1980,1990,2000,2010,np.inf]

labels = [0,1,2,3,4,5]

df["label"] = np.where(df["year"] < bins[0], labels[0],
                               np.where(df["year"] < bins[1], labels[1],
                                        np.where(df["year"] < bins[2], labels[2],
                                                 np.where(df["year"] < bins[3], labels[3],
                                                          np.where(df["year"] < bins[4], labels[4], labels[5])))))

Then, data is split to training set and test set.

In [66]:
np.random.seed(515)
idx = np.repeat(range(10),len(df.iloc[:,0])//10+1)
df["idx"] = np.random.choice(idx[range(len(df.iloc[:,0]))],size=len(df.iloc[:,0]))
df_train = df.loc[df["idx"]!=0,:]
df_test = df.loc[df["idx"]==0,:]

Build a dictionary based on training set.

In [67]:
freq = defaultdict(int)
for text in df_train["lyrics"]:
    for token in text:
        freq[token] += 1

processed_corpus = [[token for token in text if freq[token]>20] for text in df_train.loc[:,"lyrics"]]
dictionary = corpora.Dictionary(processed_corpus)
df_train["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]


In [68]:
# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_train = lil_matrix((len(df_train), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["freq_count"]]
    values = [value for _, value in row["freq_count"]]
    dat_train[i, indices] = values
df_train[df_train["freq_count"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_train = pd.DataFrame.sparse.from_spmatrix(dat_train)

Then, perform the same procedure to test set with the dictionary.

In [69]:
df_test = df.loc[df["idx"]==0,:]
processed_corpus = [[token for token in text if freq[token]>20] for text in df_test.loc[:,"lyrics"]]
df_test["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]

# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_test = lil_matrix((len(df_test), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["freq_count"] if count < num_cols]
    values = [value for count, value in row["freq_count"] if count < num_cols and value!=0]
    dat_test[i, indices] = values
df_test[df_test["freq_count"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_test = pd.DataFrame.sparse.from_spmatrix(dat_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]


# Perform TF-IDF

In [70]:
bow_corpus = list(df_train["freq_count"])
tfidf = models.TfidfModel(bow_corpus)
df_train["tfidf"]=tfidf[df_train["lyrics"].map(dictionary.doc2bow)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["tfidf"]=tfidf[df_train["lyrics"].map(dictionary.doc2bow)]


In [71]:
# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_tfidf_train = lil_matrix((len(df_train), num_cols), dtype=np.float64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["tfidf"]]
    values = [value for _, value in row["tfidf"]]
    dat_tfidf_train[i, indices] = values
df_train[df_train["tfidf"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_tfidf_train = pd.DataFrame.sparse.from_spmatrix(dat_tfidf_train)

In [72]:
df_test["tfidf"]=tfidf[df_test["lyrics"].map(dictionary.doc2bow)]

# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_tfidf_test = lil_matrix((len(df_test), num_cols), dtype=np.float64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["tfidf"] if count < num_cols]
    values = [value for count, value in row["tfidf"] if count < num_cols and value != 0]
    dat_tfidf_test[i, indices] = values
df_test[df_test["tfidf"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_tfidf_test = pd.DataFrame.sparse.from_spmatrix(dat_tfidf_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["tfidf"]=tfidf[df_test["lyrics"].map(dictionary.doc2bow)]


# Processed Data
The data processed are diveded into the blow categories:

Original word frequency + genre

TF-IDF word frequency + genre


In [73]:
dat_gen = dat_gen.reset_index()
df = df.reset_index(drop=True)
dat_gen_train = dat_gen.loc[df["idx"]!=0,:].reset_index(drop=True)
dat_gen_test = dat_gen.loc[df["idx"]==0,:].reset_index(drop=True)

In [74]:
train_label = csr_matrix(df_train.loc[:,"label"]).transpose()
test_label = csr_matrix(df_test.loc[:,"label"]).transpose()

gen_train = csr_matrix(dat_gen_train.loc[:,0:])
lyrics_train = csr_matrix(dat_train.loc[:,0:])
data_train = hstack([gen_train, lyrics_train,train_label])
data_train = pd.DataFrame.sparse.from_spmatrix(data_train)

gen_test = csr_matrix(dat_gen_test.loc[:,0:])
lyrics_test = csr_matrix(dat_test.loc[:,0:])
data_test = hstack([gen_test, lyrics_test,test_label])
data_test = pd.DataFrame.sparse.from_spmatrix(data_test)


lyrics_tfidf_train = csr_matrix(dat_tfidf_train.loc[:,0:])
data_tfidf_train = hstack([gen_train,lyrics_tfidf_train,train_label])
data_tfidf_train = pd.DataFrame.sparse.from_spmatrix(data_tfidf_train)

lyrics_tfidf_test = csr_matrix(dat_tfidf_test.loc[:,0:])
data_tfidf_test = hstack([gen_test,lyrics_tfidf_test,test_label])
data_tfidf_test = pd.DataFrame.sparse.from_spmatrix(data_tfidf_test)

In [75]:
word_name = [dictionary[i] for i in range(max(dictionary.keys())+1)]
word_name = gen_des + word_name + ['label']
data_tfidf_test.columns = word_name
data_tfidf_train.columns = word_name
data_train.columns = word_name
data_test.columns = word_name

In [76]:
# data_tfidf_train.to_csv("data/train_tfidf_data.csv")
# data_tfidf_test.to_csv("data/test_tfidf_data.csv")
# data_train = hstack([lyrics_train,train_label])
# data_train = pd.DataFrame.sparse.from_spmatrix(data_train)
# data_test = hstack([lyrics_test,test_label])
# data_test = pd.DataFrame.sparse.from_spmatrix(data_test)
# word_name = [dictionary[i] for i in range(max(dictionary.keys())+1)]
# word_name = word_name+['label']
# data_train.columns = word_name
# data_test.columns = word_name

# data_train.to_csv("data/train_data_all.csv")
# data_test.to_csv("data/test_data_all.csv")

  data_tfidf_train.to_csv("data/train_tfidf_data.csv")
  data_tfidf_test.to_csv("data/test_tfidf_data.csv")


# Logistic Regression

In [112]:
from sklearn.linear_model import LogisticRegression

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.3540772532188841
4.954935622317596


Direct Use of Multinomial Logistic Regression's Performance is very bad.

Hence, we consider here multi logistic scenario.

In [156]:
# train models
label_60 = df_train["label"]==0
mr60 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_60)
label_70 = df_train["label"]==1
mr70 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_70)
label_80 = df_train["label"]==2
mr80 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_80)
label_90 = df_train["label"]==3
mr90 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_90)
label_00 = df_train["label"]==4
mr00 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_00)
label_10 = df_train["label"]==5
mr10 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_10)

def predict_multi_logit(df):
    prob60 = mr60.predict_log_proba(df)[:,1]
    prob70 = mr70.predict_log_proba(df)[:,1]
    prob80 = mr80.predict_log_proba(df)[:,1]
    prob90 = mr90.predict_log_proba(df)[:,1]
    prob00 = mr00.predict_log_proba(df)[:,1]
    prob10 = mr10.predict_log_proba(df)[:,1]
    prob = pd.DataFrame([prob60,prob70,prob80,prob90,prob00,prob10])
    return prob.apply(np.argmax,axis=0)

In [157]:
pred = predict_multi_logit(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

In [166]:
df_test["label"]

24      0
45      0
49      0
65      0
66      0
       ..
5303    5
5349    5
5400    5
5404    5
5419    5
Name: label, Length: 466, dtype: int32

In [169]:
print(sum(np.array(pred) == np.array(df_test["label"]))/len(pred))

0.3540772532188841


In [135]:
label_60 = df_train["label"]==0
label_60_test = df_test["label"]==0

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_60)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_60_test)/len(pred))

0.8218884120171673


In [137]:
label_70 = df_train["label"]==1
label_70_test = df_test["label"]==1

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_70)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_70_test)/len(pred))

0.8347639484978541


In [138]:
label_80 = df_train["label"]==2
label_80_test = df_test["label"]==2

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_80)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_80_test)/len(pred))

0.8540772532188842


In [139]:
label_90 = df_train["label"]==3
label_90_test = df_test["label"]==3

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_90)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_90_test)/len(pred))

0.8326180257510729


In [143]:
label_00 = df_train["label"]==4
label_00_test = df_test["label"]==4

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_00)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_00_test)/len(pred))

0.8583690987124464


In [148]:
label_10 = df_train["label"]==5
label_10_test = df_test["label"]==5

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_10)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_10_test)/len(pred))

0.8283261802575107


In [151]:
mr.predict_log_proba(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])[:,1]

array([-1.05760951, -1.05760951, -1.05760951, -1.05760951, -1.84959223,
       -2.15934671, -1.05760951, -2.15934671, -4.62434577, -1.05760951,
       -1.45467241, -2.54483324, -1.05760951, -1.05760951, -1.45467241,
       -1.45467241, -1.45467241, -1.05760951, -3.24836749, -1.05760951,
       -1.84959223, -1.45467241, -2.15934671, -1.45467241, -1.05760951,
       -3.40862264, -3.40862264, -3.24836749, -1.05760951, -1.05760951,
       -1.05760951, -3.40862264, -1.05760951, -3.24836749, -1.05760951,
       -2.33079205, -3.40862264, -1.05760951, -3.24836749, -2.15934671,
       -1.05760951, -2.15934671, -3.24836749, -1.05760951, -4.62434577,
       -2.15934671, -3.40862264, -3.40862264, -2.33079205, -3.40862264,
       -1.05760951, -3.40862264, -1.05760951, -3.40862264, -3.40862264,
       -3.40862264, -3.40862264, -3.40862264, -2.15934671, -2.54483324,
       -2.15934671, -2.15934671, -2.33079205, -3.40862264, -1.84959223,
       -2.15934671, -1.05760951, -3.20726203, -2.15934671, -3.40

In [146]:
label_10 = df_train["label"]==5
label_10_test = df_test["label"]==5

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:len(gen_des)],label_10)
pred = mr.predict(data_tfidf_test.iloc[:,:len(gen_des)])

print(sum(pred == label_10_test)/len(pred))

0.8283261802575107


In [113]:
mr = LogisticRegression(penalty='l1',solver='liblinear').fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.3540772532188841
4.954935622317596


In [114]:
mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:len(gen_des)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:len(gen_des)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.3540772532188841
4.954935622317596


In [115]:
mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,len(gen_des):(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,len(gen_des):(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.17381974248927037
9.17596566523605


In [116]:
mr = LogisticRegression(penalty='l1',solver="liblinear").fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_test.iloc[:,:(data_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.39914163090128757
2.847639484978541


In [120]:
from sklearn.svm import SVC 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(with_mean=False),SVC(gamma='auto'))
clf.fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
pred = clf.predict(data_test.iloc[:,:(data_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.17381974248927037
9.169527896995708


In [121]:
pred = clf.predict(data_test.iloc[:,:(data_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.38412017167381973
2.944206008583691


In [133]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process.kernels import RBF
krr = KernelRidge(kernel="rbf")
krr.fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
pred = krr.predict(data_test.iloc[:,:(data_test.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.0
1.9715226190092192


In [134]:
pred

array([ 7.58669991e-01,  6.33187301e-01,  1.25089401e+00,  6.05644719e-01,
        2.85934843e+00,  7.29419154e-01,  1.50625893e+00,  9.76500843e-01,
        2.39299635e-01,  2.19459650e+00,  9.45028864e-01,  4.99883965e-01,
        1.41371816e+00,  2.18710075e+00,  8.19708537e-01,  6.58455754e-01,
        2.04044767e+00,  8.99073823e-01,  3.61597019e-01,  1.61709703e+00,
        9.51757965e-01,  1.28238660e+00,  2.87098267e-01,  2.98056482e+00,
        1.25377523e+00,  7.61212906e-01,  4.61243924e-01,  2.44101023e+00,
        1.39601197e+00,  1.61628588e+00,  8.68157277e-01,  8.21974898e-01,
        6.49968742e-01,  1.16604946e+00,  1.07099592e+00,  9.73266820e-01,
        1.56799796e+00,  2.14615292e+00,  1.13195589e+00,  8.44978386e-01,
        1.79716250e+00,  1.77871976e+00,  9.02608821e-01,  9.80913511e-01,
        1.56471308e+00,  8.95150682e-01,  1.86381207e+00,  6.11169134e-01,
        2.00341177e+00,  1.21429975e+00,  1.58086962e+00,  7.51002094e-01,
        2.11713234e+00,  