In [77]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from collections import defaultdict
import pprint
import re
from gensim import models
from scipy.sparse import lil_matrix, hstack, csr_matrix, vstack
import gensim.downloader as api
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases

In [78]:
def specific_preprocess(doc):
    return simple_preprocess(doc,min_len=1)

def remove_specific_words(s):
    s = re.sub(r"\bLyrics"," ",s)
    s = re.sub(r"\[.+\]"," ",s)
    s = re.sub(r"\b\d+\b Contributors"," ",s)
    s = re.sub(r"Embed"," ",s)
    return s

def count_space(s):
    return s.count(' ')

def remove_short_words(s):
    s = re.sub(r"\b..\b"," ",s)
    s = re.sub(r"\b . \b"," ",s)
    pronoun = [r"\b you\b",r"\b yours\b",r"\b him \b",r"\b his\b", r"\b she \b", r"\b her \b", r"\b hers\b",
               r"\b they \b", r"\b them \b", r"\b their \b", r"\b theirs \b",r"\b You\b",r"\b Yours\b",
               r"\b Him \b",r"\b His\b", r"\b She \b", r"\b Her \b", r"\b Hers\b",
               r"\b They \b", r"\b Them \b", r"\b Their \b", r"\b Theirs \b"]
    conj = [r"\b and \b", r"\b then \b",r"\b for\b", r"\b from\b", r"\b with\b",
            r"\b about\b",r"\b And \b", r"\b Then \b",r"\b For\b", r"\b From\b", r"\b With\b",
            r"\b About\b"]
    for word in pronoun:
        s = re.sub(word," ",s)
    for word in conj:
        s = re.sub(word," ",s)
    return s

df = pd.read_csv("data/billboard_lyrics_genres.csv")
df_activeyear = pd.read_csv("data/first_active_years.csv")
df_activeyear = df_activeyear.drop_duplicates(subset=["band_singer","title","year"],ignore_index=True)
df["active_years"] = 0

for i in range(df.shape[0]):
    ay_tmp = df_activeyear.loc[(df_activeyear["band_singer"]==df.loc[i,"band_singer"])&(df_activeyear["title"]==df.loc[i,"title"])&(df_activeyear["year"]==df.loc[i,"year"])].active_years
    if not ay_tmp.empty:
        df.loc[i,"active_years"] = int(ay_tmp)

df_tmp = df.loc[df["active_years"]!=0].reset_index(drop=True)

df["numword"] = df["lyrics"].map(count_space)
df["lyrics"] = df["lyrics"].map(remove_specific_words)
df["lyrics"] = df["lyrics"].map(remove_stopwords)
df["lyrics"] = df["lyrics"].map(specific_preprocess)


  df.loc[i,"active_years"] = int(ay_tmp)


In [79]:
def isEnglish(w):
    return w.encode("utf-8").isalpha()

def isListEnglish(L):
    return all(map(isEnglish,L))

df["isEnglish"] = df["lyrics"].map(isListEnglish)
df = df.loc[df["isEnglish"],:]

In [80]:
# lemmatizer = WordNetLemmatizer()
# def lemmatize(L):
#     res = list(map(lemmatizer.lemmatize,L))
#     return res
# df["lyrics"]=df["lyrics"].map(lemmatize)

# docs = list(df["lyrics"])
# bigram = Phrases(docs,min_count=20)
# for idx in range(len(docs)):
#     for token in bigram[docs[idx]]:
#         if '_' in token:
#             docs[idx].append(token)

# df["lyrics"] = docs

In [81]:
def remove_pun(s):
    s = re.sub(r"\[\'"," ",s)
    s = re.sub(r"\'\]"," ",s)
    s = re.sub(r"\'"," ",s)
    s = re.sub(r"\[\]"," ",s)
    s = re.sub(r"\,"," ",s)
    s = s.split()
    s = [token.lower() for token in s]
    return s


df["genre"] = df["genre"].map(remove_pun)

In [82]:
gen_des = ["alternative","country","dance","disco","folk","funk","hip","new","pop","r&b","rap","rock","soul"]
gen_des = sorted(gen_des)

# Compute number of columns from maximum word ID in the training data
num_cols = len(gen_des)
dat_gen = lil_matrix((len(df), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    for word in row["genre"]:
        for k in range(len(gen_des)):
            if re.search(gen_des[k],word):
                dat_gen[i,k] = 1
df[df["genre"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_gen = pd.DataFrame.sparse.from_spmatrix(dat_gen)

In [65]:
df["label"] = np.zeros(df.shape[0])

bins = [1970,1980,1990,2000,2010,np.inf]

labels = [0,1,2,3,4,5]

df["label"] = np.where(df["year"] < bins[0], labels[0],
                               np.where(df["year"] < bins[1], labels[1],
                                        np.where(df["year"] < bins[2], labels[2],
                                                 np.where(df["year"] < bins[3], labels[3],
                                                          np.where(df["year"] < bins[4], labels[4], labels[5])))))

In [84]:
df["label"] = np.zeros(df.shape[0])

bins = [1968,1978,1990,np.inf]

labels = [0,1,2,3,4]

df["label"] = np.where(df["year"] < bins[0], labels[0],
                               np.where(df["year"] < bins[1], labels[1],
                                        np.where(df["year"] < bins[2], labels[2],
                                                 np.where(df["year"] < bins[3], labels[3], labels[4]))))

In [85]:
np.random.seed(515)
idx = np.repeat(range(10),len(df.iloc[:,0])//10+1)
df["idx"] = np.random.choice(idx[range(len(df.iloc[:,0]))],size=len(df.iloc[:,0]))
df_train = df.loc[df["idx"]!=0,:]
df_test = df.loc[df["idx"]==0,:]

In [86]:
from gensim.corpora import Dictionary
dict = Dictionary(df_train["lyrics"])
dict.filter_extremes(no_below=20,no_above=1)

df_train["lyrics"] = [dict.doc2bow(doc) for doc in df_train["lyrics"]]
df_test["lyrics"] = [dict.doc2bow(doc) for doc in df_test["lyrics"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["lyrics"] = [dict.doc2bow(doc) for doc in df_train["lyrics"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["lyrics"] = [dict.doc2bow(doc) for doc in df_test["lyrics"]]


In [88]:
# Compute number of columns from maximum word ID in the training data
num_cols = max(dict.keys())+1
dat_train = lil_matrix((len(df_train), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["lyrics"]]
    values = [value for _, value in row["lyrics"]]
    dat_train[i, indices] = values
df_train[df_train["lyrics"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_train = pd.DataFrame.sparse.from_spmatrix(dat_train)

In [89]:
# Compute number of columns from maximum word ID in the training data
num_cols = max(dict.keys())+1
dat_test = lil_matrix((len(df_test), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["lyrics"] if count < num_cols]
    values = [value for count, value in row["lyrics"] if count < num_cols and value!=0]
    dat_test[i, indices] = values
df_test[df_test["lyrics"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_test = pd.DataFrame.sparse.from_spmatrix(dat_test)

In [91]:
dat_gen = dat_gen.reset_index()
df = df.reset_index(drop=True)
dat_gen_train = dat_gen.loc[df["idx"]!=0,:].reset_index(drop=True)
dat_gen_test = dat_gen.loc[df["idx"]==0,:].reset_index(drop=True)

In [92]:
train_ones = csr_matrix(np.zeros(df_train.shape[0])).transpose()
test_ones = csr_matrix(np.zeros(df_test.shape[0])).transpose()
train_label = csr_matrix(df_train.loc[:,"label"]).transpose()
test_label = csr_matrix(df_test.loc[:,"label"]).transpose()
train_numword = csr_matrix(df_train.loc[:,"numword"]).transpose()
test_numword = csr_matrix(df_test.loc[:,"numword"]).transpose()
train_activeyear = csr_matrix(df_train.loc[:,"active_years"]).transpose()
test_activeyear = csr_matrix(df_test.loc[:,"active_years"]).transpose()

gen_train = csr_matrix(dat_gen_train.loc[:,0:])
lyrics_train = csr_matrix(dat_train.loc[:,0:])
data_train = hstack([train_ones,gen_train, lyrics_train,train_numword,train_activeyear,train_label])
data_train = pd.DataFrame.sparse.from_spmatrix(data_train)

gen_test = csr_matrix(dat_gen_test.loc[:,0:])
lyrics_test = csr_matrix(dat_test.loc[:,0:])
data_test = hstack([test_ones,gen_test, lyrics_test,test_numword,test_activeyear,test_label])
data_test = pd.DataFrame.sparse.from_spmatrix(data_test)

In [93]:
from sklearn.preprocessing import MaxAbsScaler

transformer = MaxAbsScaler()
transformer.fit(data_train.iloc[:,:(data_train.shape[1]-1)])
data_train_scaled = transformer.transform(data_train.iloc[:,:(data_train.shape[1]-1)])
data_test_scaled = transformer.transform(data_test.iloc[:,:(data_train.shape[1]-1)])
data_train_scaled = hstack([data_train_scaled,train_label])
data_test_scaled = hstack([data_test_scaled,test_label])
data_train_scaled = pd.DataFrame.sparse.from_spmatrix(data_train_scaled)
data_test_scaled = pd.DataFrame.sparse.from_spmatrix(data_test_scaled)

In [94]:
word_name = [dict[i] for i in range(max(dict.keys())+1)]
word_name = ['intercept']+gen_des + word_name +['numword']+['active_years']+ ['label']
data_train.columns = word_name
data_test.columns = word_name

In [96]:
from sklearn.linear_model import LogisticRegression

mr = LogisticRegression(penalty='l2',solver="liblinear",max_iter=4000).fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_test.iloc[:,:(data_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))

0.5579399141630901
