In [14]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from collections import defaultdict
import pprint
import re
from gensim import models
from scipy.sparse import lil_matrix, hstack, csr_matrix, vstack
import gensim.downloader as api

# Data Preprocessing
In this section, we preprocess the data and transform raw text data to matrix form. Then, all data is divided into training set and test set. After that, a dictionary is built upon training set.

In [15]:
def specific_preprocess(doc):
    return simple_preprocess(doc,min_len=1)

def remove_specific_words(s):
    s = re.sub(r"\bLyrics\[.+\]"," ",s)
    s = re.sub(r"\[.+\]"," ",s)
    return s

df = pd.read_csv("data/billboard_lyrics_genres.csv")

df["lyrics"] = df["lyrics"].map(remove_specific_words)
df["lyrics"] = df["lyrics"].map(remove_stopwords)
df["lyrics"] = df["lyrics"].map(specific_preprocess)

Then delete the songs that are not English

In [16]:
def isEnglish(w):
    return w.encode("utf-8").isalpha()

def isListEnglish(L):
    return all(map(isEnglish,L))

df["isEnglish"] = df["lyrics"].map(isListEnglish)
df = df.loc[df["isEnglish"],:]

Similarly, perform the same procedure to genre

In [17]:
def remove_pun(s):
    s = re.sub(r"\[\'"," ",s)
    s = re.sub(r"\'\]"," ",s)
    s = re.sub(r"\'"," ",s)
    s = re.sub(r"\[\]"," ",s)
    s = re.sub(r"\,"," ",s)
    s = s.split()
    s = [token.lower() for token in s]
    return s


df["genre"] = df["genre"].map(remove_pun)

In [18]:
freq_gen = defaultdict(int)
for text in df["genre"]:
    for token in text:
        freq_gen[token] += 1

processed_corpus_gen = [[token for token in text if freq_gen[token]>20] for text in df.loc[:,"genre"]]
dict_gen = corpora.Dictionary(processed_corpus_gen)
freq_wanted = {k: v for k,v in freq_gen.items() if v > 100}
pprint.pprint(freq_wanted)

{'alternative': 146,
 'and': 157,
 'country': 416,
 'dance-pop': 155,
 'disco': 147,
 'folk': 141,
 'funk': 170,
 'hard': 102,
 'hip': 432,
 'hop': 374,
 'new': 168,
 'pop': 1381,
 'r&b': 665,
 'rap': 114,
 'rock': 1606,
 'roll': 111,
 'soft': 322,
 'soul': 476,
 'wave': 109}


In this way, we can sort out the genre we want is alternative, country, dance, disco, folk, funk, hip-hop, new wave, pop, r&b, rap, rock, soul (soft stands for soft rock)

In [19]:
gen_des = ["alternative","country","dance","disco","folk","funk","hip","new","pop","r&b","rap","rock","soul"]
gen_des = sorted(gen_des)

# Compute number of columns from maximum word ID in the training data
num_cols = len(gen_des)
dat_gen = lil_matrix((len(df), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    for word in row["genre"]:
        for k in range(len(gen_des)):
            if re.search(gen_des[k],word):
                dat_gen[i,k] = 1
df[df["genre"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_gen = pd.DataFrame.sparse.from_spmatrix(dat_gen)

Then, we should tag the data for classification.

In [20]:
df["label"] = np.zeros(df.shape[0])

bins = [1970,1980,1990,2000,2010,np.inf]

labels = [0,1,2,3,4,5]

df["label"] = np.where(df["year"] < bins[0], labels[0],
                               np.where(df["year"] < bins[1], labels[1],
                                        np.where(df["year"] < bins[2], labels[2],
                                                 np.where(df["year"] < bins[3], labels[3],
                                                          np.where(df["year"] < bins[4], labels[4], labels[5])))))

Then, data is split to training set and test set.

In [21]:
np.random.seed(515)
idx = np.repeat(range(10),len(df.iloc[:,0])//10+1)
df["idx"] = np.random.choice(idx[range(len(df.iloc[:,0]))],size=len(df.iloc[:,0]))
df_train = df.loc[df["idx"]!=0,:]
df_test = df.loc[df["idx"]==0,:]

Build a dictionary based on training set.

In [22]:
freq = defaultdict(int)
for text in df_train["lyrics"]:
    for token in text:
        freq[token] += 1

processed_corpus = [[token for token in text if freq[token]>20] for text in df_train.loc[:,"lyrics"]]
dictionary = corpora.Dictionary(processed_corpus)
df_train["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]


In [23]:
# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_train = lil_matrix((len(df_train), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["freq_count"]]
    values = [value for _, value in row["freq_count"]]
    dat_train[i, indices] = values
df_train[df_train["freq_count"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_train = pd.DataFrame.sparse.from_spmatrix(dat_train)

Then, perform the same procedure to test set with the dictionary.

In [24]:
df_test = df.loc[df["idx"]==0,:]
processed_corpus = [[token for token in text if freq[token]>20] for text in df_test.loc[:,"lyrics"]]
df_test["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]

# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_test = lil_matrix((len(df_test), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["freq_count"] if count < num_cols]
    values = [value for count, value in row["freq_count"] if count < num_cols and value!=0]
    dat_test[i, indices] = values
df_test[df_test["freq_count"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_test = pd.DataFrame.sparse.from_spmatrix(dat_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]


# Perform TF-IDF

In [25]:
bow_corpus = list(df_train["freq_count"])
tfidf = models.TfidfModel(bow_corpus)
df_train["tfidf"]=tfidf[df_train["lyrics"].map(dictionary.doc2bow)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["tfidf"]=tfidf[df_train["lyrics"].map(dictionary.doc2bow)]


In [26]:
# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_tfidf_train = lil_matrix((len(df_train), num_cols), dtype=np.float64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["tfidf"]]
    values = [value for _, value in row["tfidf"]]
    dat_tfidf_train[i, indices] = values
df_train[df_train["tfidf"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_tfidf_train = pd.DataFrame.sparse.from_spmatrix(dat_tfidf_train)

In [27]:
df_test["tfidf"]=tfidf[df_test["lyrics"].map(dictionary.doc2bow)]

# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_tfidf_test = lil_matrix((len(df_test), num_cols), dtype=np.float64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["tfidf"] if count < num_cols]
    values = [value for count, value in row["tfidf"] if count < num_cols and value != 0]
    dat_tfidf_test[i, indices] = values
df_test[df_test["tfidf"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_tfidf_test = pd.DataFrame.sparse.from_spmatrix(dat_tfidf_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["tfidf"]=tfidf[df_test["lyrics"].map(dictionary.doc2bow)]


# Processed Data
The data processed are diveded into the blow categories:

Original word frequency + genre

TF-IDF word frequency + genre


In [28]:
dat_gen = dat_gen.reset_index()
df = df.reset_index(drop=True)
dat_gen_train = dat_gen.loc[df["idx"]!=0,:].reset_index(drop=True)
dat_gen_test = dat_gen.loc[df["idx"]==0,:].reset_index(drop=True)

In [29]:
train_label = csr_matrix(df_train.loc[:,"label"]).transpose()
test_label = csr_matrix(df_test.loc[:,"label"]).transpose()

gen_train = csr_matrix(dat_gen_train.loc[:,0:])
lyrics_train = csr_matrix(dat_train.loc[:,0:])
data_train = hstack([gen_train, lyrics_train,train_label])
data_train = pd.DataFrame.sparse.from_spmatrix(data_train)

gen_test = csr_matrix(dat_gen_test.loc[:,0:])
lyrics_test = csr_matrix(dat_test.loc[:,0:])
data_test = hstack([gen_test, lyrics_test,test_label])
data_test = pd.DataFrame.sparse.from_spmatrix(data_test)


lyrics_tfidf_train = csr_matrix(dat_tfidf_train.loc[:,0:])
data_tfidf_train = hstack([gen_train,lyrics_tfidf_train,train_label])
data_tfidf_train = pd.DataFrame.sparse.from_spmatrix(data_tfidf_train)

lyrics_tfidf_test = csr_matrix(dat_tfidf_test.loc[:,0:])
data_tfidf_test = hstack([gen_test,lyrics_tfidf_test,test_label])
data_tfidf_test = pd.DataFrame.sparse.from_spmatrix(data_tfidf_test)

In [30]:
word_name = [dictionary[i] for i in range(max(dictionary.keys())+1)]
word_name = gen_des + word_name + ['label']
data_tfidf_test.columns = word_name
data_tfidf_train.columns = word_name
data_train.columns = word_name
data_test.columns = word_name

In [31]:
data_tfidf_train.to_csv("data/train_tfidf_data.csv")
data_tfidf_test.to_csv("data/test_tfidf_data.csv")
data_train = hstack([lyrics_train,train_label])
data_train = pd.DataFrame.sparse.from_spmatrix(data_train)
data_test = hstack([lyrics_test,test_label])
data_test = pd.DataFrame.sparse.from_spmatrix(data_test)
word_name = [dictionary[i] for i in range(max(dictionary.keys())+1)]
word_name = word_name+['label']
data_train.columns = word_name
data_test.columns = word_name

data_train.to_csv("data/train_data_all.csv")
data_test.to_csv("data/test_data_all.csv")
df_train.to_csv("data/train_other.csv")

  data_tfidf_train.to_csv("data/train_tfidf_data.csv")
  data_tfidf_test.to_csv("data/test_tfidf_data.csv")
  data_train.to_csv("data/train_data_all.csv")
  data_test.to_csv("data/test_data_all.csv")


# Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.4721030042918455
2.9527896995708156


Direct Use of Multinomial Logistic Regression's Performance is very bad.

Hence, we consider here multi logistic scenario.

In [20]:
# train models
label_60 = df_train["label"]==0
mr60 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_60)
label_70 = df_train["label"]==1
mr70 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_70)
label_80 = df_train["label"]==2
mr80 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_80)
label_90 = df_train["label"]==3
mr90 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_90)
label_00 = df_train["label"]==4
mr00 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_00)
label_10 = df_train["label"]==5
mr10 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_10)

def predict_multi_logit(df):
    prob60 = mr60.predict_log_proba(df)[:,1]
    prob70 = mr70.predict_log_proba(df)[:,1]
    prob80 = mr80.predict_log_proba(df)[:,1]
    prob90 = mr90.predict_log_proba(df)[:,1]
    prob00 = mr00.predict_log_proba(df)[:,1]
    prob10 = mr10.predict_log_proba(df)[:,1]
    prob = pd.DataFrame([prob60,prob70,prob80,prob90,prob00,prob10])
    return prob.apply(np.argmax,axis=0)

In [21]:
pred = predict_multi_logit(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

In [22]:
print(sum(np.array(pred) == np.array(df_test["label"]))/len(pred))

0.4721030042918455


In [23]:
label_60 = df_train["label"]==0
label_60_test = df_test["label"]==0

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_60)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_60_test)/len(pred))

0.8390557939914163


In [24]:
label_70 = df_train["label"]==1
label_70_test = df_test["label"]==1

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_70)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_70_test)/len(pred))

0.8369098712446352


In [25]:
label_80 = df_train["label"]==2
label_80_test = df_test["label"]==2

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_80)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_80_test)/len(pred))

0.869098712446352


In [26]:
label_90 = df_train["label"]==3
label_90_test = df_test["label"]==3

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_90)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_90_test)/len(pred))

0.8347639484978541


In [27]:
label_00 = df_train["label"]==4
label_00_test = df_test["label"]==4

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_00)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_00_test)/len(pred))

0.869098712446352


In [28]:
label_10 = df_train["label"]==5
label_10_test = df_test["label"]==5

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_10)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_10_test)/len(pred))

0.8583690987124464


In [29]:
label_10 = df_train["label"]==5
label_10_test = df_test["label"]==5

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:len(gen_des)],label_10)
pred = mr.predict(data_tfidf_test.iloc[:,:len(gen_des)])

print(sum(pred == label_10_test)/len(pred))

0.8283261802575107


In [30]:
mr = LogisticRegression(penalty='l1',solver='liblinear').fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.44206008583690987
3.0622317596566524


In [31]:
mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:len(gen_des)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:len(gen_des)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.3540772532188841
4.954935622317596


In [32]:
mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,len(gen_des):(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,len(gen_des):(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.4356223175965665
2.875536480686695


In [33]:
mr = LogisticRegression(penalty='l1',solver="liblinear").fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_test.iloc[:,:(data_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.39914163090128757
2.8648068669527897


In [34]:
# from sklearn.svm import SVC 
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler

# clf = make_pipeline(StandardScaler(with_mean=False),SVC(gamma='auto'))
# clf.fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
# pred = clf.predict(data_test.iloc[:,:(data_train.shape[1]-1)])

# print(sum(pred == df_test["label"])/len(pred))
# print(np.mean((pred-df_test["label"])**2))

In [35]:
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.gaussian_process.kernels import RBF
# krr = KernelRidge(kernel="rbf")
# krr.fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
# pred = krr.predict(data_test.iloc[:,:(data_test.shape[1]-1)])

# print(sum(pred == df_test["label"])/len(pred))
# print(np.mean((pred-df_test["label"])**2))

In [1]:
# from sklearn.model_selection import GridSearchCV

# np.random.seed(3701)
# idx60 = []
# label_60 = df_train["label"]==0
# num_60 = sum(label_60==0)
# label_60 = list(label_60)
# for i in range(len(label_60)):
#     if label_60[i]:
#         idx60.append(i)
# boot60 = np.random.choice(idx60,num_60,replace=True)
# dat_train_sp = csr_matrix(dat_train.iloc[:,:dat_train.shape[1]])
# dat_train_60 = csr_matrix(dat_train.loc[idx60,:dat_train.shape[1]])
# dat_train_60 = vstack([dat_train_sp,dat_train_60])
# test_label_60 = np.vstack(np.array(df_test["label"]),np.array(df_test.loc[idx60,"label"]))

In [38]:
df_train["label"] = np.zeros(df_train.shape[0])
bins = [1990,np.inf]
labels = [0,1]
df_train["label"] = np.where(df_train["year"] < bins[0], labels[0],labels[1])

df_test["label"] = np.zeros(df_test.shape[0])
df_test["label"] = np.where(df_test["year"] < bins[0], labels[0],labels[1])

In [39]:
mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:len(gen_des)],df_train["label"])
pred = mr.predict(data_tfidf_test.iloc[:,:len(gen_des)])

print(sum(pred == df_test["label"])/len(pred))

0.6738197424892703


In [57]:
from sklearn.model_selection import GridSearchCV

parameters = {'C':1/ np.log(np.linspace(np.exp(1e-3),np.exp(1.5),num=100,dtype=np.float64))}
logclf = LogisticRegression(penalty="l2",dual=True,solver="liblinear")
clf = GridSearchCV(logclf,param_grid=parameters,
                   scoring='f1',n_jobs=4)
clf.fit(data_tfidf_train.iloc[:,:(data_train.shape[1]-1)],df_train["label"])

In [68]:
pprint.pprint(clf.cv_results_)

{'mean_fit_time': array([1.41180372, 1.2727303 , 1.22065654, 1.2714325 , 1.52368474,
       1.51715236, 1.40956945, 1.49648304, 1.52737727, 1.57335234,
       1.53335724, 1.57915368, 1.546942  , 1.76027193, 1.56390519,
       1.60132275, 1.70574627, 1.46470299, 1.60676522, 1.40437431,
       1.4265183 , 1.38369999, 1.41751981, 1.39101629, 1.49465404,
       1.48332739, 1.45793247, 1.3847281 , 1.53913317, 1.46287117,
       1.45350394, 1.49493423, 1.37179093, 1.56450915, 1.50234694,
       1.48528709, 1.75690951, 1.62265148, 1.50505385, 1.58115115,
       1.46761723, 1.55904469, 1.59301558, 1.59314494, 1.63218269,
       1.47640834, 1.55742393, 1.50661755, 1.64142399, 1.4678205 ,
       1.51535621, 1.55767283, 1.44833984, 1.52841578, 1.4756609 ,
       1.40899334, 1.53922544, 1.55364232, 1.60864391, 1.57363257,
       1.65725241, 1.52789621, 1.48743582, 1.65867229, 1.48501925,
       1.5037323 , 1.4991365 , 1.47330127, 1.54285359, 1.4818615 ,
       1.53757129, 1.57600589, 1.67857423, 1

In [75]:
logclf = LogisticRegression(C=3.05,dual=True,penalty="l2",solver="liblinear")
logclf.fit(np.array(data_tfidf_train.iloc[:,:(data_train.shape[1]-1)]),df_train["label"])
pred = logclf.predict(np.array(data_tfidf_test.iloc[:,:(data_test.shape[1]-1)]))

print(sum(pred == df_test["label"])/len(pred))
print("PPV",sum(pred[df_test["label"]==1]==1)/len(pred[df_test["label"]==1]))
print("PRECISION",sum(df_test.loc[pred==1,"label"]==1)/len(df_test.loc[pred==1,"label"]))

0.7875536480686696
PPV 0.7489177489177489
PRECISION 0.8084112149532711


# Changing to generation
Frequency may have some concerns

In [82]:
logclf = LogisticRegression(dual=True,penalty="l2",solver="liblinear")
logclf.fit(data_tfidf_train.iloc[:,len(gen_des):(data_train.shape[1]-1)],data_train.iloc[:,1])
pred = logclf.predict(data_tfidf_test.iloc[:,len(gen_des):(data_test.shape[1]-1)])

print(sum(pred == data_test.iloc[:,1])/len(pred))
print("PPV",sum(pred[data_test.iloc[:,1]==1]==1)/len(pred[data_test.iloc[:,1]==1]))
print("PRECISION",sum(data_test.loc[pred==1,1]==1)/len(data_test.loc[pred==1,1]))

0.9399141630901288
PPV 0.0


KeyError: 1

In [78]:
pprint.pprint(data_tfidf_train.iloc[:,0])

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
4650    0.0
4651    0.0
4652    0.0
4653    0.0
4654    0.0
Name: alternative, Length: 4655, dtype: Sparse[float64, 0]


In [89]:
pprint.pprint(np.sum(data_train.iloc[:,0:13],axis=0))

alternative      1
country        200
dance          175
disco          128
folk            49
funk           120
hip              2
new              0
pop            983
r&b            529
rap            157
rock           948
soul           374
dtype: int64


In [74]:
pprint.pprint(logclf.coef_)

array([[1.4067885 , 0.07769131, 0.81636606, ..., 0.40091202, 0.46572522,
        0.5508126 ]])


In [65]:
data_tfidf_train.columns

Index(['alternative', 'country', 'dance', 'disco', 'folk', 'funk', 'hip',
       'new', 'pop', 'r&b',
       ...
       'hitta', 'hittas', 'rari', 'panda', 'offset', 'swalla', 'thun', 'cardi',
       'roxanne', 'label'],
      dtype='object', length=3887)