In [62]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from collections import defaultdict
import pprint
import re
from gensim import models
from scipy.sparse import lil_matrix, hstack, csr_matrix, vstack
import gensim.downloader as api

# Data Preprocessing
In this section, we preprocess the data and transform raw text data to matrix form. Then, all data is divided into training set and test set. After that, a dictionary is built upon training set.

In [2]:
def specific_preprocess(doc):
    return simple_preprocess(doc,min_len=1)

def remove_specific_words(s):
    s = re.sub(r"\bLyrics\[.+\]"," ",s)
    s = re.sub(r"\[.+\]"," ",s)
    return s

df = pd.read_csv("data/billboard_lyrics_genres.csv")

df["lyrics"] = df["lyrics"].map(remove_specific_words)
df["lyrics"] = df["lyrics"].map(remove_stopwords)
df["lyrics"] = df["lyrics"].map(specific_preprocess)

Then delete the songs that are not English

In [3]:
def isEnglish(w):
    return w.encode("utf-8").isalpha()

def isListEnglish(L):
    return all(map(isEnglish,L))

df["isEnglish"] = df["lyrics"].map(isListEnglish)
df = df.loc[df["isEnglish"],:]

Similarly, perform the same procedure to genre

In [4]:
def remove_pun(s):
    s = re.sub(r"\[\'"," ",s)
    s = re.sub(r"\'\]"," ",s)
    s = re.sub(r"\'"," ",s)
    s = re.sub(r"\[\]"," ",s)
    s = re.sub(r"\,"," ",s)
    s = s.split()
    s = [token.lower() for token in s]
    return s


df["genre"] = df["genre"].map(remove_pun)

In [5]:
freq_gen = defaultdict(int)
for text in df["genre"]:
    for token in text:
        freq_gen[token] += 1

processed_corpus_gen = [[token for token in text if freq_gen[token]>20] for text in df.loc[:,"genre"]]
dict_gen = corpora.Dictionary(processed_corpus_gen)
freq_wanted = {k: v for k,v in freq_gen.items() if v > 100}
pprint.pprint(freq_wanted)

{'alternative': 146,
 'and': 157,
 'country': 416,
 'dance-pop': 155,
 'disco': 147,
 'folk': 141,
 'funk': 170,
 'hard': 102,
 'hip': 432,
 'hop': 374,
 'new': 168,
 'pop': 1381,
 'r&b': 665,
 'rap': 114,
 'rock': 1606,
 'roll': 111,
 'soft': 322,
 'soul': 476,
 'wave': 109}


In this way, we can sort out the genre we want is alternative, country, dance, disco, folk, funk, hip-hop, new wave, pop, r&b, rap, rock, soul (soft stands for soft rock)

In [6]:
gen_des = ["alternative","country","dance","disco","folk","funk","hip","new","pop","r&b","rap","rock","soul"]
gen_des = sorted(gen_des)

# Compute number of columns from maximum word ID in the training data
num_cols = len(gen_des)
dat_gen = lil_matrix((len(df), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    for word in row["genre"]:
        for k in range(len(gen_des)):
            if re.search(gen_des[k],word):
                dat_gen[i,k] = 1
            else:
                dat_gen[i,k] = 0
df[df["genre"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_gen = pd.DataFrame.sparse.from_spmatrix(dat_gen)

Then, we should tag the data for classification.

In [7]:
df["label"] = np.zeros(df.shape[0])

bins = [1970,1980,1990,2000,2010,np.inf]

labels = [0,1,2,3,4,5]

df["label"] = np.where(df["year"] < bins[0], labels[0],
                               np.where(df["year"] < bins[1], labels[1],
                                        np.where(df["year"] < bins[2], labels[2],
                                                 np.where(df["year"] < bins[3], labels[3],
                                                          np.where(df["year"] < bins[4], labels[4], labels[5])))))

Then, data is split to training set and test set.

In [8]:
np.random.seed(515)
idx = np.repeat(range(10),len(df.iloc[:,0])//10+1)
df["idx"] = np.random.choice(idx[range(len(df.iloc[:,0]))],size=len(df.iloc[:,0]))
df_train = df.loc[df["idx"]!=0,:]
df_test = df.loc[df["idx"]==0,:]

Build a dictionary based on training set.

In [9]:
freq = defaultdict(int)
for text in df_train["lyrics"]:
    for token in text:
        freq[token] += 1

processed_corpus = [[token for token in text if freq[token]>20] for text in df_train.loc[:,"lyrics"]]
dictionary = corpora.Dictionary(processed_corpus)
df_train["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]


In [10]:
# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_train = lil_matrix((len(df_train), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["freq_count"]]
    values = [value for _, value in row["freq_count"]]
    dat_train[i, indices] = values
df_train[df_train["freq_count"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_train = pd.DataFrame.sparse.from_spmatrix(dat_train)

Then, perform the same procedure to test set with the dictionary.

In [11]:
df_test = df.loc[df["idx"]==0,:]
processed_corpus = [[token for token in text if freq[token]>20] for text in df_test.loc[:,"lyrics"]]
df_test["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]

# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_test = lil_matrix((len(df_test), num_cols), dtype=np.int64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["freq_count"] if count < num_cols]
    values = [value for count, value in row["freq_count"] if count < num_cols and value!=0]
    dat_test[i, indices] = values
df_test[df_test["freq_count"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_test = pd.DataFrame.sparse.from_spmatrix(dat_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["freq_count"] = [dictionary.doc2bow(text) for text in processed_corpus]


# Perform TF-IDF

In [12]:
bow_corpus = list(df_train["freq_count"])
tfidf = models.TfidfModel(bow_corpus)
df_train["tfidf"]=tfidf[df_train["lyrics"].map(dictionary.doc2bow)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["tfidf"]=tfidf[df_train["lyrics"].map(dictionary.doc2bow)]


In [13]:
# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_tfidf_train = lil_matrix((len(df_train), num_cols), dtype=np.float64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["tfidf"]]
    values = [value for _, value in row["tfidf"]]
    dat_tfidf_train[i, indices] = values
df_train[df_train["tfidf"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_tfidf_train = pd.DataFrame.sparse.from_spmatrix(dat_tfidf_train)

In [14]:
df_test["tfidf"]=tfidf[df_test["lyrics"].map(dictionary.doc2bow)]

# Compute number of columns from maximum word ID in the training data
num_cols = max(dictionary.keys())+1
dat_tfidf_test = lil_matrix((len(df_test), num_cols), dtype=np.float64)

# Fill in values using apply() and enumerate()
def set_row_func(i, row):
    indices = [count for count, word_id in row["tfidf"] if count < num_cols]
    values = [value for count, value in row["tfidf"] if count < num_cols and value != 0]
    dat_tfidf_test[i, indices] = values
df_test[df_test["tfidf"].map(len) > 0].reset_index(drop=True).reset_index().apply(lambda row: set_row_func(row["index"], row), axis=1)

# Convert to pandas DataFrame
dat_tfidf_test = pd.DataFrame.sparse.from_spmatrix(dat_tfidf_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["tfidf"]=tfidf[df_test["lyrics"].map(dictionary.doc2bow)]


# Processed Data
The data processed are diveded into the blow categories:

Original word frequency + genre

TF-IDF word frequency + genre


In [15]:
dat_gen = dat_gen.reset_index()
df = df.reset_index(drop=True)
dat_gen_train = dat_gen.loc[df["idx"]!=0,:].reset_index(drop=True)
dat_gen_test = dat_gen.loc[df["idx"]==0,:].reset_index(drop=True)

In [16]:
train_label = csr_matrix(df_train.loc[:,"label"]).transpose()
test_label = csr_matrix(df_test.loc[:,"label"]).transpose()

gen_train = csr_matrix(dat_gen_train.loc[:,0:])
lyrics_train = csr_matrix(dat_train.loc[:,0:])
data_train = hstack([gen_train, lyrics_train,train_label])
data_train = pd.DataFrame.sparse.from_spmatrix(data_train)

gen_test = csr_matrix(dat_gen_test.loc[:,0:])
lyrics_test = csr_matrix(dat_test.loc[:,0:])
data_test = hstack([gen_test, lyrics_test,test_label])
data_test = pd.DataFrame.sparse.from_spmatrix(data_test)


lyrics_tfidf_train = csr_matrix(dat_tfidf_train.loc[:,0:])
data_tfidf_train = hstack([gen_train,lyrics_tfidf_train,train_label])
data_tfidf_train = pd.DataFrame.sparse.from_spmatrix(data_tfidf_train)

lyrics_tfidf_test = csr_matrix(dat_tfidf_test.loc[:,0:])
data_tfidf_test = hstack([gen_test,lyrics_tfidf_test,test_label])
data_tfidf_test = pd.DataFrame.sparse.from_spmatrix(data_tfidf_test)

In [17]:
word_name = [dictionary[i] for i in range(max(dictionary.keys())+1)]
word_name = gen_des + word_name + ['label']
data_tfidf_test.columns = word_name
data_tfidf_train.columns = word_name
data_train.columns = word_name
data_test.columns = word_name

In [18]:
# data_tfidf_train.to_csv("data/train_tfidf_data.csv")
# data_tfidf_test.to_csv("data/test_tfidf_data.csv")
# data_train = hstack([lyrics_train,train_label])
# data_train = pd.DataFrame.sparse.from_spmatrix(data_train)
# data_test = hstack([lyrics_test,test_label])
# data_test = pd.DataFrame.sparse.from_spmatrix(data_test)
# word_name = [dictionary[i] for i in range(max(dictionary.keys())+1)]
# word_name = word_name+['label']
# data_train.columns = word_name
# data_test.columns = word_name

# data_train.to_csv("data/train_data_all.csv")
# data_test.to_csv("data/test_data_all.csv")

# Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.4721030042918455
2.9527896995708156


Direct Use of Multinomial Logistic Regression's Performance is very bad.

Hence, we consider here multi logistic scenario.

In [60]:
# train models
label_60 = df_train["label"]==0
mr60 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_60)
label_70 = df_train["label"]==1
mr70 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_70)
label_80 = df_train["label"]==2
mr80 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_80)
label_90 = df_train["label"]==3
mr90 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_90)
label_00 = df_train["label"]==4
mr00 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_00)
label_10 = df_train["label"]==5
mr10 = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_10)

def predict_multi_logit(df):
    prob60 = mr60.predict_log_proba(df)[:,1]
    prob70 = mr70.predict_log_proba(df)[:,1]
    prob80 = mr80.predict_log_proba(df)[:,1]
    prob90 = mr90.predict_log_proba(df)[:,1]
    prob00 = mr00.predict_log_proba(df)[:,1]
    prob10 = mr10.predict_log_proba(df)[:,1]
    prob = pd.DataFrame([prob60,prob70,prob80,prob90,prob00,prob10])
    return prob.apply(np.argmax,axis=0)

In [21]:
pred = predict_multi_logit(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

In [22]:
print(sum(np.array(pred) == np.array(df_test["label"]))/len(pred))

0.4721030042918455


In [23]:
label_60 = df_train["label"]==0
label_60_test = df_test["label"]==0

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_60)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_60_test)/len(pred))

0.8390557939914163


In [24]:
label_70 = df_train["label"]==1
label_70_test = df_test["label"]==1

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_70)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_70_test)/len(pred))

0.8369098712446352


In [25]:
label_80 = df_train["label"]==2
label_80_test = df_test["label"]==2

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_80)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_80_test)/len(pred))

0.869098712446352


In [26]:
label_90 = df_train["label"]==3
label_90_test = df_test["label"]==3

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_90)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_90_test)/len(pred))

0.8347639484978541


In [27]:
label_00 = df_train["label"]==4
label_00_test = df_test["label"]==4

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_00)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_00_test)/len(pred))

0.869098712446352


In [28]:
label_10 = df_train["label"]==5
label_10_test = df_test["label"]==5

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],label_10)
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == label_10_test)/len(pred))

0.8583690987124464


In [29]:
label_10 = df_train["label"]==5
label_10_test = df_test["label"]==5

mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:len(gen_des)],label_10)
pred = mr.predict(data_tfidf_test.iloc[:,:len(gen_des)])

print(sum(pred == label_10_test)/len(pred))

0.8283261802575107


In [30]:
mr = LogisticRegression(penalty='l1',solver='liblinear').fit(data_tfidf_train.iloc[:,:(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.44206008583690987
3.0622317596566524


In [31]:
mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,:len(gen_des)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,:len(gen_des)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.3540772532188841
4.954935622317596


In [32]:
mr = LogisticRegression(penalty='l2',dual=True,solver="liblinear").fit(data_tfidf_train.iloc[:,len(gen_des):(data_tfidf_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_tfidf_test.iloc[:,len(gen_des):(data_tfidf_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.4356223175965665
2.875536480686695


In [33]:
mr = LogisticRegression(penalty='l1',solver="liblinear").fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
pred = mr.predict(data_test.iloc[:,:(data_train.shape[1]-1)])

print(sum(pred == df_test["label"])/len(pred))
print(np.mean((pred-df_test["label"])**2))

0.39914163090128757
2.8648068669527897


In [34]:
# from sklearn.svm import SVC 
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler

# clf = make_pipeline(StandardScaler(with_mean=False),SVC(gamma='auto'))
# clf.fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
# pred = clf.predict(data_test.iloc[:,:(data_train.shape[1]-1)])

# print(sum(pred == df_test["label"])/len(pred))
# print(np.mean((pred-df_test["label"])**2))

In [35]:
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.gaussian_process.kernels import RBF
# krr = KernelRidge(kernel="rbf")
# krr.fit(data_train.iloc[:,:(data_train.shape[1]-1)],np.array(df_train["label"]))
# pred = krr.predict(data_test.iloc[:,:(data_test.shape[1]-1)])

# print(sum(pred == df_test["label"])/len(pred))
# print(np.mean((pred-df_test["label"])**2))

0.0
1.9715226190092192


In [65]:
from sklearn.model_selection import GridSearchCV

np.random.seed(3701)
idx60 = []
label_60 = df_train["label"]==0
num_60 = sum(label_60==0)
label_60 = list(label_60)
for i in range(len(label_60)):
    if label_60[i]:
        idx60.append(i)
boot60 = np.random.choice(idx60,num_60,replace=True)
dat_train_sp = csr_matrix(dat_train.iloc[:,:dat_train.shape[1]])
dat_train_60 = csr_matrix(dat_train.loc[idx60,:dat_train.shape[1]])
dat_train_60 = vstack([dat_train_sp,dat_train_60])
test_label_60 = np.vstack(np.array(df_test["label"]),np.array(df_test.loc[idx60,"label"]))

KeyError: '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 106, 109, 110, 111, 112, 113, 114, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 256, 257, 258, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 354, 355, 356, 357, 358, 359, 360, 361, 363, 364, 365, 366, 367, 368, 369, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 390, 391, 392, 393, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 414, 415, 416, 417, 418, 419, 420, 423, 424, 425, 426, 427, 428, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 459, 460, 461, 462, 463, 464, 466, 467, 468, 469, 470, 471, 472, 473, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 487, 488, 489, 490, 491, 493, 494, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 520, 521, 522, 523, 525, 526, 527, 528, 529, 531, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 548, 550, 551, 552, 553, 554, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 579, 580, 581, 583, 584, 585, 586, 587, 588, 589, 590, 592, 593, 594, 595, 596, 597, 599, 600, 601, 602, 604, 605, 606, 607, 608, 609, 610, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 639, 640, 641, 642, 643, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 678, 679, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 756, 757, 759, 760, 761, 762, 763, 764, 765, 766, 767, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 822, 823, 824, 825, 826, 828, 829, 830, 831, 832, 834, 835, 836, 837, 839, 840, 841, 842, 843] not in index'

In [72]:
train_label_60 = pd.concat([df_train.reset_index(drop=True)["label"],df_train.loc.reset_index(drop=True)[idx60,"label"]],axis=1,ignore_index=True)

AttributeError: '_LocIndexer' object has no attribute 'reset_index'

In [57]:
lr_60 = LogisticRegression(penalty="l2",solver="liblinear")
lr_60.fit()

1
