## Modeling 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2

In [4]:
lyrics = pd.read_csv('../data/clean_lyrics.csv')

In [5]:
lyrics.head()

Unnamed: 0.1,Unnamed: 0,artist,lyrics
0,0,T.I.,Skooly & (We got London On Da Track) Woah Y...
1,1,T.I.,1 Ayy in my apartment a long time ago I knew...
2,2,T.I.,All I *Explative* bad *Explative*es I don't ...
3,3,T.I.,1 Now I don’t really care what you call me J...
4,4,T.I.,(over ) Miya hee miya ho miya hu miya ha-ha M...


In [9]:
lyrics['artist'].value_counts(normalize=True)

DMX                0.006309
6LACK              0.006309
2Pac               0.006309
Rich Homie Quan    0.006309
Beastie Boys       0.006309
                     ...   
Pretty Ricky       0.006309
Kent Jones         0.006309
Offset             0.006309
Travis Scott       0.006309
Ray Dalton         0.003155
Name: artist, Length: 159, dtype: float64

In [198]:
X = lyrics['lyrics']
y = lyrics['artist']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,stratify=y)

### Random Forest

In [199]:
pipe_forest = Pipeline([
    ('cv', CountVectorizer()),
    ('forest', RandomForestClassifier())
])

In [200]:
pipe_forest.fit(X_train,y_train)

print(pipe_forest.score(X_train,y_train))

print(pipe_forest.score(X_test,y_test))

1.0
0.34760705289672544


In [201]:
param_grid = {
 'forest__max_depth': [30, 60],
 'forest__max_features': ['auto', 'sqrt'],
 'forest__min_samples_leaf': [1, 4],
 'forest__n_estimators': [200, 800, 1800]
    }

In [203]:
#grid_search = GridSearchCV(pipe, param_grid, n_jobs=-1)

#grid_search.fit(X_train,y_train)

In [205]:
pipe_forest = Pipeline([
    ('cv', CountVectorizer()),
    ('forest', RandomForestClassifier(max_depth=60,max_features='auto',min_samples_leaf=1,n_estimators=2000))
])

In [206]:
pipe_forest.fit(X_train,y_train)

print(pipe_forest.score(X_train,y_train))

print(pipe_forest.score(X_test,y_test))

1.0
0.3501259445843829


In [207]:
pipe_forest.named_steps['forest'].feature_importances_

array([9.44952568e-06, 4.02572855e-06, 3.54450491e-06, ...,
       1.68168644e-05, 1.45182168e-06, 1.60587372e-05])

In [208]:
importances_df = pd.DataFrame({
    'words': pipe_forest.named_steps['cv'].get_feature_names(),
    'imps': pipe_forest.named_steps['forest'].feature_importances_
})

importances_df.sort_values(by='imps', ascending=False).head(30)

Unnamed: 0,words,imps
1960,beyoncé,0.004247
5192,dexter,0.003603
9969,jxmmi,0.003561
18262,teo,0.003468
20782,zoowap,0.003338
738,aight,0.003261
17948,swisha,0.003067
6772,fergie,0.003016
6477,explative,0.002924
18973,trippy,0.002805


In [209]:
test_preds = pipe.predict(X_test)

In [210]:
X_test

340       & Lil Phat I-N-D-E-P-E-N-D-E-N-T Do you know...
55        The heat is on the heat is on yeah The heat ...
428      It's just me and my *Explative*s And my famil...
765      Y'all tryna make a next mother*Explative*in' ...
467      1 It is I the B-I-G the B-O-I Me oh my ears t...
                              ...                        
63        Woo! Welcome to the bank Where you deposit Y...
1130     New car very noisy Come through and it's roar...
863      Aye aye Aye aye aye Woo woo woo woo   1 Well ...
465       I keep it playa while some choose to play it...
1427     Woah yeah  So I'm skrrting off the scene in a...
Name: lyrics, Length: 397, dtype: object

In [212]:
pd.DataFrame(zip(y_test, test_preds), columns= ['True','Predicted']).head(20)

Unnamed: 0,True,Predicted
0,Webbie,Birdman
1,Lyfe Jennings,Shop Boyz
2,Kid Cudi,Machine Gun Kelly
3,Bobby Shmurda,Bobby Shmurda
4,Big Boi,MF DOOM
5,50 Cent,Eminem
6,Plies,Rich The Kid
7,Lil Nas X,Kanye West
8,NF,NF
9,BROCKHAMPTON,BROCKHAMPTON


In [215]:
pipe_forest_tdif = Pipeline([
    ('tdif', TfidfVectorizer()),
    ('forest', RandomForestClassifier(max_depth=60,max_features='auto',min_samples_leaf=1,n_estimators=2000))
])

In [216]:
pipe_forest_tdif.fit(X_train,y_train)

print(pipe_forest_tdif.score(X_train,y_train))

print(pipe_forest_tdif.score(X_test,y_test))

1.0
0.3501259445843829


### KNN

In [41]:
pipe_knn = Pipeline([
    ('cv', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [42]:
pipe_knn.fit(X_train,y_train)

print(pipe_knn.score(X_train,y_train))

print(pipe_knn.score(X_test,y_test))

0.3409090909090909
0.07556675062972293


### Neural Nets

In [133]:
y_train

1193               Offset
700     Pharrell Williams
1430             Lil Tjay
1549                JAY-Z
1401         Moneybagg Yo
              ...        
669               Juicy J
942      Shelley FKA DRAM
200               50 Cent
261                 Plies
178            Trey Songz
Name: artist, Length: 1188, dtype: object

In [112]:
cv = CountVectorizer()

cv.fit(lyrics['lyrics'])

CountVectorizer()

In [115]:
lyrics_transformed = cv.fit_transform(lyrics['lyrics'],)
artist_transformed = cv.fit_transform(lyrics['artist'],)

In [117]:
words = pd.DataFrame(lyricstransformed.toarray(),columns=cv.get_feature_names())

In [125]:
X = words
y = lyrics['artist']

In [140]:
le = LabelEncoder()

X = words
y_le = le.fit_transform(lyrics['artist'])
y = to_categorical(y_le)

In [141]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [142]:
y.shape

(1585, 159)

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=42)

In [145]:
sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.fit_transform(X_test)

In [184]:
model = Sequential()
model.add(Dense(X_train_sc.shape[0], activation='relu',kernel_regularizer=l2(0.004)))
model.add(Dense(256, activation='relu',kernel_regularizer=l2(0.004)))
model.add(Dense(159, activation='softmax',kernel_regularizer=l2(0.004)))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

history = model.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=10,
    batch_size=None,
    verbose=2
)

Epoch 1/10
38/38 - 9s - loss: 12.3525 - acc: 0.0513 - val_loss: 10.8073 - val_acc: 0.1914
Epoch 2/10
38/38 - 7s - loss: 6.8029 - acc: 0.9689 - val_loss: 9.5941 - val_acc: 0.2746
Epoch 3/10
38/38 - 7s - loss: 5.0244 - acc: 0.9992 - val_loss: 8.2466 - val_acc: 0.2796
Epoch 4/10
38/38 - 7s - loss: 3.7772 - acc: 0.9992 - val_loss: 7.3079 - val_acc: 0.2947
Epoch 5/10
38/38 - 7s - loss: 2.8901 - acc: 1.0000 - val_loss: 6.6609 - val_acc: 0.3073
Epoch 6/10
38/38 - 7s - loss: 2.3102 - acc: 1.0000 - val_loss: 6.2177 - val_acc: 0.3048
Epoch 7/10
38/38 - 7s - loss: 1.9311 - acc: 1.0000 - val_loss: 5.8974 - val_acc: 0.2796
Epoch 8/10
38/38 - 7s - loss: 1.6819 - acc: 1.0000 - val_loss: 5.7161 - val_acc: 0.2897
Epoch 9/10
38/38 - 7s - loss: 1.5206 - acc: 1.0000 - val_loss: 5.5675 - val_acc: 0.2972
Epoch 10/10
38/38 - 8s - loss: 1.3919 - acc: 1.0000 - val_loss: 5.4300 - val_acc: 0.3023


In [178]:
test_preds = model.predict(X_test_sc)

In [179]:
test_label = np.argmax(test_preds, axis = 1)

In [180]:
test_pred_label = le.inverse_transform(test_label)

In [181]:
test_true  = np.argmax(y_test,axis =1)

In [182]:
label_true = le.inverse_transform(test_true)

In [183]:
pd.DataFrame(zip(label_true, test_pred_label), columns= ['True','Predicted']).head(20)

Unnamed: 0,True,Predicted
0,Ayo & Teo,Silento
1,Kirko Bangz,Kirko Bangz
2,MF DOOM,50 Cent
3,Three 6 Mafia,Future
4,Tyga,Wale
5,Young Thug,Young Thug
6,G-Eazy,ScHoolboy Q
7,Lil Tjay,Lil Tecca
8,Jim Jones,Rick Ross
9,Waka Flocka Flame,Waka Flocka Flame


In [173]:
model = Sequential()
model.add(Dense(X_train_sc.shape[0], activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(159, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

history = model.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=5,
    batch_size=None,
    verbose=2
)

Epoch 1/5
38/38 - 4s - loss: 6.0764 - acc: 0.0160 - val_loss: 4.8292 - val_acc: 0.0781
Epoch 2/5
38/38 - 4s - loss: 1.4784 - acc: 0.7231 - val_loss: 4.2637 - val_acc: 0.1788
Epoch 3/5
38/38 - 4s - loss: 0.1097 - acc: 0.9731 - val_loss: 4.1611 - val_acc: 0.2040
Epoch 4/5
38/38 - 4s - loss: 0.0342 - acc: 0.9916 - val_loss: 4.2916 - val_acc: 0.2091
Epoch 5/5
38/38 - 4s - loss: 0.0152 - acc: 0.9958 - val_loss: 4.1827 - val_acc: 0.2065


In [175]:
model = Sequential()
model.add(Dense(X_train_sc.shape[0], activation='relu',kernel_regularizer=l2(0.003)))
model.add(Dropout(0.3))
model.add(Dense(256, activation='relu',kernel_regularizer=l2(0.003)))
model.add(Dropout(0.3))
model.add(Dense(159, activation='softmax',kernel_regularizer=l2(0.003)))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

history = model.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=8,
    batch_size=None,
    verbose=2
)

Epoch 1/8
38/38 - 8s - loss: 12.2918 - acc: 0.0185 - val_loss: 10.5593 - val_acc: 0.0680
Epoch 2/8
38/38 - 6s - loss: 8.1718 - acc: 0.4806 - val_loss: 9.9417 - val_acc: 0.2242
Epoch 3/8
38/38 - 6s - loss: 6.1613 - acc: 0.8662 - val_loss: 9.3371 - val_acc: 0.2645
Epoch 4/8
38/38 - 7s - loss: 5.2123 - acc: 0.9613 - val_loss: 8.7100 - val_acc: 0.2594
Epoch 5/8
38/38 - 6s - loss: 4.5067 - acc: 0.9857 - val_loss: 8.0980 - val_acc: 0.2720
Epoch 6/8
38/38 - 7s - loss: 3.9497 - acc: 0.9891 - val_loss: 7.6050 - val_acc: 0.2922
Epoch 7/8
38/38 - 6s - loss: 3.4301 - acc: 0.9916 - val_loss: 7.1895 - val_acc: 0.2922
Epoch 8/8
38/38 - 7s - loss: 3.0817 - acc: 0.9924 - val_loss: 6.9015 - val_acc: 0.2846


### Ada Boost

In [68]:
pipe_ada = Pipeline([
    ('cv', CountVectorizer()),
    ('ada', AdaBoostClassifier())
])

In [69]:
pipe_ada.fit(X_train,y_train)

print(pipe_ada.score(X_train,y_train))

print(pipe_ada.score(X_test,y_test))

0.020202020202020204
0.012594458438287154
