In [1]:
#!pip install scikit-learn==0.23.1

In [1]:
import numpy as np
import pandas as pd
import re
import sklearn
import nltk
import gensim
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import LatentDirichletAllocation

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaMulticore

# Data cleaning and splitting

In [2]:
#import dataset, remove the 'link' column and no match' rows
song_data = pd.read_csv("subsongdata_57650.csv")
song_data = song_data[['artist','song','text','explicit_label']]
song_data = song_data.loc[song_data['explicit_label'] != 'no match']

In [3]:
#remove'\n' from the lyrics
re_drop = re.compile(r'\n')        
song_data[['text']] = song_data[['text']].applymap(lambda x:re_drop.sub(' ',x))
song_data

Unnamed: 0,artist,song,text,explicit_label
1,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently...",False
2,ABBA,As Good As New,I'll never know why I had to go Why I had to...,False
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,False
7,ABBA,Chiquitita,"Chiquitita, tell me what's wrong You're ench...",False
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o...",False
...,...,...,...,...
57593,Zao,To Think Of You Is To Treasure An Absent Memory,When you shut your eyes and fell asleep Dark...,False
57605,Zebra,As I Said Before,And I said before I don't want no more And...,False
57608,Zebra,Hard Living Without You,Nothing to say no place to hide I can't find...,False
57609,Zebra,When You Get There,You wake up in the morning And you're not fe...,False


In [4]:
#extract all the rows with explicit_label = True
song_data_1 = song_data.loc[song_data['explicit_label'] == 'True']
song_data_1

Unnamed: 0,artist,song,text,explicit_label
128,Adam Sandler,Mr. Bake-O,I'm sitting in my chair watching the ...,True
129,Adam Sandler,Steve Polychronopolous,I'm a big fuckin' dick I'm a pain in your as...,True
909,Avril Lavigne,Here's To Never Growing Up,Singing Radiohead at the top of our lungs Wi...,True
929,Avril Lavigne,Rock N Roll,Let them know that we're still rock n roll I...,True
2089,Bruno Mars,Treasure,"Give me all, give me all, give me all your att...",True
...,...,...,...,...
57544,Z-Ro,Gripping Grain,"What, is this bitch fucking with Z-Ro Mother...",True
57545,Z-Ro,Guerilla Till I Die,"[Hook - 2x] Guerilla till I die, mama don't ...",True
57550,Z-Ro,Lord Tell Me Why,"(Chorus - 2x) Lord tell me why Ooh child, ...",True
57560,Z-Ro,Talkin' Down On Me,"[Z-Ro] Since I been out by myself, I keep th...",True


In [5]:
#ramdomly extract 4068 rows with explicit_label = False, which is 3 times as many as song_data_1
song_data_0 = song_data.loc[song_data['explicit_label'] == 'False']
song_data_0 = song_data_0.sample(n=4068, replace=False, random_state=100)
song_data_0

Unnamed: 0,artist,song,text,explicit_label
51337,Santana,Somewhere In Heaven,Somewhere In heaven There is a place Wai...,False
28056,Cliff Richard,Can't Take The Hurt Anymore,Now that the past has ended My life's an ope...,False
46109,Nitty Gritty Dirt Band,Stand A Little Rain,Dark cloud is coming Headed straight for you...,False
19827,U2,Dirty Day,I don't know you And you don't know the half...,False
44161,Misfits,Only Make Believe,People see us everywhere They think you real...,False
...,...,...,...,...
17703,Rihanna,We All Want Love,"We all, we all, we all, we all, we all We al...",False
48905,Pretenders,2000 Miles,He's gone two thousand miles It's very far ...,False
22776,ZZ Top,My Mind Is Gone,"No keyboards, just Pearly Gates and a lot of s...",False
37852,Jimmy Buffett,Beach House On The Moon,"Cameron's getting logical, A Vulcan in disgu...",False


In [6]:
song_data = pd.concat([song_data_0,song_data_1], axis = 0)
song_data

Unnamed: 0,artist,song,text,explicit_label
51337,Santana,Somewhere In Heaven,Somewhere In heaven There is a place Wai...,False
28056,Cliff Richard,Can't Take The Hurt Anymore,Now that the past has ended My life's an ope...,False
46109,Nitty Gritty Dirt Band,Stand A Little Rain,Dark cloud is coming Headed straight for you...,False
19827,U2,Dirty Day,I don't know you And you don't know the half...,False
44161,Misfits,Only Make Believe,People see us everywhere They think you real...,False
...,...,...,...,...
57544,Z-Ro,Gripping Grain,"What, is this bitch fucking with Z-Ro Mother...",True
57545,Z-Ro,Guerilla Till I Die,"[Hook - 2x] Guerilla till I die, mama don't ...",True
57550,Z-Ro,Lord Tell Me Why,"(Chorus - 2x) Lord tell me why Ooh child, ...",True
57560,Z-Ro,Talkin' Down On Me,"[Z-Ro] Since I been out by myself, I keep th...",True


In [7]:
lyrics_data = []
for i in range(len(song_data)):
    text = song_data.iloc[i,2]
    lyrics_data.append(text)

In [8]:
#lyrics_data[0]
#lyrics_data[5423]
#len(lyrics_data)
lyrics_data

["Somewhere   In heaven   There is a place   Waiting for you and me      He made a promise   Gave every drop of Blood   Died on the cross   so we'd be free.      Oh Oh Oh Oh      Somewhere, Somewhere   In Heaven   There's a place   Waiting, waiting for you and me      He made a promise   Gave every drop of Blood   And died on the cross   So we'd be free.      Somewhere, Somewhere   In Heaven   There is a place   Waiting, waiting for you and me      He made a promise   When He gave every drop of His blood   Died on the cross   So we'd be free.  ",
 "Now that the past has ended   My life's an open door   I know that we could have been mended   Oh but I can't take the hurt anymore   Can't take the hurt anymore   Ooh ooh can't take the hurt anymore   The rainbow's over   But I've seen it leave before   A love that's been left in the corner   Ooh 'cause I can't take the hurt anymore   Can't take the hurt anymore   Ooh ooh can't take the hurt anymore   Lady of my lady's   Can't we find the r

In [9]:
lyrics_label = []
for i in range(len(song_data)):
    label = song_data.iloc[i,3]
    if label=="False":
        label = 0
    else:
        label = 1
    lyrics_label.append(label)

In [10]:
#lyrics_label[0]
#lyrics_label[5423]
#len(lyrics_label)
lyrics_label

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [11]:
train_data, test_data, train_label, test_label = train_test_split(lyrics_data, lyrics_label, test_size=0.3, random_state=100)

In [12]:
train_data

["Bawitdaba, da bang, da dang diggy diggy, diggy, said the boogie, said up jump the boogie      My name is kid   Kid rock!      Bawitdaba da bang da bang diggy diggy diggy   Shake the boogie said up jump the boogie   Bawitdaba da bang da bang diggy diggy diggy   Shake the boogie said up jump the boogie      Bawitdaba da bang da bang diggy diggy diggy   Shake the boogie said up jump the boogie   Bawitdaba da bang da bang diggy diggy diggy   Shake the boogie said up jump the boogie      And this is for the questions that don't have an answer   The midnight glances   And the topless dancers   The can of freaks      Cars packed with speakers   The g's with the forty's   And the chicks with beepers   The northern lights      And the southern comfort   And it don't even matter if your veins are punctured   All the crackheads, the critics, the cities   And all my heroes at the methodone clinics      All you bastards at the i.r.s.   For the crooked cops and the cluttered desks   For the shots 

In [13]:
train_label

[1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,


In [14]:
test_data

["Till I had you I didn't know   That I was missing out   Had to grow up and see the world   Through different shades of doubt   Give me one more chance to dream again   One more chance to feel again   Through your young heart   If only for one day help me try   I wanna see Christmas through your eyes   I want everything to be the way it used to be   Back to being a child again, thinking the world was mine   I wanna see Christmas, Christmas through your eyes   I see the rain, you see the rainbow   Hiding in the clouds   Never afraid to let your love show   Won't you show me how   Wanna learn how to believe again   Find the innocence in me again   Through your young heart   Help me find a way, help me try   I wanna see Christmas through your eyes   I want everything to be the way it used to be   Back to being a child again thinking the world was kind   I wanna see Christmas, Christmas through your eyes   I wanna see Christmas through your eyes   I want everything to be the way it used t

In [15]:
test_label

[0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,


# Classification Models

In [16]:
vectorizer = CountVectorizer(stop_words = 'english') #TfidfVectorizer()
vectorizer.fit(train_data)
train_vecs = vectorizer.transform(train_data)
test_vecs = vectorizer.transform(test_data)

### Logistic Regression

In [17]:
def classification(vectorizer, model, fit_vect=False):
  if fit_vect:
    vectorizer.fit(train_data)
    
  train_vecs = vectorizer.transform(train_data)
  test_vecs = vectorizer.transform(test_data)
    
  model.fit(train_vecs, train_label)

  train_preds = model.predict(train_vecs)
  train_f1 = f1_score(train_label, train_preds, average='micro')

  test_preds = model.predict(test_vecs)
  test_f1 = f1_score(test_label, test_preds, average='micro')

  cm = confusion_matrix(test_label, test_preds)
  print("Confusion Matrix : \n", cm, " \n")

  report = classification_report(test_label, test_preds)
  print(report)

  return train_f1,test_f1

### Turn parameter

In [18]:
number = list(np.arange(0.1, 1.0, 0.1))
for c in number:
    model = LogisticRegression(C=c)
    print(c)
    classification(vectorizer, model)

0.1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion Matrix : 
 [[1184   43]
 [ 144  257]]  

              precision    recall  f1-score   support

           0       0.89      0.96      0.93      1227
           1       0.86      0.64      0.73       401

    accuracy                           0.89      1628
   macro avg       0.87      0.80      0.83      1628
weighted avg       0.88      0.89      0.88      1628

0.2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion Matrix : 
 [[1181   46]
 [ 142  259]]  

              precision    recall  f1-score   support

           0       0.89      0.96      0.93      1227
           1       0.85      0.65      0.73       401

    accuracy                           0.88      1628
   macro avg       0.87      0.80      0.83      1628
weighted avg       0.88      0.88      0.88      1628

0.30000000000000004


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion Matrix : 
 [[1178   49]
 [ 138  263]]  

              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1227
           1       0.84      0.66      0.74       401

    accuracy                           0.89      1628
   macro avg       0.87      0.81      0.83      1628
weighted avg       0.88      0.89      0.88      1628

0.4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion Matrix : 
 [[1178   49]
 [ 138  263]]  

              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1227
           1       0.84      0.66      0.74       401

    accuracy                           0.89      1628
   macro avg       0.87      0.81      0.83      1628
weighted avg       0.88      0.89      0.88      1628

0.5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion Matrix : 
 [[1173   54]
 [ 136  265]]  

              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1227
           1       0.83      0.66      0.74       401

    accuracy                           0.88      1628
   macro avg       0.86      0.81      0.83      1628
weighted avg       0.88      0.88      0.88      1628

0.6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion Matrix : 
 [[1171   56]
 [ 136  265]]  

              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1227
           1       0.83      0.66      0.73       401

    accuracy                           0.88      1628
   macro avg       0.86      0.81      0.83      1628
weighted avg       0.88      0.88      0.88      1628

0.7000000000000001


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion Matrix : 
 [[1168   59]
 [ 137  264]]  

              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1227
           1       0.82      0.66      0.73       401

    accuracy                           0.88      1628
   macro avg       0.86      0.81      0.83      1628
weighted avg       0.88      0.88      0.87      1628

0.8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion Matrix : 
 [[1168   59]
 [ 136  265]]  

              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1227
           1       0.82      0.66      0.73       401

    accuracy                           0.88      1628
   macro avg       0.86      0.81      0.83      1628
weighted avg       0.88      0.88      0.88      1628

0.9
Confusion Matrix : 
 [[1167   60]
 [ 136  265]]  

              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1227
           1       0.82      0.66      0.73       401

    accuracy                           0.88      1628
   macro avg       0.86      0.81      0.83      1628
weighted avg       0.88      0.88      0.88      1628



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
model = LogisticRegression(C=0.1)
classification(vectorizer, model)

Confusion Matrix : 
 [[1184   43]
 [ 144  257]]  

              precision    recall  f1-score   support

           0       0.89      0.96      0.93      1227
           1       0.86      0.64      0.73       401

    accuracy                           0.89      1628
   macro avg       0.87      0.80      0.83      1628
weighted avg       0.88      0.89      0.88      1628



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.9847207586933614, 0.8851351351351351)

# Topic Modeling

## LDA

### for explict lyrics

Approach 1

In [20]:
explict_lyrics = song_data_1.text
explict_lyrics

128               I'm sitting in my chair watching the ...
129      I'm a big fuckin' dick   I'm a pain in your as...
909      Singing Radiohead at the top of our lungs   Wi...
929      Let them know that we're still rock n roll   I...
2089     Give me all, give me all, give me all your att...
                               ...                        
57544    What, is this bitch fucking with Z-Ro   Mother...
57545    [Hook - 2x]   Guerilla till I die, mama don't ...
57550    (Chorus - 2x)   Lord tell me why   Ooh child, ...
57560    [Z-Ro]   Since I been out by myself, I keep th...
57561    [Z-Ro]   Houston Texas is my city, Ridgemont 4...
Name: text, Length: 1356, dtype: object

In [21]:
explict_lyrics_list = song_data_1.text.values.tolist()
explict_lyrics_list

["         I'm sitting in my chair watching the TV   It's not even on but there's plenty for me to see   I just lit some crazy ass shit   that my friend overnight mailed to me      I'm fucking wasted   It's the best shit I ever tasted   I think they fucking laced it   Cause I'm so damn lambasted      Oh my friend came over so I packed him a pipe   I told him he better go easy with this shit but he didn't believe the hype   He sparked three bows just to show he could take it   Two minutes later he was playing backgammon naked      He's fucking wasted   It's the best shit he ever tasted   He's lost in fucking spaced-ed   Cause he's so wicked wicked wasted      Oh I spent the last two hours   hiding under my bed   Cause I looked in the garbage can   and I think I saw my Uncle Louie's head      I'm fucking wasted      Well my friend blew a hit into my pet bird's face   The bird laughed hysterically and started to moonwalk all over the place   He tripped over the toaster wire and fell on hi

In [22]:
lda_tfidf = TfidfVectorizer(stop_words = 'english')
explict_lyrics_vecs = lda_tfidf.fit_transform(explict_lyrics_list)

In [26]:
lda = LatentDirichletAllocation(n_components = 5, random_state = 100)
lda.fit(explict_lyrics_vecs)

LatentDirichletAllocation(n_components=5, random_state=100)

In [27]:
for index, topic in enumerate(lda.components_):
    print(f'Top 15 words for Topic # {index}')
    print([lda_tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

Top 15 words for Topic # 0
['pon', 'carnival', 'diggy', 'angelfuck', 'patna', 'woogie', 'turkee', 'carnage', 'tek', 'nuh', 'seh', 'gal', 'mi', 'di', 'yuh']


Top 15 words for Topic # 1
['ruca', 'aura', 'twilight', 'cigaro', 'calmer', 'gwaan', 'evolution', 'canned', 'winkle', 'nying', 'birdy', 'hellion', 'ramona', 'phresh', 'starfuckers']


Top 15 words for Topic # 2
['wasteland', 'ringmaster', 'nicole', 'garment', 'steals', 'hem', 'hypocrites', 'superficial', 'hott', 'purpose', 'bicken', 'gunn', 'tormentor', 'brazy', 'fikir']


Top 15 words for Topic # 3
['shit', 'ain', 'cause', 'nigga', 'want', 'll', 'fuck', 'yeah', 'love', 'oh', 'just', 'know', 'got', 'don', 'like']


Top 15 words for Topic # 4
['jugglin', 'hahn', 'lizard', 'perverts', 'juggalas', 'hanh', 'hannah', 'apocalypse', 'peacock', 'ion', 'widowmaker', 'ebin', 'superstitious', 'sikamikanico', 'demonomania']




In [28]:
print(lda.components_)

[[0.20005162 0.20002057 0.20003855 ... 0.20003619 0.20004542 0.20003115]
 [0.20005179 0.20002062 0.20003868 ... 0.20003631 0.20317633 0.20003122]
 [0.20005193 0.20002067 0.20003878 ... 0.20003641 0.20004616 0.20003131]
 [0.30763584 0.47020678 0.26726226 ... 0.24677513 0.38376316 0.289582  ]
 [0.2000519  0.20002066 0.20003876 ... 0.20003639 0.20004561 0.20003129]]


Approach 2

In [29]:
len(STOPWORDS)
stopwords = set(STOPWORDS)
stopwords.update(["yeah", "oh", "ya", "let"])
len(stopwords)

341

In [30]:
def lemmatization(text):
    wnl = WordNetLemmatizer()
    lemmatization = wnl.lemmatize(text, pos='n')
    return lemmatization

In [31]:
def lemmatization_stemming(text):
    wnl = WordNetLemmatizer()
    lemmatization = wnl.lemmatize(text, pos='n')
    
    stemmer = PorterStemmer()
    stemming = stemmer.stem(lemmatization)
    return stemming

In [32]:
#token = simple_preprocess(explict_lyrics_data[0])
#token

In [33]:
def text_preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in stopwords and len(token) > 3:
            result.append(lemmatization_stemming(token))
    return result

In [34]:
processed_lyrics = song_data_1['text'].map(text_preprocess)
processed_lyrics

128      [sit, chair, watch, plenti, crazi, shit, frien...
129      [fuckin, dick, pain, drink, beer, slice, charl...
909      [sing, radiohead, lung, boom, blare, fall, lov...
929      [know, rock, roll, care, makeup, like, better,...
2089     [attent, babi, tell, littl, wonder, flawless, ...
                               ...                        
57544    [bitch, fuck, motherfuck, citi, bitch, know, h...
57545    [hook, guerilla, till, mama, come, blast, guer...
57550    [choru, lord, tell, child, thing, gonna, easie...
57560    [bitch, nina, enemi, want, nigga, seen, hater,...
57561    [houston, texa, citi, ridgemont, block, trust,...
Name: text, Length: 1356, dtype: object

In [35]:
dictionary = Dictionary(processed_lyrics)
dictionary.filter_extremes()

In [36]:
#len(dictionary)
#print(dictionary)

In [37]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_lyrics]
#bow_corpus

In [38]:
tfidf_model = TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

In [39]:
# bag of words lda model
lda_model = LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.014*"time" + 0.013*"want" + 0.011*"love" + 0.011*"need" + 0.010*"come" + 0.009*"feel" + 0.008*"shit" + 0.008*"tell" + 0.008*"everybodi" + 0.007*"wanna"
Topic: 1 
Words: 0.016*"girl" + 0.012*"come" + 0.010*"feel" + 0.010*"time" + 0.010*"babi" + 0.010*"shit" + 0.009*"right" + 0.008*"nigga" + 0.008*"life" + 0.007*"wanna"
Topic: 2 
Words: 0.029*"nigga" + 0.013*"bitch" + 0.013*"shit" + 0.012*"want" + 0.009*"come" + 0.009*"tell" + 0.007*"choru" + 0.007*"money" + 0.007*"feel" + 0.007*"right"
Topic: 3 
Words: 0.032*"love" + 0.012*"babi" + 0.011*"bitch" + 0.010*"want" + 0.010*"nigga" + 0.010*"girl" + 0.007*"come" + 0.007*"choru" + 0.007*"gotta" + 0.007*"time"
Topic: 4 
Words: 0.014*"want" + 0.013*"come" + 0.011*"nigga" + 0.011*"bitch" + 0.010*"love" + 0.010*"shit" + 0.009*"littl" + 0.009*"right" + 0.008*"choru" + 0.007*"time"


In [40]:
# tfidf lda model num_topics=5
lda_model2 = LdaMulticore(tfidf_corpus, num_topics=5, id2word=dictionary, workers=2)
for idx, topic in lda_model2.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.003*"nigga" + 0.003*"want" + 0.003*"love" + 0.003*"feel" + 0.003*"choru" + 0.002*"wanna" + 0.002*"bitch" + 0.002*"live" + 0.002*"shit" + 0.002*"come"
Topic: 1 
Words: 0.005*"nigga" + 0.003*"choru" + 0.003*"girl" + 0.003*"babi" + 0.003*"love" + 0.002*"gotta" + 0.002*"need" + 0.002*"feel" + 0.002*"shit" + 0.002*"want"
Topic: 2 
Words: 0.006*"nigga" + 0.003*"girl" + 0.003*"love" + 0.003*"bitch" + 0.003*"money" + 0.003*"want" + 0.003*"shit" + 0.002*"right" + 0.002*"feel" + 0.002*"time"
Topic: 3 
Words: 0.004*"love" + 0.003*"hate" + 0.003*"babi" + 0.003*"nigga" + 0.002*"want" + 0.002*"feel" + 0.002*"choru" + 0.002*"come" + 0.002*"money" + 0.002*"bitch"
Topic: 4 
Words: 0.004*"nigga" + 0.003*"love" + 0.003*"bitch" + 0.003*"babi" + 0.003*"shit" + 0.003*"want" + 0.003*"choru" + 0.002*"good" + 0.002*"thing" + 0.002*"girl"


In [41]:
pyLDAvis.enable_notebook()
id2word = dictionary
vis = pyLDAvis.gensim.prepare(lda_model2, tfidf_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


Approach 3

In [66]:
lda_model3 = gensim.models.ldamodel.LdaModel(tfidf_corpus, num_topics=2, id2word=dictionary)
for idx, topic in lda_model3.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.003*"nigga" + 0.003*"babi" + 0.003*"want" + 0.003*"love" + 0.003*"time" + 0.003*"bitch" + 0.003*"choru" + 0.002*"girl" + 0.002*"come" + 0.002*"gonna"
Topic: 1 
Words: 0.005*"nigga" + 0.003*"love" + 0.003*"shit" + 0.003*"feel" + 0.003*"choru" + 0.003*"girl" + 0.002*"bitch" + 0.002*"wanna" + 0.002*"need" + 0.002*"life"


In [67]:
pyLDAvis.enable_notebook()
id2word = dictionary
vis = pyLDAvis.gensim.prepare(lda_model3, tfidf_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### Remove dirty words

In [44]:
file = open('abbo.txt','r')
file = list(file)
bad_words = []
for w in file:
    bad_words.append(re.sub(r'\n','',w))
bad_words

['abbo',
 'abo',
 'abortion',
 'abuse',
 'addict',
 'addicts',
 'adult',
 'africa',
 'african',
 'alla',
 'allah',
 'alligatorbait',
 'amateur',
 'american',
 'anal',
 'analannie',
 'analsex',
 'angie',
 'angry',
 'anus',
 'arab',
 'arabs',
 'areola',
 'argie',
 'aroused',
 'arse',
 'arsehole',
 'asian',
 'ass',
 'assassin',
 'assassinate',
 'assassination',
 'assault',
 'assbagger',
 'assblaster',
 'assclown',
 'asscowboy',
 'asses',
 'assfuck',
 'assfucker',
 'asshat',
 'asshole',
 'assholes',
 'asshore',
 'assjockey',
 'asskiss',
 'asskisser',
 'assklown',
 'asslick',
 'asslicker',
 'asslover',
 'assman',
 'assmonkey',
 'assmunch',
 'assmuncher',
 'asspacker',
 'asspirate',
 'asspuppies',
 'assranger',
 'asswhore',
 'asswipe',
 'athletesfoot',
 'attack',
 'australian',
 'babe',
 'babies',
 'backdoor',
 'backdoorman',
 'backseat',
 'badfuck',
 'balllicker',
 'balls',
 'ballsack',
 'banging',
 'baptist',
 'barelylegal',
 'barf',
 'barface',
 'barfface',
 'bast',
 'bastard ',
 'bazonga

In [49]:
stopwords = set(STOPWORDS)
stopwords.update(bad_words)
stopwords.update(["yeah","oh", "ya", "let"])
len(stopwords)

1723

In [50]:
def text_preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in stopwords and len(token) > 3:
            result.append(lemmatization_stemming(token))
    return result

In [51]:
processedlyrics = song_data_1['text'].map(text_preprocess)

Ddictionary = Dictionary(processedlyrics)
Ddictionary.filter_extremes()

Bow_corpus = [Ddictionary.doc2bow(doc) for doc in processedlyrics]

Tfidf_model = TfidfModel(Bow_corpus)
Tfidf_corpus = Tfidf_model[Bow_corpus]

In [53]:
lda_model4 = gensim.models.ldamodel.LdaModel(Tfidf_corpus, num_topics=5, id2word=Ddictionary)
for idx, topic in lda_model4.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.003*"love" + 0.003*"babi" + 0.003*"wanna" + 0.003*"nigga" + 0.002*"feel" + 0.002*"wait" + 0.002*"gonna" + 0.002*"need" + 0.002*"tell" + 0.002*"choru"
Topic: 1 
Words: 0.003*"love" + 0.003*"want" + 0.003*"choru" + 0.003*"girl" + 0.003*"babi" + 0.003*"life" + 0.003*"said" + 0.003*"tonight" + 0.003*"feel" + 0.003*"come"
Topic: 2 
Words: 0.004*"love" + 0.004*"money" + 0.003*"choru" + 0.003*"want" + 0.003*"time" + 0.002*"right" + 0.002*"think" + 0.002*"come" + 0.002*"nigga" + 0.002*"life"
Topic: 3 
Words: 0.004*"feel" + 0.003*"girl" + 0.003*"love" + 0.003*"choru" + 0.003*"come" + 0.003*"want" + 0.003*"time" + 0.003*"hate" + 0.002*"night" + 0.002*"think"
Topic: 4 
Words: 0.003*"want" + 0.003*"right" + 0.003*"need" + 0.002*"nigga" + 0.002*"love" + 0.002*"choru" + 0.002*"babi" + 0.002*"feel" + 0.002*"look" + 0.002*"wanna"


### for normal lyrics

In [54]:
stopwords = set(STOPWORDS)
#stopwords.update(bad_words)
stopwords.update(["yeah","oh", "ya", "let"])
len(stopwords)

341

In [55]:
def text_preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in stopwords and len(token) > 3:
            result.append(lemmatization_stemming(token))
    return result

In [59]:
processed_normal_lyrics = song_data_0['text'].map(text_preprocess)

normal_dictionary = Dictionary(processed_normal_lyrics)
normal_dictionary.filter_extremes()

bow_corpus_normal = [normal_dictionary.doc2bow(doc) for doc in processed_normal_lyrics]

tfidf_model_normal = TfidfModel(bow_corpus_normal)
tfidf_corpus_normal = tfidf_model_normal[bow_corpus_normal]

lda_model_normal = gensim.models.ldamodel.LdaModel(tfidf_corpus_normal, num_topics=10, id2word=normal_dictionary)

for idx, topic in lda_model_normal.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.024*"weigh" + 0.020*"cat" + 0.013*"cowboy" + 0.011*"coin" + 0.009*"melodi" + 0.009*"fountain" + 0.009*"arrow" + 0.009*"lost" + 0.008*"eager" + 0.008*"corpor"
Topic: 1 
Words: 0.014*"abl" + 0.013*"round" + 0.011*"song" + 0.011*"spread" + 0.009*"puso" + 0.009*"wala" + 0.008*"crazi" + 0.008*"sunshin" + 0.007*"till" + 0.007*"built"
Topic: 2 
Words: 0.015*"admit" + 0.008*"physic" + 0.008*"throat" + 0.007*"cure" + 0.006*"frozen" + 0.006*"discov" + 0.006*"hymn" + 0.006*"wreck" + 0.006*"cool" + 0.005*"babi"
Topic: 3 
Words: 0.025*"awhil" + 0.018*"harder" + 0.015*"occup" + 0.014*"snow" + 0.012*"vocal" + 0.011*"sing" + 0.010*"colder" + 0.009*"row" + 0.009*"good" + 0.008*"honest"
Topic: 4 
Words: 0.016*"want" + 0.011*"love" + 0.010*"lift" + 0.010*"wrong" + 0.009*"bitter" + 0.009*"high" + 0.009*"mighti" + 0.009*"celebr" + 0.008*"aliv" + 0.008*"come"
Topic: 5 
Words: 0.007*"feel" + 0.006*"love" + 0.005*"safe" + 0.005*"tonight" + 0.005*"gone" + 0.005*"like" + 0.005*"littl" + 0.005

In [61]:
pyLDAvis.enable_notebook()
id2word = normal_dictionary
vis = pyLDAvis.gensim.prepare(lda_model_normal, tfidf_corpus_normal, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [62]:
lda_model_normal2 = gensim.models.ldamodel.LdaModel(tfidf_corpus_normal, num_topics=8, id2word=normal_dictionary)

for idx, topic in lda_model_normal2.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.009*"love" + 0.008*"want" + 0.007*"feel" + 0.007*"tonight" + 0.006*"thousand" + 0.005*"tomorrow" + 0.005*"bitter" + 0.005*"lisa" + 0.005*"mighti" + 0.005*"come"
Topic: 1 
Words: 0.011*"wanna" + 0.010*"occup" + 0.008*"seventi" + 0.007*"glorifi" + 0.007*"good" + 0.007*"love" + 0.007*"coin" + 0.006*"babi" + 0.006*"mule" + 0.006*"fountain"
Topic: 2 
Words: 0.017*"eddi" + 0.014*"cat" + 0.007*"aliv" + 0.005*"disintegr" + 0.005*"circumst" + 0.005*"think" + 0.005*"disco" + 0.005*"purpl" + 0.004*"depart" + 0.004*"destruct"
Topic: 3 
Words: 0.010*"bell" + 0.009*"lucki" + 0.008*"want" + 0.008*"born" + 0.008*"ring" + 0.008*"mama" + 0.007*"hallelujah" + 0.007*"ladi" + 0.007*"mother" + 0.006*"ooooh"
Topic: 4 
Words: 0.011*"fortress" + 0.008*"built" + 0.008*"tearin" + 0.007*"love" + 0.006*"darkest" + 0.005*"bald" + 0.005*"gonna" + 0.005*"bright" + 0.005*"chain" + 0.004*"caus"
Topic: 5 
Words: 0.007*"lost" + 0.006*"travelin" + 0.006*"walk" + 0.006*"morn" + 0.006*"aliv" + 0.006*"peop

In [63]:
pyLDAvis.enable_notebook()
id2word = normal_dictionary
vis = pyLDAvis.gensim.prepare(lda_model_normal2, tfidf_corpus_normal, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [64]:
lda_model_normal3 = gensim.models.ldamodel.LdaModel(tfidf_corpus_normal, num_topics=6, id2word=normal_dictionary)

for idx, topic in lda_model_normal3.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.009*"awhil" + 0.008*"lucki" + 0.008*"good" + 0.007*"distanc" + 0.007*"hallelujah" + 0.007*"ride" + 0.006*"love" + 0.006*"sing" + 0.006*"ladi" + 0.006*"lost"
Topic: 1 
Words: 0.011*"bell" + 0.008*"sacr" + 0.008*"ring" + 0.008*"cowboy" + 0.007*"darkest" + 0.007*"wonder" + 0.005*"bald" + 0.005*"year" + 0.005*"time" + 0.004*"land"
Topic: 2 
Words: 0.016*"bitter" + 0.015*"puso" + 0.007*"dri" + 0.006*"hush" + 0.006*"landscap" + 0.004*"carol" + 0.003*"darl" + 0.003*"come" + 0.003*"love" + 0.003*"poverti"
Topic: 3 
Words: 0.012*"want" + 0.011*"abl" + 0.010*"lisa" + 0.007*"cure" + 0.006*"wala" + 0.005*"laugh" + 0.005*"choru" + 0.005*"feel" + 0.005*"warm" + 0.005*"caus"
Topic: 4 
Words: 0.009*"love" + 0.006*"feel" + 0.006*"tonight" + 0.005*"like" + 0.005*"come" + 0.005*"world" + 0.005*"life" + 0.005*"littl" + 0.004*"away" + 0.004*"think"
Topic: 5 
Words: 0.009*"weigh" + 0.008*"travelin" + 0.007*"mann" + 0.007*"built" + 0.007*"walk" + 0.006*"style" + 0.006*"harder" + 0.006*"rid

In [65]:
pyLDAvis.enable_notebook()
id2word = normal_dictionary
vis = pyLDAvis.gensim.prepare(lda_model_normal3, tfidf_corpus_normal, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [73]:
lda_model_normal4 = gensim.models.ldamodel.LdaModel(tfidf_corpus_normal, num_topics=4, id2word=normal_dictionary)

for idx, topic in lda_model_normal4.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.008*"love" + 0.006*"want" + 0.006*"come" + 0.005*"feel" + 0.005*"tonight" + 0.005*"good" + 0.005*"caus" + 0.005*"walk" + 0.005*"wrong" + 0.004*"life"
Topic: 1 
Words: 0.006*"want" + 0.006*"love" + 0.006*"bitter" + 0.005*"burn" + 0.005*"fool" + 0.005*"gone" + 0.005*"bell" + 0.005*"lucki" + 0.005*"know" + 0.005*"away"
Topic: 2 
Words: 0.008*"abl" + 0.007*"mari" + 0.006*"puso" + 0.006*"choru" + 0.006*"wala" + 0.005*"safe" + 0.005*"cure" + 0.004*"love" + 0.004*"alright" + 0.004*"better"
Topic: 3 
Words: 0.010*"lisa" + 0.010*"lost" + 0.009*"mighti" + 0.008*"spread" + 0.007*"hallelujah" + 0.007*"peopl" + 0.007*"round" + 0.006*"littl" + 0.006*"ooooh" + 0.005*"life"


In [69]:
pyLDAvis.enable_notebook()
id2word = normal_dictionary
vis = pyLDAvis.gensim.prepare(lda_model_normal4, tfidf_corpus_normal, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [78]:
lda_model_normal5 = gensim.models.ldamodel.LdaModel(tfidf_corpus_normal, num_topics=2, id2word=normal_dictionary)

for idx, topic in lda_model_normal5.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.008*"love" + 0.007*"want" + 0.005*"tonight" + 0.005*"feel" + 0.004*"think" + 0.004*"good" + 0.004*"come" + 0.004*"choru" + 0.004*"ride" + 0.004*"like"
Topic: 1 
Words: 0.005*"love" + 0.005*"world" + 0.004*"know" + 0.004*"gone" + 0.004*"feel" + 0.004*"like" + 0.004*"life" + 0.004*"want" + 0.004*"littl" + 0.004*"come"


In [80]:
pyLDAvis.enable_notebook()
id2word = normal_dictionary
vis = pyLDAvis.gensim.prepare(lda_model_normal5, tfidf_corpus_normal, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
