In [106]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import NMF

### Define a class for pre-processing and make DataFrame from URL from each artist

In [15]:
class lyric:
    def artist(url):
        soup = BeautifulSoup(requests.get(url).text,'html.parser')
        table = soup.find('table', {'class':'tdata'})
        href_links = []
        lyric_links=[]
        lyrics=[] 
        for link in table.findAll('a'):
            href_links.append(link.get('href'))

        for i in href_links:
            if ([i.startswith('/sublyric/'),i.startswith('/lyric-lf/')]):
                i = 'https://www.lyrics.com' + i
                lyric_links.append(i)

        for t in lyric_links:
            i = (BeautifulSoup(requests.get(t).text,'html.parser').find(id='lyric-body-text')).get_text()
            lyrics.append(i)
        df = pd.DataFrame(lyrics)
        df.rename(columns={0: 'lyric'}, inplace=True)
        df.drop_duplicates(subset ='lyric',keep = 'first' , inplace = True, ignore_index=True)
        df['lyric']=(df['lyric'].str.replace('\W', ' ')).str.lower()
        return df        

In [21]:
Billie_Rogue = lyric
Billie_Rogue = Billie_Rogue.artist('https://www.lyrics.com/artist/Billie-Rogue/2137945805')

  df['lyric']=(df['lyric'].str.replace('\W', ' ')).str.lower()


In [19]:
K_Ray = lyric
K_Ray = K_Ray.artist('https://www.lyrics.com/artist/K.-Ray/2137906147')

  df['lyric']=(df['lyric'].str.replace('\W', ' ')).str.lower()


### Add a column to each DataFrame specifying artist category + Concatinate both

In [24]:
Billie_Rogue.info() , K_Ray.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lyric   4 non-null      object
dtypes: object(1)
memory usage: 160.0+ bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lyric   12 non-null     object
dtypes: object(1)
memory usage: 224.0+ bytes


(None, None)

In [66]:
frames = [Billie_Rogue, K_Ray]
df = pd.concat(frames, ignore_index=True)
df['artist'] = ['Billie_Rogue'] * 4 + ['K_Ray'] * 12

### Split the DataFrame into Train & Test

In [90]:
X_train, X_test, y_train, y_test = train_test_split(df['lyric'], df['artist'], train_size=0.8, random_state=10)

In [91]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((12,), (4,), (12,), (4,))

### Pre_Processing using TFIDF & Classification Model using Naive Bayes via Pipeline

In [123]:

steps = [('tf-idf', TfidfVectorizer(stop_words=None)),
         
          ('NB', MultinomialNB())
        ]

pipeline = Pipeline(steps) 


In [124]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('tf-idf', TfidfVectorizer()), ('NB', MultinomialNB())])

In [125]:
pipeline.score(X_test, y_test)

0.75

In [126]:
pipeline.predict(["The child Must remain"])

array(['K_Ray'], dtype='<U12')

### Using NMF as decomposition matrix algorithms for topic extraction

In [83]:
vectorizer_tf = TfidfVectorizer(stop_words='english')
Y = vectorizer_tf.fit_transform(df['lyric'])
tfidf = pd.DataFrame(Y.toarray(),columns=vectorizer_tf.get_feature_names())

In [87]:
model = NMF(n_components=16, init='random', random_state=0)
W = model.fit_transform(tfidf)
H = model.components_

In [88]:
components_df = pd.DataFrame(model.components_, columns=vectorizer_tf.get_feature_names(), index=['Billie_Rogue'] * 4 + ['K_Ray'] * 12)
components_df

Unnamed: 0,10,11,12,14,17,23,40,420,45,80,...,yard,yeah,year,years,yelling,yes,yo,younger,zimmerman,zones
Billie_Rogue,0.0,0.1362999,0.0,0.0,0.0,0.0,0.0,0.0,0.2725997,0.0,...,0.0,0.0,0.0,0.0,0.1362999,0.0,0.0819216,0.0,0.0,0.0
Billie_Rogue,0.0,0.0,0.0,0.0,0.161895,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1686966,0.0,0.0,0.0,0.0,0.1946111,0.0,0.0,0.161895
Billie_Rogue,0.0,0.0,0.0,0.0,0.0,0.779509,0.0,0.0,0.0,0.0,...,0.0,2.754297e-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Billie_Rogue,0.0,0.0,0.0,0.0,0.0,0.0,0.3815012,0.0,0.0,0.0,...,0.0,0.993819,0.0,0.3815012,0.0,0.0,0.0,0.0,0.0,0.0
K_Ray,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2022053,0.0,0.0,0.0,0.0,0.4665347,0.0,0.0,0.0
K_Ray,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.802237e-09,0.0,0.0,0.0
K_Ray,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5294918,0.396037,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K_Ray,0.171054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2673597,0.0,0.0,0.0,0.0,0.5140508,0.0,0.0,0.0
K_Ray,0.0,0.0,0.0,0.7694263,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K_Ray,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.116436,0.0,0.0,0.0,0.4489044,0.0,0.0,0.0


### To evoke 10 most higher valued words for each lyric

In [89]:
for lyric in range(components_df.shape[0]):
    word = components_df.iloc[lyric]
    print(f'For topic {lyric+1} the words with the highest value are:')
    print(word.nlargest(10))
    print('\n')

For topic 1 the words with the highest value are:
let       1.750885
ya        1.061794
just      0.797274
freak     0.593500
act       0.545199
trying    0.426077
store     0.408900
sumin     0.408900
right     0.386107
fina      0.354451
Name: Billie_Rogue, dtype: float64


For topic 2 the words with the highest value are:
mind      0.504633
kyrie     0.323791
post      0.323791
shot      0.323791
like      0.278064
time      0.271263
girl      0.236748
point     0.229307
shorty    0.229307
ain       0.208548
Name: Billie_Rogue, dtype: float64


For topic 3 the words with the highest value are:
learned      0.974386
23           0.779509
debate       0.779509
empathy      0.779509
gonna        0.779509
oughta       0.779509
changed      0.678854
everybody    0.678854
patiently    0.678854
turned       0.678854
Name: Billie_Rogue, dtype: float64


For topic 4 the words with the highest value are:
girl         1.115778
yeah         0.993819
gone         0.993819
world        0.891863
l