In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

### Define a class that scrapes each artist's lyrics from a URL, pre-processes them and builds a DataFrame

In [5]:
class lyric:
    """Scrape an artist's song lyrics from lyrics.com into a pandas DataFrame.

    The class is only a namespace; call ``lyric.artist(url)`` directly.
    """

    @staticmethod
    def artist(url):
        """Collect every lyric linked from an artist page.

        Parameters
        ----------
        url : str
            Address of a lyrics.com artist page containing a ``<table class="tdata">``
            whose anchors point at the individual song pages.

        Returns
        -------
        pandas.DataFrame
            A single ``'lyric'`` column with one row per distinct lyric text, in
            page order, where every non-word character has been replaced by a space.

        Raises
        ------
        ValueError
            If the page has no ``tdata`` table to read song links from.
        """
        base_url = 'https://www.lyrics.com'
        # Only links starting with one of these paths lead to a lyric page.
        lyric_prefixes = ('/sublyric/', '/lyric-lf/')

        def fetch(page_url):
            # A timeout keeps a stalled connection from hanging the notebook forever.
            return BeautifulSoup(requests.get(page_url, timeout=30).text, 'html.parser')

        table = fetch(url).find('table', {'class': 'tdata'})
        if table is None:
            raise ValueError(f'No song table (class "tdata") found at {url}')

        # Anchors without an href yield None and are skipped. str.startswith with
        # a tuple is true when ANY prefix matches (the previous list-literal test
        # was always truthy and therefore let every link through).
        hrefs = (anchor.get('href') for anchor in table.find_all('a'))
        lyric_links = list(dict.fromkeys(
            base_url + href
            for href in hrefs
            if href and href.startswith(lyric_prefixes)
        ))  # dict.fromkeys drops repeated links while preserving order

        lyrics = []
        for link in lyric_links:
            body = fetch(link).find(id='lyric-body-text')
            # Pages without a lyric body are skipped instead of crashing the run.
            if body is not None:
                lyrics.append(body.get_text())

        # Naming the column up front (with an explicit dtype) keeps the frame
        # well-formed even when no lyrics were found.
        df = pd.DataFrame({'lyric': pd.Series(lyrics, dtype='object')})
        df = df.drop_duplicates(subset='lyric', keep='first', ignore_index=True)
        # regex=True is required: pandas >= 2.0 treats the pattern literally by default.
        df['lyric'] = df['lyric'].str.replace(r'\W', ' ', regex=True)
        return df
        

In [6]:
# Scrape every Billie Rogue lyric into a one-column ('lyric') DataFrame.
Billie_Rogue = lyric.artist(
    'https://www.lyrics.com/artist/Billie-Rogue/2137945805'
)

  df['lyric']=df['lyric'].str.replace('\W', ' ')


In [7]:
# Scrape every K. Ray lyric into a one-column ('lyric') DataFrame.
K_Ray = lyric.artist(
    'https://www.lyrics.com/artist/K.-Ray/2137906147'
)

  df['lyric']=df['lyric'].str.replace('\W', ' ')


### Add a column to each DataFrame specifying the artist category, then concatenate both

In [8]:
# Tag each artist's rows with a numeric label: 0 = Billie Rogue, 1 = K. Ray.
Billie_Rogue = Billie_Rogue.assign(Artist=0)
K_Ray = K_Ray.assign(Artist=1)

In [9]:
# Stack both artists' rows into one corpus, renumbering the index from 0.
frames = [Billie_Rogue, K_Ray]
df = pd.concat(objs=frames, axis='index', ignore_index=True)

### Using TF-IDF to preprocess, tokenise and vectorise the lyrics (no lemmatisation is applied)

In [10]:
# Turn the lyrics into a TF-IDF document-term matrix, dropping English stop words.
vectorizer_tf = TfidfVectorizer(stop_words='english')
Y = vectorizer_tf.fit_transform(df['lyric'])

# Dense, labelled copy of the matrix: one row per lyric, one column per term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
df_tfidf = pd.DataFrame(Y.toarray(), columns=vectorizer_tf.get_feature_names_out())

### Using NMF as the matrix decomposition algorithm for topic extraction

In [11]:
# Factorise the TF-IDF matrix into 15 components; the fixed random_state makes
# the random initialisation repeatable between runs.
model = NMF(
    n_components=15,
    init='random',
    random_state=0,
)
W = model.fit_transform(df_tfidf)  # transformed data returned by the fit
H = model.components_              # the fitted factorisation components

In [12]:
# Label the NMF components with the vocabulary terms: one row per component,
# one column per term. get_feature_names() was removed in scikit-learn 1.2, so
# the supported get_feature_names_out() is used instead.
components_df = pd.DataFrame(
    model.components_,
    columns=vectorizer_tf.get_feature_names_out(),
)
components_df

Unnamed: 0,10,11,12,14,17,23,40,420,45,80,...,yard,yeah,year,years,yelling,yes,yo,younger,zimmerman,zones
0,0.0,0.0,0.0,0.0,0.0,0.0,0.004616,0.0,0.0,0.0,...,0.0,0.158182,0.0,0.004616,0.0,0.0,0.0,0.2942,0.0,0.0
1,0.0,0.0,0.0,0.0,4.3e-05,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.2e-05,0.0,0.0,4.3e-05
2,0.0,0.0,0.0,0.0,0.0,0.0,0.010523,0.182415,0.0,0.0,...,0.0,1.251603,0.142029,0.010523,0.0,0.0,0.203572,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272621,...,0.272621,0.284074,0.0,0.0,0.0,0.545242,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0022,0.0,0.0,0.0,...,0.0,0.20475,0.0,0.0022,0.0,0.0,0.463966,0.0,0.0,0.0
5,1.420955e-08,0.0,0.0,0.0,0.0,1.733864,0.009198,0.0,0.0,0.0,...,0.0,0.009462,0.0,0.009198,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.549023,0.0,0.332174,0.0,0.0,0.0,...,0.0,1.311414,0.0,0.332174,0.0,0.0,0.61688,0.0,0.0,0.549023
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.231658,0.0,0.0,0.0,0.893385,0.0,0.0,0.0
8,1.365548e-08,0.0,0.0,0.316811,0.0,0.0,0.001805,0.0,0.0,0.0,...,0.0,0.001857,0.0,0.001805,0.0,0.0,0.0,0.0,0.0,0.0
9,0.1561116,1.081543e-08,0.0,0.0,0.003012,0.0,0.0,0.0,2.163086e-08,0.0,...,0.0,0.240929,0.0,0.0,1.081543e-08,0.0,0.473035,0.0,0.0,0.003012


### Show the 10 highest-valued words for each topic

In [16]:
# Print the ten highest-valued vocabulary terms of every row of components_df.
# The loop variable is deliberately not called `lyric`: that name is the scraper
# class defined earlier in this notebook, and rebinding it to an int would break
# any later `lyric.artist(...)` call in the same kernel.
for topic_idx in range(components_df.shape[0]):
    topic_weights = components_df.iloc[topic_idx]
    print(f'For topic {topic_idx+1} the words with the highest value are:')
    print(topic_weights.nlargest(10))
    print('\n')

For topic 1 the words with the highest value are:
trust       1.375492
head        1.375253
tired       1.176800
trying      1.077291
need        0.718750
ain         0.628550
humans      0.588400
sex         0.588400
trusting    0.588400
girl        0.579017
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
skies          1.017033
belong         0.726452
let            0.649190
closer         0.339656
fly            0.339630
rise           0.290581
silverlight    0.290581
step           0.290581
sunburn        0.290581
born           0.253059
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
smoke      2.736226
want       1.431980
yeah       1.251603
got        0.638203
bro        0.509955
gone       0.496246
gas        0.476448
blowing    0.475860
good       0.473133
words      0.425803
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
order       1.899348
tryna       1.899348
drive       1.635726
looking  

### Test with sample

In [133]:
new_sample = """We shooting dice in the section where the Bacardi at
Now the clerk mad at us yelling hurry up and buy """

# Vectorise the unseen text with the already-fitted TF-IDF vocabulary.
X_new = vectorizer_tf.transform([new_sample])

# The NMF model was fitted on the labelled `df_tfidf` DataFrame, so hand it the
# sample in the same dense, labelled layout; passing the bare sparse matrix makes
# scikit-learn >= 1.0 warn that the input lacks the fitted feature names.
X_new_df = pd.DataFrame(X_new.toarray(), columns=df_tfidf.columns)

# Project the sample onto the fitted NMF components.
nmf_features = model.transform(X_new_df)

pd.DataFrame(nmf_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.0,0.0,0.005985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038132,0.0
