In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk import sentiment
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the merged Billboard chart spreadsheet and report how many rows it holds.
df = pd.read_excel('./billboard_merge.xlsx')
df.shape[0]

2446

In [3]:
# Display the loaded chart data (the rendered table below is truncated in the middle).
df

Unnamed: 0,순위,년도,월,제목,가수,장르,재생시간,가사
0,1,2020,11,Mood,24kGoldn Featuring iann dior,POP / 랩/힙합,02:21,[Chorus: 24kGoldn]\nWhy you always in a mood?\...
1,2,2020,11,Positions,Ariana Grande,POP / 팝,02:52,Heaven sent you to me\nI'm just hopin’ I don't...
2,3,2020,11,I Hope,Gabby Barrett Featuring Charlie Puth,POP / 블루스/포크/컨트리,03:30,"I, I hope she makes you smile\nThe way it made..."
3,4,2020,11,Laugh Now Cry Later,Drake Featuring Lil Durk,POP / 랩/힙합,04:22,"Woah, woah\nYeah\n\nSometimes we laugh\nand so..."
4,5,2020,11,Blinding Lights,The Weeknd,POP / R&B/소울,03:22,[Intro]\nYeah\n\n[Verse 1]\nI've been tryna ca...
...,...,...,...,...,...,...,...,...
2441,89,2023,10,On My Mama,Victoria Monet,POP / R&B/소울,03:07,"[Verse 1: Victoria Monét]\nWhen they say, ""She..."
2442,91,2023,10,Call Your Friends,Rod Wave,POP / 랩/힙합,02:33,"Mm, I been up for three nights\ntryna plan out..."
2443,92,2023,10,Tourniquet,Zach Bryan,POP / 블루스/포크/컨트리,03:09,There’s delays on the planes\nout of eastern M...
2444,93,2023,10,Come See Me,Rod Wave,POP / 랩/힙합,03:11,I don't know if you love me anymore\nI don't k...


In [4]:
# Work on an independent copy of the lyrics column ('가사').
# Later cells add many columns to df2; without .copy() pandas treats df2 as a
# selection from df and raises SettingWithCopyWarning on every assignment.
df2 = df[['가사']].copy()

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk



# Materialise NLTK's English stop-word list as a set; preprocess_text uses it
# for membership tests when filtering tokens.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# beforehand — no nltk.download(...) call is visible in this notebook.
stop_words = set(stopwords.words('english'))

# Normalise raw lyrics so they are easier to analyse.
def preprocess_text(text):
    """Lower-case lyrics, strip punctuation and digits, tokenize, drop stop words.

    Parameters
    ----------
    text : str
        Raw lyrics. Any non-string value (for example the NaN pandas produces
        for an empty spreadsheet cell, or None) is treated as "no lyrics".

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``''`` when the input
        is not a string.
    """
    # Guard against missing lyrics: NaN/None have no .lower() and would raise
    # AttributeError in the middle of a column-wide apply.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    # NOTE(review): this also strips apostrophes, so contractions such as
    # "don't" become "dont" and no longer match the stop-word set below —
    # confirm whether that is intended.
    text = re.sub(r'[^\w\s]', '', text)
    # Remove runs of digits.
    text = re.sub(r'\d+', '', text)
    words = word_tokenize(text)
    # Keep only tokens longer than two characters that are not stop words.
    words = [word for word in words if word not in stop_words and len(word) > 2]
    return ' '.join(words)

# Clean every song's lyrics and store the result alongside the raw text.
df2['processed_lyrics'] = df['가사'].map(preprocess_text)

# Show the first five cleaned lyrics to confirm the preprocessing worked.
df2['processed_lyrics'].head(5)

0    chorus kgoldn always mood fuckin round actin b...
1    heaven sent hopin dont repeat history boy tryn...
2    hope makes smile way made smile end phone midd...
3    woah woah yeah sometimes laugh sometimes cry g...
4    intro yeah verse ive tryna call ive long enoug...
Name: processed_lyrics, dtype: object

In [6]:
# Per-song size statistics: character, word, line and unique-word counts,
# plus lexical density.
cleaned = df2['processed_lyrics']
tokens = cleaned.str.split()  # whitespace-tokenised once, reused below

df2['#characters'] = cleaned.str.len()
df2['#words'] = tokens.str.len()
# Lines are counted on the raw lyrics, which still contain their newlines.
df2['#lines'] = df2['가사'].str.split('\n').str.len()
df2['#uniq_words'] = tokens.map(lambda words: len(set(words)))
# Lexical density = distinct words divided by total words.
df2['lexical_density'] = df2['#uniq_words'].div(df2['#words'])

In [7]:
# Preview the first rows with the newly added statistics columns.
df2.head()

Unnamed: 0,가사,processed_lyrics,#characters,#words,#lines,#uniq_words,lexical_density
0,[Chorus: 24kGoldn]\nWhy you always in a mood?\...,chorus kgoldn always mood fuckin round actin b...,1369,229,84,90,0.393013
1,Heaven sent you to me\nI'm just hopin’ I don't...,heaven sent hopin dont repeat history boy tryn...,1014,159,84,53,0.333333
2,"I, I hope she makes you smile\nThe way it made...",hope makes smile way made smile end phone midd...,1223,219,74,78,0.356164
3,"Woah, woah\nYeah\n\nSometimes we laugh\nand so...",woah woah yeah sometimes laugh sometimes cry g...,1578,281,111,144,0.512456
4,[Intro]\nYeah\n\n[Verse 1]\nI've been tryna ca...,intro yeah verse ive tryna call ive long enoug...,762,136,76,63,0.463235


In [11]:
from nltk import ngrams

In [16]:
# Create NLTK's VADER sentiment analyzer and sanity-check it on the first song.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be
# downloaded beforehand — no nltk.download(...) call is visible in this notebook.
senti_analyze = sentiment.vader.SentimentIntensityAnalyzer()
# Select the column by name rather than by position so the check does not
# depend on the current column order of df2.
senti_analyze.polarity_scores(df2['processed_lyrics'].iloc[0])

{'neg': 0.162, 'neu': 0.603, 'pos': 0.236, 'compound': 0.9349}

In [17]:
# Score every song with VADER and bucket the compound score into three classes.
# Pulling 'compound' out inside apply keeps df2's own index on the result, so
# the column assignment stays row-aligned even when df2 does not have a default
# 0..n-1 index (a DataFrame rebuilt from a list would get a fresh RangeIndex).
df2['sentiment_score'] = df2['processed_lyrics'].apply(
    lambda lyrics: senti_analyze.polarity_scores(lyrics)['compound']
)
# pd.cut bins are right-closed: score <= -0.35 -> negative,
# -0.35 < score <= 0.35 -> neutral, score > 0.35 -> positive.
df2['sentiment'] = pd.cut(
    df2['sentiment_score'],
    [-np.inf, -0.35, 0.35, np.inf],
    labels=['negative', 'neutral', 'positive'],
)

In [18]:
# Preview the first rows with the sentiment score and label columns.
df2.head()

Unnamed: 0,가사,processed_lyrics,#characters,#words,#lines,#uniq_words,lexical_density,sentiment_score,sentiment
0,[Chorus: 24kGoldn]\nWhy you always in a mood?\...,chorus kgoldn always mood fuckin round actin b...,1369,229,84,90,0.393013,0.9349,positive
1,Heaven sent you to me\nI'm just hopin’ I don't...,heaven sent hopin dont repeat history boy tryn...,1014,159,84,53,0.333333,0.9882,positive
2,"I, I hope she makes you smile\nThe way it made...",hope makes smile way made smile end phone midd...,1223,219,74,78,0.356164,0.999,positive
3,"Woah, woah\nYeah\n\nSometimes we laugh\nand so...",woah woah yeah sometimes laugh sometimes cry g...,1578,281,111,144,0.512456,-0.9705,negative
4,[Intro]\nYeah\n\n[Verse 1]\nI've been tryna ca...,intro yeah verse ive tryna call ive long enoug...,762,136,76,63,0.463235,0.8065,positive


In [20]:
#토픽모델링을 진행 과정
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics to extract; the hand-written topic labels assigned
# later in the notebook cover exactly topic ids 0..6.
n_topics = 7

# TF-IDF features over the cleaned lyrics: drop terms that occur in more than
# 95% of songs or in fewer than 2 songs, plus scikit-learn's English stop words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Sparse document-term matrix, one row per song.
tfidf = tfidf_vectorizer.fit_transform(df2['processed_lyrics'])

# random_state is fixed so the topics are reproducible between runs.
# NOTE(review): LDA is usually fitted on raw term counts (CountVectorizer)
# rather than TF-IDF weights — confirm the TF-IDF input is intentional.
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=0)

# Document-topic matrix: one row per song, one column per topic.
lda_topics = lda_model.fit_transform(tfidf)

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model
        Object exposing ``components_``: an iterable of per-topic weight
        arrays that support ``argsort()``.
    feature_names
        Sequence mapping a column index of ``components_`` to its term.
    no_top_words : int
        How many top terms to print for each topic.

    Returns
    -------
    None
        Output goes to stdout: a ``Topic <idx>:`` line followed by a line of
        space-separated terms, strongest first.
    """
    topic_idx = 0
    for weights in model.components_:
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in strongest]
        print("Topic %d:" % topic_idx)
        print(" ".join(top_terms))
        topic_idx += 1

# Print the 15 highest-weighted vocabulary terms of each fitted topic.
no_top_words = 15
display_topics(
    model=lda_model,
    feature_names=tfidf_vectorizer.get_feature_names_out(),
    no_top_words=no_top_words,
)

Topic 0:
que wasted big energy like daylight como por got bitch fancy thats gang dick ayy
Topic 1:
dont youre good know oohooh think heat yeah happier baby waves fakin night wan june
Topic 2:
like know need love woman got baby yeah stay wan dont youre bad mmm hold
Topic 3:
yeah dance like want love wan levitating youre wait woahoh night need baby dont ooh
Topic 4:
love yeah hey like god baby right night aint ooh said got thank ill yuh
Topic 5:
yeah got know like dont want youre love aint tell bitch cause shit baby ayy
Topic 6:
yeah shit know dont got like aint bitch baby life nigga love miss better right


In [21]:
# Give each song the id of its highest-weighted topic.
# The frame is built on df2's index so that the column assignment below stays
# row-aligned even when df2 does not have a default 0..n-1 index.
topic_distribution = pd.DataFrame(lda_topics, index=df2.index)
# Columns are labelled 0..n_topics-1, so idxmax returns the topic id directly.
df2['topic_num'] = topic_distribution.idxmax(axis=1)
df2.head()

Unnamed: 0,가사,processed_lyrics,#characters,#words,#lines,#uniq_words,lexical_density,sentiment_score,sentiment,topic_num
0,[Chorus: 24kGoldn]\nWhy you always in a mood?\...,chorus kgoldn always mood fuckin round actin b...,1369,229,84,90,0.393013,0.9349,positive,2
1,Heaven sent you to me\nI'm just hopin’ I don't...,heaven sent hopin dont repeat history boy tryn...,1014,159,84,53,0.333333,0.9882,positive,1
2,"I, I hope she makes you smile\nThe way it made...",hope makes smile way made smile end phone midd...,1223,219,74,78,0.356164,0.999,positive,5
3,"Woah, woah\nYeah\n\nSometimes we laugh\nand so...",woah woah yeah sometimes laugh sometimes cry g...,1578,281,111,144,0.512456,-0.9705,negative,5
4,[Intro]\nYeah\n\n[Verse 1]\nI've been tryna ca...,intro yeah verse ive tryna call ive long enoug...,762,136,76,63,0.463235,0.8065,positive,4


In [27]:
# Human-readable theme for every topic id, chosen by inspecting each topic's
# top words. The label for topic 1 no longer carries a leading space, which
# would otherwise end up in the exported data and break exact-match
# filtering or grouping on the label.
topic_labels = {
    0: '자기 자신에 대한 확신',
    1: '복잡하고 혼란스러운 감정',
    2: '사랑',
    3: '즐거움과 기쁨',
    4: '애정과 따듯함',
    5: 'Open(개방적인)',
    6: '인생',
}
# Every topic id produced by the 7-topic model has an entry above.
df2['노래주제'] = df2['topic_num'].map(topic_labels)

In [28]:
# Display df2 with the topic number and topic label columns added.
df2

Unnamed: 0,가사,processed_lyrics,#characters,#words,#lines,#uniq_words,lexical_density,sentiment_score,sentiment,topic_num,노래주제
0,[Chorus: 24kGoldn]\nWhy you always in a mood?\...,chorus kgoldn always mood fuckin round actin b...,1369,229,84,90,0.393013,0.9349,positive,2,사랑
1,Heaven sent you to me\nI'm just hopin’ I don't...,heaven sent hopin dont repeat history boy tryn...,1014,159,84,53,0.333333,0.9882,positive,1,복잡하고 혼란스러운 감정
2,"I, I hope she makes you smile\nThe way it made...",hope makes smile way made smile end phone midd...,1223,219,74,78,0.356164,0.9990,positive,5,Open(개방적인)
3,"Woah, woah\nYeah\n\nSometimes we laugh\nand so...",woah woah yeah sometimes laugh sometimes cry g...,1578,281,111,144,0.512456,-0.9705,negative,5,Open(개방적인)
4,[Intro]\nYeah\n\n[Verse 1]\nI've been tryna ca...,intro yeah verse ive tryna call ive long enoug...,762,136,76,63,0.463235,0.8065,positive,4,애정과 따듯함
...,...,...,...,...,...,...,...,...,...,...,...
2441,"[Verse 1: Victoria Monét]\nWhen they say, ""She...",verse victoria monét say get mama ima say fuck...,1503,277,77,90,0.324910,0.9977,positive,5,Open(개방적인)
2442,"Mm, I been up for three nights\ntryna plan out...",three nights tryna plan life livin wrong wan r...,984,166,49,116,0.698795,-0.8299,negative,5,Open(개방적인)
2443,There’s delays on the planes\nout of eastern M...,theres delays planes eastern montana told leav...,860,139,57,83,0.597122,0.7202,positive,2,사랑
2444,I don't know if you love me anymore\nI don't k...,dont know love anymore dont know love like tel...,918,159,61,93,0.584906,-0.9889,negative,6,인생


In [29]:
# Copy the derived lyric features from df2 back onto the full chart table,
# renaming each one to its output column name.
column_names = {
    'processed_lyrics': '가사 전처리',
    '#characters': '가사 글자',
    '#words': '가사 단어',
    '#lines': '가사 줄',
    '#uniq_words': '가사 고유 단어',
    'lexical_density': '어휘 밀도',
    'sentiment_score': '가사 감정 점수',
    'sentiment': '가사 감정',
    'topic_num': 'topic 번호',
    '노래주제': '노래주제',
}
for source_col, target_col in column_names.items():
    df[target_col] = df2[source_col]


In [30]:
# Display the chart table with all derived lyric features attached.
df

Unnamed: 0,순위,년도,월,제목,가수,장르,재생시간,가사,가사 전처리,가사 글자,가사 단어,가사 줄,가사 고유 단어,어휘 밀도,가사 감정 점수,가사 감정,topic 번호,노래주제
0,1,2020,11,Mood,24kGoldn Featuring iann dior,POP / 랩/힙합,02:21,[Chorus: 24kGoldn]\nWhy you always in a mood?\...,chorus kgoldn always mood fuckin round actin b...,1369,229,84,90,0.393013,0.9349,positive,2,사랑
1,2,2020,11,Positions,Ariana Grande,POP / 팝,02:52,Heaven sent you to me\nI'm just hopin’ I don't...,heaven sent hopin dont repeat history boy tryn...,1014,159,84,53,0.333333,0.9882,positive,1,복잡하고 혼란스러운 감정
2,3,2020,11,I Hope,Gabby Barrett Featuring Charlie Puth,POP / 블루스/포크/컨트리,03:30,"I, I hope she makes you smile\nThe way it made...",hope makes smile way made smile end phone midd...,1223,219,74,78,0.356164,0.9990,positive,5,Open(개방적인)
3,4,2020,11,Laugh Now Cry Later,Drake Featuring Lil Durk,POP / 랩/힙합,04:22,"Woah, woah\nYeah\n\nSometimes we laugh\nand so...",woah woah yeah sometimes laugh sometimes cry g...,1578,281,111,144,0.512456,-0.9705,negative,5,Open(개방적인)
4,5,2020,11,Blinding Lights,The Weeknd,POP / R&B/소울,03:22,[Intro]\nYeah\n\n[Verse 1]\nI've been tryna ca...,intro yeah verse ive tryna call ive long enoug...,762,136,76,63,0.463235,0.8065,positive,4,애정과 따듯함
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2441,89,2023,10,On My Mama,Victoria Monet,POP / R&B/소울,03:07,"[Verse 1: Victoria Monét]\nWhen they say, ""She...",verse victoria monét say get mama ima say fuck...,1503,277,77,90,0.324910,0.9977,positive,5,Open(개방적인)
2442,91,2023,10,Call Your Friends,Rod Wave,POP / 랩/힙합,02:33,"Mm, I been up for three nights\ntryna plan out...",three nights tryna plan life livin wrong wan r...,984,166,49,116,0.698795,-0.8299,negative,5,Open(개방적인)
2443,92,2023,10,Tourniquet,Zach Bryan,POP / 블루스/포크/컨트리,03:09,There’s delays on the planes\nout of eastern M...,theres delays planes eastern montana told leav...,860,139,57,83,0.597122,0.7202,positive,2,사랑
2444,93,2023,10,Come See Me,Rod Wave,POP / 랩/힙합,03:11,I don't know if you love me anymore\nI don't k...,dont know love anymore dont know love like tel...,918,159,61,93,0.584906,-0.9889,negative,6,인생


In [31]:
# Write the enriched table to Excel, leaving out the DataFrame index column.
df.to_excel('./billboard_result.xlsx', index=False)