<h1 align="center">Sentiment Analysis of Music Lyrics Using NLP Techniques</h1>

## 1. Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud 

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

import warnings
warnings.filterwarnings('ignore')

## 2. Load Dataset

In [2]:
songs_df = pd.read_csv('../datasets/musicoset_metadata/songs.csv', sep='\t')
lyrics_df = pd.read_csv('../datasets/musicoset_songfeatures/lyrics.csv', sep='\t')

## 3. Dataset Preview

In [3]:
songs_df.head()

Unnamed: 0,song_id,song_name,billboard,artists,popularity,explicit,song_type
0,3e9HZxeyfWwjeyPAMmWSSQ,"thank u, next","('Thank U, Next', 'Ariana Grande')",{'66CXWjxzNUsdJxJ2JdwvnR': 'Ariana Grande'},86,True,Solo
1,5p7ujcrUXASCNwRaWNHR1C,Without Me,"('Without Me', 'Halsey')",{'26VFTg2z8YR0cCuwLzESi2': 'Halsey'},87,True,Solo
2,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,"('Sicko Mode', 'Travis Scott')",{'0Y5tJX1MQlPlqiwlOH1tJY': 'Travis Scott'},85,True,Solo
3,3KkXRkHbMCARz0aVfEt68P,Sunflower - Spider-Man: Into the Spider-Verse,('Sunflower (Spider-Man: Into The Spider-Verse...,"{'246dkjvS1zLTtiykXe5h60': 'Post Malone', '1zN...",92,False,Collaboration
4,1rqqCSm0Qe4I9rUvWncaom,High Hopes,"('High Hopes', 'Panic! At The Disco')",{'20JZFwl6HVl6yg8a4H3ZqK': 'Panic! At The Disco'},86,False,Solo


In [4]:
lyrics_df.head()

Unnamed: 0,song_id,lyrics
0,3e9HZxeyfWwjeyPAMmWSSQ,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
3,3KkXRkHbMCARz0aVfEt68P,
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."


In [5]:
print(f"Shape of songs dataset: {songs_df.shape}")
print(f"Shape of lyrics dataset: {lyrics_df.shape}")

Shape of songs dataset: (20405, 7)
Shape of lyrics dataset: (20404, 2)


In [6]:
songs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20405 entries, 0 to 20404
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   song_id     20405 non-null  object
 1   song_name   20405 non-null  object
 2   billboard   20405 non-null  object
 3   artists     20405 non-null  object
 4   popularity  20405 non-null  int64 
 5   explicit    20405 non-null  bool  
 6   song_type   20405 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 976.5+ KB


In [7]:
lyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20404 entries, 0 to 20403
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   song_id  20404 non-null  object
 1   lyrics   19663 non-null  object
dtypes: object(2)
memory usage: 318.9+ KB


## 4. Data Wrangling

### 4.1. Merge songs with lyrics

In [8]:
merged_df = pd.merge(songs_df, lyrics_df, on='song_id')

In [9]:
merged_df.shape

(20404, 8)

### 4.2. Drop all musics without lyrics

In [10]:
merged_df = merged_df.dropna()

### 4.3. Format artist names

In [11]:
def clean_artist(artist):
    # Split the string by ':'
    parts = artist.split(':')
    if len(parts) > 1:
        # Take the second part, strip any leading/trailing whitespace, and slice from the second to the second-last character
        return parts[1].strip()[1:-2]
    else:
        # Return the original string if the split did not work as expected
        return artist.strip()

In [12]:
merged_df['artists'] = merged_df['artists'].apply(clean_artist)

In [13]:
merged_df.head()

Unnamed: 0,song_id,song_name,billboard,artists,popularity,explicit,song_type,lyrics
0,3e9HZxeyfWwjeyPAMmWSSQ,"thank u, next","('Thank U, Next', 'Ariana Grande')",Ariana Grande,86,True,Solo,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,Without Me,"('Without Me', 'Halsey')",Halsey,87,True,Solo,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,"('Sicko Mode', 'Travis Scott')",Travis Scott,85,True,Solo,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
4,1rqqCSm0Qe4I9rUvWncaom,High Hopes,"('High Hopes', 'Panic! At The Disco')",Panic! At The Disco,86,False,Solo,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."
5,0bYg9bo50gSsH3LtXe2SQn,All I Want for Christmas Is You,"('All I Want For Christmas Is You', 'Mariah Ca...",Mariah Carey,63,False,Solo,"[""[Intro]\nI-I-I don't want a lot for Christma..."


### 4.4. Format song lyrics

In [14]:
merged_df['lyrics'].values[0]

'[\'[Verse 1]\\nThought I\\\'d end up with Sean\\nBut he wasn\\\'t a match\\nWrote some songs about Ricky\\nNow I listen and laugh\\nEven almost got married\\nAnd for Pete, I\\\'m so thankful\\nWish I could say, "Thank you" to Malcolm\\n\\\'Cause he was an angel\\n\\n[Pre-Chorus]\\nOne taught me love\\nOne taught me patience\\nAnd one taught me pain\\nNow, I\\\'m so amazing\\nSay I\\\'ve loved and I\\\'ve lost\\nBut that\\\'s not what I see\\nSo, look what I got\\nLook what you taught me\\nAnd for that, I say\\n\\n[Chorus]\\nThank you, next (Next)\\nThank you, next (Next)\\nThank you, next\\nI\\\'m so fuckin\\\' grateful for my ex\\nThank you, next (Next)\\nThank you, next (Next)\\nThank you, next (Next)\\nI\\\'m so fuckin\\\'—\\n\\n[Verse 2]\\nSpend more time with my friends\\nI ain\\\'t worried \\\'bout nothin\\\'\\nPlus, I met someone else\\nWe havin\\\' better discussions\\nI know they say I move on too fast\\nBut this one gon\\\' last\\n\\\'Cause her name is Ari\\nAnd I\\\'m so go

In [15]:
stop_words = stopwords.words('english')

print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
# Extend stopwords
stop_words.extend([
    'verse', 'chorus', 'i"ll', 'intro', 'outro', 'or', 'm', 'ma', 'ours', 'against', 'nor',
    'wasn', 'hasn', 'my', 'had', 'didn', 'isn', 'did', 'aren', 'those', 'than', 
    "mustn't", "you've", 'to', 'she', 'having', "haven't", 'into', 't', 'll', 
    'himself', 'do', "that'll", 'so', 'of', 'on', 'very', 'for', 'out', 'were', 
    'should', 'they', 'ain', "should've", 'you', "didn't", 'yours', 'was', 'our',
     'can', 'myself', "shouldn't", 'have', 'up', 'mightn', "you'll", 'any', 
    'itself', 'hadn', 'him', 'doesn', 'weren', 'y', 'being', "don't", 'them', 
    'are','and', 'that', 'your', 'yourself', 'their', 'some', 'ourselves', 've', 
    'doing', 'been', 'shouldn', 'yourselves', "mightn't", 'most', 'because',
     'few', 'wouldn', "you'd", 'through', "you're", 'themselves', 'an', 'if',
     "wouldn't", 'its', 'other', "won't", "wasn't", "she's", 'we', 'shan',
     "weren't",'don',"hadn't", 'this', 'off', 'while', 'a', 'haven', 'her', 
    'theirs', 'all', "hasn't", "doesn't", 'about', 'then', 'by','such', 'but', 
    'until', 'each', 'there', "aren't", 'with', 'not', "shan't", 'hers', 'it', 
    'too', 'i', 'at', 'is', 'as', 'me', 'herself', 's', 'the', 'where', 'am', 
    'has', 'over', "couldn't", 'when', 'does', 'mustn','re', 'no', 'in', 'who', 
    'd', 'own', 'he', 'be', "isn't", 'his', 'these', 'same', 'whom', 'will', 
    'needn','couldn', 'from',  "it's", 'o',
])

stop_words = set(stop_words)

In [17]:
# Pre-compile the regex pattern for removing escaped new line characters
newline_pattern = re.compile(r'\\n')

# Initialize the lemmatizer and stop words once
lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))

def preprocess_lyrics(lyrics):
    # Remove escaped new line character
    lyrics = newline_pattern.sub('', lyrics)
    
    # Tokenization
    tokens = word_tokenize(lyrics)
    
    # Process tokens: remove punctuation, convert to lowercase, remove stopwords, and lemmatize
    processed_tokens = [
        lemmatizer.lemmatize(word.lower())
        for word in tokens if word.isalnum() and word.lower() not in stop_words
    ]
    
    return ' '.join(processed_tokens)

In [18]:
merged_df['cleaned_lyrics'] = merged_df['lyrics'].apply(preprocess_lyrics)

In [19]:
merged_df['cleaned_lyrics']

0        1 thought end seanbut matchwrote song rickynow...
1        1 found heart brokei filled cup overflowedtook...
2        part drake astro yeahsun already know heremy d...
4        high high hope high high hope livingshooting s...
5        want lot christmasthere one thing needi care p...
                               ...                        
20398    ten little indiansstanding lineone stood looki...
20399    surein world constantly changing surewhere sta...
20400    1 billy wood used clown kid likescared money m...
20401    well better run run saysomebody calling youyou...
20402    round 1 saurus people salty thinking battle in...
Name: cleaned_lyrics, Length: 19663, dtype: object