In [None]:
### STEP 1 - Importing needed libraries ###

import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
from collections import Counter
from collections import defaultdict
import seaborn as sns

In [None]:
### STEP 2 - Loading the CSV files extracted earlier ###

# Every file is read from a hard-coded absolute path on drive F:, so this cell only runs where that folder layout exists.
# DS_1 is split across two files: artists-data.csv and lyrics-data.csv.
df_artist = pd.read_csv("F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\Kaggle Datasets\\DS_1\\artists-data.csv")
df_lyrics = pd.read_csv("F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\Kaggle Datasets\\DS_1\\lyrics-data.csv")
# DS_2: export of the Spotify Million Song Dataset.
df_spot60k = pd.read_csv("F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\Kaggle Datasets\\DS_2\\Spotify Million Song Dataset_exported.csv")
# DS_3: song_lyrics.csv, referred to as the "genius" data in this notebook.
df_genius = pd.read_csv("F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\Kaggle Datasets\\DS_3\\song_lyrics.csv")
# Later cells address the frames by position: 0 = artist, 1 = lyrics, 2 = spot60k, 3 = genius.
df_list = [df_artist,df_lyrics,df_spot60k,df_genius] #list for easy access to dataframes

In [None]:
### STEP 3 - Basic inspections of new dataframes ###

# Print a structural summary and the first rows of every frame in df_list.
for df in df_list:
	print('Dataframe Information:')
	df.info() #info() prints its report itself and returns None, so wrapping it in print() would add a stray "None" line
	print(df.shape)
	print(df.columns)
	print('Dataframe Samples:')
	print(df.head())
	print()
	print()
	print("END OF DATASET INFORMATION AND SAMPLE")
	print()
	print()

In [None]:
### STEP 4 - Checking for missing values in dataframes ###

# Labels are listed in the same order as df_list so that each report is printed under the right name.
# The pairs are walked with zip() instead of a manual counter; the previous counter was named "slice",
# which replaced the builtin slice() for every cell executed afterwards.
df_names_list = ["df_artist","df_lyrics","df_spot60k","df_genius"]
for df_name, df in zip(df_names_list, df_list):
	print(f'Missing values in {df_name}:')
	print(df.isnull().sum()) #number of null entries per column

In [None]:
### STEP 5 - Standardizing and adjusting column names ###

# One rename map per dataset (original column name -> standardized column name).
artist_renames = {"Link":"Artist_Link"}
lyrics_renames = {
	"ALink":"Artist_Link",
	"SName":"Song_Name",
	"SLink":"Song_Link",
	"Lyric":"Lyrics",
	"language":"Language"
	}
spot60k_renames = {"song":"Song_Name","text":'Lyrics'}
genius_renames = {
	"title":"Song_Name",
	"lyrics":"Lyrics",
	"language":"Language",
	"tag":"Genre"
	}

# The renamed copies are always built from the frames loaded in STEP 2 and replace the entries of df_list.
renamed_frames = [
	("df_artist", df_artist.rename(columns=artist_renames)),
	("df_lyrics", df_lyrics.rename(columns=lyrics_renames)),
	("df_spot60k", df_spot60k.rename(columns=spot60k_renames)),
	("df_genius", df_genius.rename(columns=genius_renames)),
]
for position, (_, renamed) in enumerate(renamed_frames):
	df_list[position] = renamed

# Show the resulting column names of each dataset.
for position, (label, _) in enumerate(renamed_frames):
	print(f"New column names for {label}:")
	print(df_list[position].columns)

In [None]:
### STEP 6 - Adding genre information to spot60k dataset ###

# df_list[2] is the spot60k frame; every row receives the placeholder value "Unknown" in a new "genre" column.
df_list[2] = df_list[2].assign(genre="Unknown")

In [None]:
### STEP 7 - Dropping columns not needed for analysis being done ###

# Position in df_list -> columns removed from the frame at that position.
unneeded_columns = {
	0: ["Popularity"],
	1: ["Song_Link"],
	2: ["link","artist"],
	3: ["artist","year","views","features","id","language_cld3","language_ft"],
}
for position, column_names in unneeded_columns.items():
	df_list[position] = df_list[position].drop(columns=column_names)

In [None]:
### STEP 8 - Dropping rows with null values. ###

# Position in df_list -> columns that must be non-null for a row to be kept.
# Position 2 (spot60k) is not listed, so no rows are removed from it here.
required_columns = {
	0: ["Artist","Genres","Songs","Artist_Link"],
	1: ["Artist_Link","Song_Name","Lyrics","Language"],
	3: ["Song_Name","Language"],
}
for position, column_names in required_columns.items():
	df_list[position] = df_list[position].dropna(subset=column_names)

In [None]:
### STEP 9 - Cleaning non-music values from df_genius dataframe ###

# Keep every genius row whose "Genre" is anything other than "misc".
genius_frame = df_list[3]
is_not_misc = genius_frame["Genre"].ne("misc")
df_list[3] = genius_frame[is_not_misc]

In [None]:
### STEP 10 - Establish a new merged dataframe linking df_artist and df_lyrics ###

# Left join: every lyrics row is kept and the artist columns are attached where "Artist_Link" matches.
df_merged = df_list[1].merge(df_list[0], on="Artist_Link", how="left")
# NOTE: df_list is rebuilt with three entries here (0 = merged, 1 = spot60k, 2 = genius).
# The positions used by later cells therefore differ from STEPs 5-9, and this cell must not be run twice in one session.
df_list = [df_merged, df_list[2], df_list[3]]
df_list[0].info() #info() prints its report itself and returns None, so it is not wrapped in print()
print(df_list[0].head())
print(df_list[0].isnull().sum())
print(df_list[0].shape)
print(df_list[0]["Language"].value_counts()) #checking unique values of "Language" column

In [None]:
### STEP 11 - Cleaning up the new merged dataframe ###

# Applied in order to the merged frame (df_list[0]):
#   1. drop rows missing any of "Artist", "Genres" or "Songs"
#   2. drop the "Songs" column
#   3. keep only rows whose "Language" is "en"
df_list[0] = (
	df_list[0]
	.dropna(subset=["Artist","Genres","Songs"])
	.drop(columns=["Songs"])
	.loc[lambda frame: frame["Language"].eq("en")]
)

In [None]:
### STEP 12 - Remaining steps for cleaning and normalizing data ###

# Since STEP 10, df_list holds: 0 = merged, 1 = spot60k, 2 = genius.
merged_frame = df_list[0]
spot60k_frame = df_list[1]
genius_frame = df_list[2]

#genius: keep only the rows whose "Language" is "en"
genius_frame = genius_frame[genius_frame["Language"].eq("en")]
#spot60k: add a "Language" column holding "en" on every row, then rename "genre" to "Genre"
spot60k_frame = spot60k_frame.assign(Language="en").rename(columns={"genre":"Genre"})
#merged: drop the "Artist_Link" and "Artist" columns, then rename "Genres" to "Genre"
merged_frame = merged_frame.drop(columns=["Artist_Link","Artist"]).rename(columns={"Genres":"Genre"})

df_list[0] = merged_frame
df_list[1] = spot60k_frame
df_list[2] = genius_frame

In [None]:
### STEP 13 - Forming concatenated master dataframe from all the cleaned data and checking features of the newly formed dataframe ###

# Stack the three cleaned frames; ignore_index=True gives the result a fresh 0..n-1 index.
df_master = pd.concat([df_list[0],df_list[1],df_list[2]], ignore_index=True)
print("COLUMNS IN DF_MASTER:")
print(df_master.columns)
print("NUMBER OF NULL VALUES IN DF_MASTER:")
print(df_master.isnull().sum())
print("INFORMATION FOR DF_MASTER:")
df_master.info() #info() prints its report itself and returns None, so it is not wrapped in print()
print()
# duplicated() marks every repeat after the first occurrence, so these are counts of surplus rows.
print("Duplicates in 'Song_Name' column: ", df_master.duplicated(subset="Song_Name").sum())
print("Duplicates in 'Lyrics' column: ", df_master.duplicated(subset="Lyrics").sum())
print("Duplicates in 'Song_Name' and 'Lyrics' column: ", df_master.duplicated(subset=["Song_Name","Lyrics"]).sum())
print("VALUE COUNTS PER COLUMN OF DF_MASTER:")
print("Value counts for 'Language' column:", df_master["Language"].value_counts())
print("Value counts for 'Genre' column:", df_master["Genre"].value_counts().to_string()) #to_string() prints every genre instead of a truncated view

In [None]:
### STEP 14 - Cleaning new master dataframe ###

# Function for preprocessing lyrics for analysis
# Stop words = NLTK's English list plus a few lyric-specific filler words and apostrophe-less contractions.
stop_words = set(stopwords.words("english")).union({"im","ive","youre","youve","yeah","oh","get","gonna","aint","uh","ha","wanna","la","hey","woah","whoa","ooh","mmm"})
def preprocess_lyrics(text):
	"""Lower-case a lyrics string, strip punctuation and digits, and remove stop words.

	Parameters:
		text: raw lyrics. Anything that is not a str (e.g. NaN for missing lyrics) is returned unchanged.

	Returns:
		The remaining words joined by single spaces.

	Stop words are compared twice: once while apostrophes are still present (so entries of
	stop_words that contain an apostrophe, such as "don't", can match) and once after the
	apostrophes are removed (so apostrophe-less entries such as "im" still match "i'm").
	Stripping all punctuation before the comparison, as was done previously, turned "don't"
	into "dont", which is not in the stop word list and therefore survived the filter.
	"""
	if not isinstance(text, str):
		return text
	text = text.lower().replace("\u2019", "'") #treat the typographic apostrophe like the plain one
	text = re.sub(r"[^\w\s']", "", text) #remove punctuation but keep apostrophes for the first stop word check
	text = re.sub(r"\d+", "", text)
	words = []
	for token in text.split():
		token = token.strip("'") #quote marks around a word are not part of it
		if token in stop_words:
			continue
		token = token.replace("'", "")
		if token and token not in stop_words:
			words.append(token)
	return " ".join(words)

# Function for reducing an entry of the 'genre' column of the master dataframe to a single label
genre_list = ['rock','pop','indie','rap','folk','blues','country','metal','electronic']
def clean_genre(genre_string):
	"""Map a raw genre entry to one label from genre_list, or 'other'.

	The entry is split on ';', each part is trimmed and lower-cased, and the first part that
	is exactly equal to a label in genre_list is returned. Entries that are not strings, or
	that contain no matching part, yield 'other'.
	"""
	if not isinstance(genre_string, str):
		return 'other'
	labels = (part.strip().lower() for part in genre_string.split(';'))
	return next((label for label in labels if label in genre_list), 'other')

#cleaned lyric text is stored in a new column; the raw "Lyrics" column is left as it is
df_master["Lyrics_Clean"] = df_master["Lyrics"].apply(preprocess_lyrics)
#each "Genre" entry (possibly several labels separated by ';') is reduced to the first label found in genre_list, or 'other'
df_master["Genre_Clean"] = df_master["Genre"].apply(clean_genre)
#independent copy of the master dataframe holding only the two cleaned columns (song names are not carried over)
df_master_cln = df_master[["Lyrics_Clean","Genre_Clean"]].copy()

In [None]:
### STEP 15 Performing word frequency analysis by genre ###

# Function for performing word frequency analysis on the lyrics of a single genre
def word_frequency_by_genre(df,genre,top_n=25):
	"""Return the top_n most frequent words in the cleaned lyrics of one genre.

	Parameters:
		df: dataframe with "Genre_Clean" and "Lyrics_Clean" columns.
		genre: the "Genre_Clean" value whose rows are counted.
		top_n: how many (word, count) pairs to return (default 25).

	Returns:
		A list of (word, count) tuples, most frequent first. The list is empty when the
		genre has no rows or none of its rows holds a lyrics string.
	"""
	genre_lyrics = df.loc[df["Genre_Clean"] == genre, "Lyrics_Clean"] #lyrics of the requested genre only
	word_counts = Counter()
	for lyrics in genre_lyrics:
		#missing lyrics are NaN (a float); joining or splitting them would raise, so they are skipped
		if isinstance(lyrics, str):
			word_counts.update(lyrics.split()) #counting song by song avoids building one huge string of all lyrics
	return word_counts.most_common(top_n) #return the top_n most common words in genre

# Function that runs the word frequency analysis once per genre
def freq_by_genre(df,genre_list,top_n=25):
	"""Return a dict mapping each genre in genre_list to its word_frequency_by_genre() result.

	df and top_n are passed through unchanged to word_frequency_by_genre for every genre.
	"""
	return {
		genre_label: word_frequency_by_genre(df, genre_label, top_n)
		for genre_label in genre_list
	}

genre_list_2 = ['rock','pop','indie','rap','folk','blues','country','metal','electronic','other'] #same labels as genre_list plus "other"
freq_dict = freq_by_genre(df_master_cln,genre_list_2) #run frequency analysis on each genre in dataframe, returns top 25 results, stores results in dictionary
# One horizontal bar chart per genre, most frequent word at the top.
for genre in genre_list_2:
	genre_words = freq_dict[genre]
	if not genre_words:
		#zip(*[]) yields nothing, so unpacking it into two names would raise ValueError for a genre without any words
		print(f"No lyric data found for genre '{genre}'; skipping plot.")
		continue
	words, counts = zip(*genre_words) #split the (word, count) pairs into two parallel tuples
	plt.figure(figsize=(10,10))
	plt.barh(words,counts)
	plt.gca().invert_yaxis() #barh draws the first entry at the bottom; invert so the most frequent word is on top
	plt.xlabel("Occurences in Lyric Data")
	plt.ylabel("Top 25 Words")
	plt.title(f"Top 25 Words for {genre.capitalize()}")
	plt.tight_layout()
	plt.show()

In [None]:
### STEP 16 - Loading in dataset for EMOlex and converting its contents into a format useful for running analysis

# The lexicon file is tab separated without a header row; each line is read as (word, emotion, value).
df_emolex = pd.read_csv("F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\NRC-Emotion-Lexicon\\NRC-Emotion-Lexicon\\NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
	sep="\t", engine="python", header=None, names=["word","emotion","value"])

# word_associations maps a word to the list of emotions whose value is 1 for that word, in file order.
flagged_rows = df_emolex[df_emolex["value"]==1]
word_associations = defaultdict(list)
for lexicon_word, lexicon_emotion in zip(flagged_rows["word"], flagged_rows["emotion"]):
	word_associations[lexicon_word].append(lexicon_emotion)

In [None]:
### STEP 17 - Finding data on the emotional associations of the lyric data
############# STEP 17.1 - Preparing for the analysis

#list of emotion tags that get a score column in df_scores
emo_tag_list = ["anger","anticipation","disgust","fear","joy","negative","positive","sadness","surprise","trust"]
#"Row_ID" copies the index of the cleaned master dataframe and serves as the key that links it to df_scores
df_master_cln["Row_ID"] = df_master_cln.index
#df_scores starts as a copy of the "Row_ID" column (same rows, same index) ...
df_scores = df_master_cln[["Row_ID"]].copy()
#... and gains one column per emotion tag, every value initialized to 0
df_scores = df_scores.assign(**dict.fromkeys(emo_tag_list, 0))

In [None]:
############# STEP 17.2 - Scoring the lyrics data

def _count_emotions(lyrics):
	"""Count emotion associations for one cleaned lyrics string.

	Every word of the string that is a key of word_associations adds 1 to each emotion listed
	for it. Returns a list of counts in the order of emo_tag_list. A value that is not a
	string (e.g. NaN for missing lyrics) gets a count of 0 for every emotion.
	"""
	emotion_counter = dict.fromkeys(emo_tag_list, 0)
	if isinstance(lyrics, str):
		for word in lyrics.split():
			#.get() is used because indexing a defaultdict with an unknown word would insert that word as a new key
			for emotion in word_associations.get(word, ()):
				emotion_counter[emotion] +=1
	return [emotion_counter[tag] for tag in emo_tag_list]

def score_lyrics(row_n):
	"""Score the lyrics stored under index label row_n and write the counts into that row of df_scores."""
	df_scores.loc[row_n, emo_tag_list] = _count_emotions(df_master_cln.loc[row_n,"Lyrics_Clean"])

#All rows are scored in a single pass and written to df_scores with one assignment.
#Writing each cell separately through .loc (ten writes per row) is far slower on a large dataframe.
all_emotion_counts = [_count_emotions(lyrics) for lyrics in df_master_cln["Lyrics_Clean"]]
df_scores[emo_tag_list] = pd.DataFrame(all_emotion_counts, index=df_master_cln.index, columns=emo_tag_list, dtype="int64")

In [None]:
############# STEP 17.3 - Creating project checkpoint. Saving all created dataframes

save_path_pkl = "F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\Saved Dataframes\\pickles\\"
save_path_csv = "F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\Saved Dataframes\\CSVs\\"

#file name stem -> dataframe written under that name
frames_to_save = {
	"df_master": df_master,
	"df_master_cln": df_master_cln,
	"df_scores": df_scores,
}

#saving to .pkl
for file_stem, frame in frames_to_save.items():
	frame.to_pickle(save_path_pkl + file_stem + ".pkl")
#saving to .csv
for file_stem, frame in frames_to_save.items():
	frame.to_csv(save_path_csv + file_stem + ".csv")

In [None]:
############# STEP 17.4 - Normalizing the values across the rows in df_scores

def _min_max_by_row(scores):
	"""Min-max scale each row of a numeric dataframe: (value - row minimum) / (row maximum - row minimum).

	Rows in which all values are equal (range 0) are returned as all zeros.
	"""
	scores = scores.astype(float)
	row_min = scores.min(axis=1)
	row_range = scores.max(axis=1) - row_min
	#a range of 0 is replaced by NaN so those rows divide to NaN instead of dividing by zero; they are then set to 0
	scaled = scores.sub(row_min, axis=0).div(row_range.where(row_range != 0), axis=0)
	return scaled.fillna(0.0)

def normalize_scores(row_n):
	"""Min-max normalize the emotion columns of the row labelled row_n in df_scores_normalized, in place."""
	single_row = df_scores_normalized.loc[[row_n], emo_tag_list]
	df_scores_normalized.loc[row_n, emo_tag_list] = _min_max_by_row(single_row).iloc[0].tolist()

#The emotion columns hold integer counts in df_scores. They are cast to float in the copy because the
#normalized values are fractions, and writing fractions into integer columns is deprecated in recent pandas.
df_scores_normalized = df_scores.astype({tag: float for tag in emo_tag_list})
print("New dataframe created: 'df_scores_normalized'")
print("Data normalization initiated:")
#all rows are normalized at once; no per-row progress messages are printed
df_scores_normalized[emo_tag_list] = _min_max_by_row(df_scores[emo_tag_list])
print("Normalization complete!")
print("Printing sample ...")
print(df_scores_normalized.head(20))

df_scores_normalized.to_pickle(save_path_pkl + "df_scores_normalized.pkl")
df_scores_normalized.to_csv(save_path_csv + "df_scores_normalized.csv")
print("This data has been stored as a .pkl file at:")
print("		" + save_path_pkl)
print("This data has been stored as a .csv file at:")
print("		" + save_path_csv)

In [None]:
########## Reloading pickled dataframes ##########
# Reads back two of the frames written by STEPs 17.3 and 17.4 (same folder as save_path_pkl, written out in full here).
# Pickle files can execute code when loaded, so only files produced by this notebook should be read this way.
NormScore_reloaded = pd.read_pickle("F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\Saved Dataframes\\pickles\\df_scores_normalized.pkl")
Master_cln_reloaded = pd.read_pickle("F:\\Docs\\personal\\projects\\Sentiment Analysis Project\\Saved Dataframes\\pickles\\df_master_cln.pkl")
########## Reloading pickled dataframes ##########

In [None]:
### STEP 18 - Merging normalized scores and genre data from cleaned master data frame ###

# Only the key ("Row_ID") and the genre label are taken from the cleaned master frame.
genre_by_row = Master_cln_reloaded[["Row_ID","Genre_Clean"]]
# Left join on "Row_ID": every row of normalized scores is kept and receives its genre label.
df_final = pd.merge(NormScore_reloaded, genre_by_row, on="Row_ID", how="left")

In [None]:
### STEP 19 - Calculating the mean scores for each emotion and grouping by genre ###

# Result: one row per "Genre_Clean" value, one column per tag in emo_tag_list, each cell the mean score.
scores_grouped_by_genre = df_final.groupby("Genre_Clean")
df_heatmap = scores_grouped_by_genre[emo_tag_list].agg("mean")

In [None]:
### STEP 20 - Generating a heatmap that shows the relationship of different genres to different emotion categories visually ###

plt.figure(figsize=(12, 6))
# Rows of df_heatmap (genres) become the y axis and its columns (emotions) the x axis.
sns.heatmap( #was misspelled "heatmp", which raises AttributeError
	df_heatmap,
	cmap = "YlGnBu",
	annot = True, #write the mean value into each cell ...
	fmt = ".2f", #... formatted with two decimals
	cbar = True) #boolean flag; previously passed as the string "True"

plt.title("Average Emotional Profile of Genre")
plt.xlabel("Emotion")
plt.ylabel("Genre")
plt.tight_layout()
plt.show()