In [None]:
from io import StringIO

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch only the NLTK resources this notebook uses: the stop-word list, the
# tokenizer models behind word_tokenize ("punkt" on older NLTK releases,
# "punkt_tab" on newer ones) and the VADER lexicon used by
# SentimentIntensityAnalyzer. A bare nltk.download() opens the interactive
# downloader, which blocks "Restart Kernel -> Run All".
for resource in ("stopwords", "punkt", "punkt_tab", "vader_lexicon"):
    nltk.download(resource)

In [None]:
# loads lyrics data from json file
# The `with` block closes the file even if read() raises; the encoding is
# pinned to UTF-8 (the JSON default) instead of the platform locale.
with open("lyrics.json", encoding="utf-8") as file:
    raw_text = file.read()

# Assumes the file holds several JSON objects, one after another, separated by
# a newline. Replacing every "}\n{" boundary with "," merges them into a
# single JSON object.
lst = raw_text.split("}\n{")
new_raw_text = ",".join(lst)

# Passing a literal JSON string to read_json is deprecated, so wrap it in a
# file-like StringIO object.
df = pd.read_json(StringIO(new_raw_text), orient="index")
df["key"] = df.index  # Create a new column "key" and assign the current index values
# Split "key" on the first "_" only into "artist" and "name". `n` has to be
# passed by keyword: pandas >= 2.0 rejects it as a positional argument.
df[["artist", "name"]] = df["key"].str.split("_", n=1, expand=True)
column_names = list(df.columns)
column_names[0] = "lyrics"  # the first column holds the value stored under each key
df.columns = column_names
df = df.reset_index(drop=True)
df = df[["artist", "name", "lyrics"]]

# declutters the lyrics
# - removes everything on a line up to and including "Contributors"/"Contributor"
# - removes text between square brackets
# - removes every character that is not a letter, digit, whitespace, "!" or "?"
# - converts all characters to lowercase
df["decluttered_lyrics"] = (
    df["lyrics"]
    .str.replace(r".*Contributors", "", regex=True)
    .str.replace(r".*Contributor", "", regex=True)
    .str.replace(r"\[.*?\]", "", regex=True)
    # regex=True is required here: pandas >= 2.0 defaults to regex=False and
    # would search for the pattern as a literal string, removing nothing.
    .str.replace(r"[^A-Za-z0-9\s!?]", "", regex=True)
    .str.lower()
)
# Show a preview rather than the whole frame.
df.head()

In [None]:
# tokenize words
# Strip everything except lowercase letters and whitespace, then tokenize.
# regex=True is required: pandas >= 2.0 defaults to regex=False and would
# treat the character class as a literal string, leaving the text unchanged.
df["tokens"] = (
    df["decluttered_lyrics"]
    .str.replace(r"[^a-z\s]", "", regex=True)
    .apply(word_tokenize)
)

In [None]:
# remove stopwords
_ENGLISH_STOP_WORDS = None  # filled on first use and reused by later calls


def remove_stopwords(tokens, stop_words=None):
    """Return the tokens that are not stop words, in their original order.

    Args:
        tokens: iterable of string tokens.
        stop_words: optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        A list with every token whose lowercase form is not in ``stop_words``.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            # This function is applied once per row; build the default set a
            # single time instead of rebuilding it on every call.
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOP_WORDS
    return [word for word in tokens if word.lower() not in stop_words]

# Run every row's token list through remove_stopwords.
df["filtered_tokens"] = df["tokens"].map(remove_stopwords)

In [None]:
# join each row's filtered tokens back into one space-separated string
df["filtered_text"] = df["filtered_tokens"].apply(" ".join)

In [None]:
# sentiment analysis
sia = SentimentIntensityAnalyzer()
# Build the score columns in one step from the per-row score dicts. Expanding
# them with .apply(pd.Series) constructs a Series for every row, which is slow.
sentiment_scores = pd.DataFrame(
    [sia.polarity_scores(text) for text in df["filtered_text"]],
    index=df.index,
)
# Drop score columns left behind by an earlier run of this cell so that
# re-running it replaces them instead of appending duplicate columns.
df = pd.concat(
    [df.drop(columns=sentiment_scores.columns, errors="ignore"), sentiment_scores],
    axis=1,
)

In [None]:
# tfidf analysis
# Fit the vectorizer on the filtered text, convert the resulting matrix to a
# dense array and store one row of it per DataFrame row.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["filtered_text"])
df["tfidf"] = list(tfidf_matrix.toarray())

In [None]:
# Preview the final frame; the "tfidf" column holds one dense array per row,
# so showing only the first rows keeps the output readable.
df.head()