In [1]:
import nltk
nltk.download('stopwords')

  from scipy.stats import fisher_exact
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jackmetzger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import re
import spacy
from nltk.corpus import stopwords



In [3]:
df = pd.read_csv('Week_3/week_3_final_dataset.csv')

Removing the Live and Time columns as we no longer want to use it

In [4]:
df = df.drop(columns=['Live', 'Time'])

Slightly altered the week 3 eda notebook to re-catagorize subgenres that had fewer than 15 instances, now only 42 total subgenres exist

In [5]:
subgenre_counts = df['Subgenre'].value_counts()
subgenre_counts

Subgenre
hip hop              272
indie rock           207
indie pop            203
pop 80s              168
alternative rock     163
rock 70s             148
rock 80s             144
country              130
pop                  124
rock                 119
electronic            88
rnb                   86
pop rock              75
folk                  67
country pop           63
indie                 62
alternative           59
rock 60s              54
alternative rnb       51
dance                 49
indie folk            48
dance pop             46
pop 70s               46
pop dance             46
folk rock             45
soul                  45
rock alternative      43
hip hop rnb           36
soul 70s              35
rock 90s              31
electronic pop        28
pop 90s               27
electronic dance      27
soul rnb              23
pop 60s               22
indie alternative     19
pop synthpop          19
hip hop 90s           19
pop rnb               19
soul 80s        

Preparing the data for tokenization by cleaning the lyrics, also removing stop words and lemanizing the lyrics

In [6]:
# Load spaCy model and stopwords
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stop_words = set(stopwords.words("english"))

def clean_lyrics(text):
    if pd.isnull(text):
        return ""

    # Lowercase
    text = text.lower()

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Remove [annotations like chorus/verse]
    text = re.sub(r"\[.*?\]", "", text)

    # Remove punctuation (except apostrophes)
    text = re.sub(r"[^a-z0-9'\s]", "", text)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize with spaCy
    doc = nlp(text)

    # Lemmatize and remove stopwords
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and token.lemma_ not in stop_words]

    return " ".join(tokens)


In [7]:
df['Clean_Lyrics'] = df['Lyrics'].apply(clean_lyrics)

In [8]:
df = df.drop(columns='Lyrics')

In [9]:
df

Unnamed: 0,Song,Artist,Popularity,BPM,Dance,Energy,Acoustic,Happy,Loud,Camelot,Genre,Subgenre,Clean_Lyrics
0,I'm So Excited,The Pointer Sisters,65,92,69,86,10,69,-6,4B,pop,pop 80s,tonight 's night go make happen tonight put th...
1,Cheri Cheri Lady,Modern Talking,82,114,68,62,46,85,-14,6B,pop,pop 80s,oh explain every time oh feel real take heart ...
2,Give It Up,KC & The Sunshine Band,58,126,84,65,8,84,-12,5B,pop,pop,everybody want everybody want love would like ...
3,It's Raining Men - Single Version,The Weather Girls,50,136,66,93,46,46,-6,4A,pop,pop,hi hi weather girl uh huh get news well listen...
4,Take on Me,a-ha,89,84,57,90,2,88,-8,11A,pop,pop 80s,talk away know say say anyway today another da...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3016,Play That Funky Music,Wild Cherry,76,109,81,67,4,93,-12,11B,soul,soul disco,ahey huh yeah hey heyah boogie singer playing ...
3017,Rock with You - Single Version,Michael Jackson,82,114,81,54,18,85,-13,3B,pop,pop 80s,girl close eye let rhythm get try fight nothin...
3018,You Sexy Thing,Hot Chocolate,75,106,79,73,52,96,-5,7B,soul,soul disco,believe miracle sexy thing sexy thing believe ...
3019,Get It On,T. Rex,72,127,73,88,18,91,-7,10A,rock,rock 70s,well dirty sweet clad black look back love dir...


Split the data before feature normalization to prevent data leakage. 

In [10]:
# Split the data
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

Normalizing the numeric features on a 0 to 1 scale. Perform the normalization on the training data first. Then use the scalers from the training transformation to normalize the test and validation sets. This will prevent data leakage and prevent the model from over performing. 

In [11]:
# Select only numeric columns
numeric_cols = train_df.select_dtypes(include='number').columns

# Create a scaler and fit-transform the numeric columns
scaler = MinMaxScaler()
train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])

In [12]:

# Calculate the scaled means
scaled_means = train_df[numeric_cols].mean()

# Create a scaler summary DataFrame
scaler_summary = pd.DataFrame({
    'feature': numeric_cols,
    'original_min': scaler.data_min_,
    'original_max': scaler.data_max_,
    'scaler': scaler.scale_,          # 1 / (max - min)
    'scaled_mean': scaled_means.values
})

# Display
scaler_summary.set_index('feature', inplace=True)
display(scaler_summary)



Unnamed: 0_level_0,original_min,original_max,scaler,scaled_mean
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Popularity,20.0,100.0,0.0125,0.588476
BPM,49.0,219.0,0.005882,0.424823
Dance,11.0,96.0,0.011765,0.565006
Energy,5.0,100.0,0.010526,0.624075
Acoustic,0.0,99.0,0.010101,0.235639
Happy,3.0,98.0,0.010526,0.525564
Loud,-26.0,-1.0,0.04,0.731693


In [13]:
# Apply the same transformation to validation and test sets
validation_df[numeric_cols] = scaler.transform(validation_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

In [14]:
# Save to CSV
train_df.to_csv("Week_4/week_4_train_set.csv", index=False)
test_df.to_csv("Week_4/week_4_test_set.csv", index=False)
validation_df.to_csv("Week_4/week_4_validation_set.csv", index=False)