# Data Collection and Data Preprocessing 

In this Jupyter notebook, we will discuss the data collection and preprocessing code, along with the steps that have been implemented.

## Code to Access the Genius API and Fetch Song Lyrics

In [None]:
import os
import time

import lyricsgenius
import pandas as pd

# Path to the input CSV; the fetch loop further down reads its "song" column.
file_path = "Your File Path "
df = pd.read_csv(file_path)

# Prefer the GENIUS_ACCESS_TOKEN environment variable so a real token never has
# to be saved inside the notebook. The placeholder string is only a fallback
# for quick local runs and must be replaced before it can work.
GENIUS_ACCESS_TOKEN = os.environ.get(
    "GENIUS_ACCESS_TOKEN", "Your Access token from genius "
)
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)


# Every row starts without lyrics; the column is filled in one row at a time.
df["lyrics"] = None

def fetch_lyrics(song_name):
    """Search Genius for a song title and return the lyrics of the hit.

    Args:
        song_name: Title to search for. A missing value (``None``/NaN) or a
            blank string is skipped without contacting the API.

    Returns:
        The lyrics text of the search result, or ``None`` when the title is
        missing or blank, nothing was found, or the lookup raised.
    """
    # Do not query the API for a missing or blank title: there is nothing
    # meaningful to search for, and a NaN placeholder is not a real song name.
    if pd.isna(song_name):
        return None
    if isinstance(song_name, str) and not song_name.strip():
        return None

    try:
        song = genius.search_song(song_name)
        if song:
            return song.lyrics
    except Exception as e:
        # Best effort: a single failed lookup is reported and then skipped.
        print(f"⚠️ Error fetching lyrics for {song_name}: {e}")
    return None

# Walk the table row by row and store whatever fetch_lyrics returns per title.
for row_label, record in df.iterrows():
    song_name = record["song"]
    print(f"🎵 Fetching lyrics for: {song_name}...")

    fetched = fetch_lyrics(song_name)
    df.at[row_label, "lyrics"] = fetched

    # Pause for one second between consecutive lookups.
    time.sleep(1)

# Write the table, now including the "lyrics" column, to a UTF-8 CSV.
output_file = "songs_with_lyrics.csv"
df.to_csv(output_file, encoding="utf-8", index=False)

print(f"✅ Lyrics saved to `{output_file}`!")

## Code to Merge All Datasets, Perform Sentiment Analysis, and Drop Null and Duplicate Values

In [18]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from scipy import stats

# The VADER analyzer created at the end of this block needs its lexicon.
nltk.download("vader_lexicon")

# Load the two Spotify audio-feature exports and the two lyric sources.
spotify_2018 = pd.read_csv("SpotifyAudioFeaturesNov2018.csv")
spotify_2019 = pd.read_csv("SpotifyAudioFeaturesApril2019.csv")
lyrics_df = pd.read_csv("spotify_millsongdata 2.csv")
genius_lyrics_df = pd.read_csv("songs_with_lyrics.csv")

# Stack both Spotify exports and call the title column "song", the name the
# lyric tables are joined on.
spotify_df = pd.concat([spotify_2018, spotify_2019], ignore_index=True)
spotify_df.rename(columns={"track_name": "song"}, inplace=True)

# Lower-case and trim titles so the join ignores case and outer whitespace.
# The loop variable is deliberately not called `df`, so it cannot overwrite a
# notebook-level DataFrame of that name.
for frame in [spotify_df, lyrics_df, genius_lyrics_df]:
    frame["song"] = frame["song"].str.lower().str.strip()

all_lyrics = pd.concat([lyrics_df, genius_lyrics_df], ignore_index=True)

# songs_with_lyrics.csv stores its lyrics in a "lyrics" column, while the
# sentiment step reads "text". Fill the gaps in "text" from "lyrics" so the
# Genius lyrics are analysed too instead of being silently ignored.
if "lyrics" in all_lyrics.columns:
    if "text" in all_lyrics.columns:
        all_lyrics["text"] = all_lyrics["text"].fillna(all_lyrics["lyrics"])
    else:
        all_lyrics["text"] = all_lyrics["lyrics"]

# NOTE(review): the join key is the title alone, so a track can pick up the
# lyrics of a different artist's song with the same title - confirm this is
# acceptable for the analysis.
merged_df = pd.merge(spotify_df, all_lyrics, on="song", how="left")

# Keep only the first row for each (title, Spotify artist) pair.
merged_df.drop_duplicates(subset=["song", "artist_name"], keep="first", inplace=True)

# Where the merge produced a "<name>_x"/"<name>_y" pair, keep the left-hand
# ("_x") values under the plain name and drop both suffixed columns.
for col in list(merged_df.columns):
    if col.endswith("_x"):
        base_col = col[:-2]  # Remove '_x' suffix
        if base_col + "_y" in merged_df.columns:
            merged_df[base_col] = merged_df[col]
            merged_df.drop(columns=[col, base_col + "_y"], inplace=True)

# Sanity check: the popularity column is required by the labelling step.
if "popularity" in merged_df.columns:
    print("✅ 'popularity' column exists!")
else:
    print("❌ 'popularity' column is missing!")

sia = SentimentIntensityAnalyzer()

def analyze_sentiment(lyrics):
    """Score one lyric text with the VADER analyzer ``sia`` and with TextBlob.

    Args:
        lyrics: Lyric text, or a missing value.

    Returns:
        A ``pd.Series`` of seven values in this order: negative, neutral,
        positive, compound, TextBlob polarity, TextBlob subjectivity, and a
        not-positive flag (1 when the positive score is below 0.2, else 0).
        Missing or whitespace-only lyrics give ``[0, 0, 0, 0, 0, 0, 1]``.
    """
    has_text = not pd.isna(lyrics) and lyrics.strip() != ""
    if not has_text:
        return pd.Series([0, 0, 0, 0, 0, 0, 1])

    vader = sia.polarity_scores(lyrics)
    blob_sentiment = TextBlob(lyrics).sentiment

    positive = vader["pos"]
    not_positive = 1 if positive < 0.2 else 0

    return pd.Series([
        vader["neg"], vader["neu"], positive, vader["compound"],
        blob_sentiment.polarity, blob_sentiment.subjectivity, not_positive,
    ])

# Score every lyric and spread the seven returned values over named columns.
sentiment_columns = [
    'Negative', 'Neutral', 'Positive', 'Compound',
    'TextBlob_Polarity', 'TextBlob_Subjectivity', 'Not_Positive',
]
merged_df[sentiment_columns] = merged_df['text'].apply(analyze_sentiment)

# Drop every row that has, in any float64/int64 column, a value whose z-score
# is more than 3 in absolute terms.
# NOTE(review): scipy's zscore propagates NaN by default, so a numeric column
# containing NaN can never mark a row here - confirm that is intended.
numeric_columns = merged_df.select_dtypes(include=['float64', 'int64']).columns
z_scores = stats.zscore(merged_df[numeric_columns])
is_extreme = np.abs(z_scores) > 3
outliers = is_extreme.any(axis=1)
df_cleaned = merged_df[~outliers]

def categorize_popularity(value):
    """Map a popularity score to a class label.

    Returns 0 (low) for scores up to and including 30, 1 (medium) for scores
    up to and including 70, and 2 (high) for everything else.
    """
    if value <= 30:
        return 0  # Low Popularity
    return 1 if value <= 70 else 2  # Medium, otherwise High Popularity
        
# Work on an independent copy before adding the label column.
df_cleaned = df_cleaned.copy()
popularity_labels = df_cleaned["popularity"].apply(categorize_popularity)
df_cleaned["Popularity_Label"] = popularity_labels

# Columns that must be non-null in every row that is kept.
selected_features = [
    "duration_ms",
    "popularity",
    "danceability",
    "energy",
    "key",
    "loudness",
    "Compound",
    "TextBlob_Polarity",
    "TextBlob_Subjectivity",
    "Negative",
    "Neutral",
    "Positive",
    "Not_Positive",
]

df_cleaned = df_cleaned.dropna(subset=selected_features)

# Persist the cleaned table without the index column.
df_cleaned.to_csv("cleaned_combine.csv", index=False)
print(f"✅ Final cleaned dataset saved as 'cleaned_combine.csv'.")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sanro/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


✅ 'popularity' column exists!
✅ Final cleaned dataset saved as 'cleaned_combine.csv'.


## Code to See the Dataset

In [19]:
# Reload the saved dataset from disk and print its first five rows.
df = pd.read_csv("cleaned_combine.csv")
print(df.head(n=5))

  artist_name                track_id  \
0          YG  2RM4jf1Xa9zPgMGRDiht8O   
1          YG  1tHDG53xJNGsItRA3vfVgs   
2       R3HAB  6Wosx2euFPMT14UXiWudMy   
3  Chris Cooq  3J2Jpw61sO7l6Hc7qdYV91   
4  Chris Cooq  2jbYvQCyPgX3CdmAzeVeuS   

                                             song  time_signature artist link  \
0  big bank feat. 2 chainz, big sean, nicki minaj               4    NaN  NaN   
1                    band drum (feat. a$ap rocky)               4    NaN  NaN   
2                                   radio silence               4    NaN  NaN   
3                                         lactose               4    NaN  NaN   
4                             same - original mix               4    NaN  NaN   

  text explicit  year genre  ... valence  popularity  Negative  Neutral  \
0  NaN      NaN   NaN   NaN  ...   0.118          44       0.0      0.0   
1  NaN      NaN   NaN   NaN  ...   0.371          10       0.0      0.0   
2  NaN      NaN   NaN   NaN  ...   0.382 

## Code to Split the Dataset into Training, Validation, and Testing Sets and Apply SMOTE

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Read the cleaned dataset back from disk.
file_path = "cleaned_combine.csv"
df = pd.read_csv(file_path)

# Columns that must be non-null in every row that is kept.
selected_features = [
    'duration_ms',
    'popularity',
    'danceability',
    'energy',
    'key',
    'loudness',
    'Compound',
    'TextBlob_Polarity',
    'TextBlob_Subjectivity',
    'Negative',
    'Neutral',
    'Positive',
    'Not_Positive',
]

df_cleaned = df.dropna(subset=selected_features)

# Convert Popularity into categories (Low, Medium, High)
def categorize_popularity(value):
    """Return the popularity class of a score: 0 low, 1 medium, 2 high.

    Scores up to and including 30 are low, scores up to and including 70 are
    medium, and anything that passes neither bound is high.
    """
    # Check the inclusive upper bounds in ascending order; the position of the
    # first bound that holds is the label.
    for label, upper_bound in enumerate((30, 70)):
        if value <= upper_bound:
            return label
    return 2  # High Popularity

# The frame returned by dropna may still be treated by pandas as a slice of
# `df`. Take an explicit copy before adding a column, as the preprocessing cell
# does, so the assignment cannot trigger SettingWithCopyWarning.
df_cleaned = df_cleaned.copy()
df_cleaned["Popularity_Label"] = df_cleaned["popularity"].apply(categorize_popularity)

# "popularity" is removed from the features because the label is derived from it.
X = df_cleaned[selected_features].drop(columns=["popularity"], errors="ignore")
y = df_cleaned["Popularity_Label"]

# First split: 80% train / 20% test, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training Class Distribution:\n", y_train.value_counts())
print("\nTesting Class Distribution:\n", y_test.value_counts())

train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_data.to_csv("training_data.csv", index=False)
test_data.to_csv("testing_data.csv", index=False)

print("\nData Split Completed! Training and Testing files saved.")

# Second split: carve a stratified 20% validation set out of the training file.
train_file_path = "training_data.csv"
train_df = pd.read_csv(train_file_path)

X_train_full = train_df.drop(columns=["Popularity_Label"], errors="ignore")
y_train_full = train_df["Popularity_Label"]

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

print("Training Class Distribution (after second split):\n", y_train.value_counts())
print("\nValidation Class Distribution:\n", y_val.value_counts())

train_data_final = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

train_data_final.to_csv("final_training_data.csv", index=False)
val_data.to_csv("validation_data.csv", index=False)

print("\n Second Data Split Completed! Final Training and Validation files saved.")

# Reload all three splits from the files written above.
train_file = "final_training_data.csv"
val_file = "validation_data.csv"
test_file = "testing_data.csv"

train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop(columns=["Popularity_Label"], errors="ignore")
y_train = train_df["Popularity_Label"]
X_val = val_df.drop(columns=["Popularity_Label"], errors="ignore")
y_val = val_df["Popularity_Label"]
X_test = test_df.drop(columns=["Popularity_Label"], errors="ignore")
y_test = test_df["Popularity_Label"]

# Oversample the training split only; validation and test data are not resampled.
# NOTE(review): SMOTE runs on the unscaled features here, so large-valued
# columns such as duration_ms presumably dominate its neighbour distances -
# consider whether scaling should come first.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data and reuse it for val/test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("\n SMOTE Applied & Data Standardized.")

Training Class Distribution:
 Popularity_Label
0    55973
1    29964
2     1010
Name: count, dtype: int64

Testing Class Distribution:
 Popularity_Label
0    13994
1     7491
2      252
Name: count, dtype: int64

Data Split Completed! Training and Testing files saved.
Training Class Distribution (after second split):
 Popularity_Label
0    44778
1    23971
2      808
Name: count, dtype: int64

Validation Class Distribution:
 Popularity_Label
0    11195
1     5993
2      202
Name: count, dtype: int64

 Second Data Split Completed! Final Training and Validation files saved.

 SMOTE Applied & Data Standardized.
