<a href="https://colab.research.google.com/github/TezBytes/music-recommender/blob/feat%2Fdata-cleaning-and-preprocessing/notebooks/01_eda_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-requisites

In [2]:
!pip install kagglehub[pandas-datasets]



# Load dataset

In [3]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "dataset.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "maharshipandya/-spotify-tracks-dataset",
  file_path,
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


First 5 records:    Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210

In [4]:
print(df.shape)

(114000, 21)


In [5]:
print(df.head)

<bound method NDFrame.head of         Unnamed: 0                track_id                 artists  \
0                0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1                1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2                2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3                3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4                4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   
...            ...                     ...                     ...   
113995      113995  2C3TZjDRiAzdyViavDJ217           Rainy Lullaby   
113996      113996  1hIz5L4IB9hN3WRYPOCGPw           Rainy Lullaby   
113997      113997  6x8ZfSoqDjuNa5SVP5QjvX           Cesária Evora   
113998      113998  2e6sXL2bYv4bSz6VTdnfLs        Michael W. Smith   
113999      113999  2hETkH7cOfqmz3LqZDHZf5           Cesária Evora   

                                               album_name  \
0                                                  Comedy   
1      

# Drop Duplicates on track_id

In [6]:
df.drop_duplicates(subset=['track_id'], inplace=True)

In [7]:
print(f"After dropping duplicates: {df.shape}")

After dropping duplicates: (89741, 21)


# Remove Rows with Missing Elements

In [8]:
req_cols = ["track_name","danceability", "energy", "track_id" ]
df.dropna(subset=req_cols, inplace=True)

In [9]:
print(f"After dropping missing values: {df.shape}")

After dropping missing values: (89740, 21)


# Convert Data Types

In [10]:
df["explicit"] = df["explicit"].astype(bool)
df["duration_sec"] = df["duration_ms"] / 1000
df["duration_sec"] = df["duration_sec"].astype(int)

# Normalize Numerical Features

In [11]:
from sklearn.preprocessing import MinMaxScaler

num_cols = ["danceability", "energy", "duration_sec"]
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Save Cleaned Data

In [12]:
import os

if not os.path.exists("data"):
    os.makedirs("data")
    print("Directory 'data' created successfully.")
else:
    print("Directory 'data' already exists.")

Directory 'data' already exists.


In [20]:
df.to_csv("data/enriched_data.csv", index=False)

# Write to utils.py

In [14]:
import os

# Make sure the folder exists
os.makedirs("scripts", exist_ok=True)

# Create empty utils.py
with open("scripts/utils.py", "w") as f:
    f.write("")  # just creates the file

In [15]:
utils_code = """
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def load_dataset(path):
    df = pd.read_csv(path)
    return df

def drop_duplicates(df, subset_col="track_id"):
    return df.drop_duplicates(subset=subset_col)

def clean_missing_values(df, required_cols):
    return df.dropna(subset=required_cols)

def convert_types(df):
    df["explicit"] = df["explicit"].astype(bool)
    df["duration_sec"] = df["duration_ms"] / 1000
    return df.drop(columns=["duration_ms"])

def normalize_features(df, cols):
    scaler = MinMaxScaler()
    df[cols] = scaler.fit_transform(df[cols])
    return df
"""

with open("scripts/utils.py", "w") as f:
    f.write(utils_code)

# Use in notebook

In [21]:
import sys
sys.path.append("scripts")

import utils

df = utils.load_dataset("data/enriched_data.csv")

In [22]:
print(df.shape)

(89740, 22)
