In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [2]:
# Mount Google Drive so the Colab runtime can reach the cleaned songs CSV.
from google.colab import drive

drive.mount('/content/drive')

# Read the cleaned dataset and preview its first rows.
# NOTE(review): the file is decoded as latin1; if track titles show up garbled
# downstream, the CSV may actually be UTF-8 — confirm before changing.
df = pd.read_csv(
    '/content/drive/MyDrive/songs_cleaned.csv',
    encoding="latin1",
)
df.head()

Mounted at /content/drive


Unnamed: 0,track_name,artist(s)_name,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",125,80,89,83,31,0,8,4
1,LALA,Myke Towers,92,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,138,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,170,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,144,65,23,80,14,63,11,6


In [3]:
# Audio attributes that define the feature space for the similarity search.
FEATURES = [
    "bpm", "danceability_%", "valence_%", "energy_%",
    "acousticness_%", "instrumentalness_%", "liveness_%", "speechiness_%",
]

# Sanity check: collect every feature column absent from the dataframe.
# An empty list means all of them are present.
missing = [feature for feature in FEATURES if feature not in df.columns]
missing

[]

In [4]:
# Force every feature column to a numeric dtype; unparseable values become NaN.
df[FEATURES] = df[FEATURES].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

# Keep only rows that have a value for every feature, renumbering rows from 0.
df_clean = (
    df
    .dropna(subset=FEATURES)
    .reset_index(drop=True)
)

df_clean.shape

(953, 10)

In [5]:
# Learn per-feature scaling statistics from the cleaned rows, then apply them
# to obtain the standardised feature matrix used for the neighbor search.
scaler = StandardScaler()
scaler.fit(df_clean[FEATURES])
X_scaled = scaler.transform(df_clean[FEATURES])

X_scaled.shape

(953, 8)

In [6]:
# Default number of neighbors the index retrieves for recommendations.
K = 10

# Euclidean-distance neighbor index built over the scaled feature matrix.
knn_model = NearestNeighbors(metric="euclidean", n_neighbors=K)
knn_model.fit(X_scaled)

In [9]:
def recommend(song_name: str, k: int = 10):
    """Recommend the ``k`` songs closest to ``song_name`` in scaled feature space.

    The title is matched case-insensitively (exact match) against
    ``df_clean["track_name"]``. When several rows share the title, the first
    one is used as the query. The query row is scaled with the fitted
    ``scaler`` and looked up in ``knn_model``.

    The query row is removed from the results by its row position rather than
    by assuming it is the first neighbor returned: rows with identical
    features tie at distance 0 and may be ordered ahead of the query.

    Parameters
    ----------
    song_name : str
        Track title to look up (case-insensitive).
    k : int, default 10
        Maximum number of recommendations. Fewer rows are returned when the
        fitted dataset has fewer than ``k`` other songs.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``track_name``, ``artist(s)_name`` and ``distance``, ordered
        from nearest to farthest; ``None`` (after printing a message) when no
        title matches.
    """
    # case-insensitive exact match on track_name, kept as a positional mask so
    # the query row can later be identified by position (missing titles -> False)
    title_hits = (
        df_clean["track_name"].str.lower() == song_name.lower()
    ).to_numpy(dtype=bool, na_value=False)
    hit_positions = title_hits.nonzero()[0]

    if hit_positions.size == 0:
        print(f"No song named '{song_name}' found in the dataset.")
        return None

    # use the first match if multiple rows share the same title
    query_pos = int(hit_positions[0])
    song = df_clean.iloc[query_pos]

    print(f"Query: {song['track_name']} — {song['artist(s)_name']}")

    # slice the frame (not the row Series) so the scaler receives the original
    # numeric dtypes and column names
    song_vec = df_clean.iloc[[query_pos]][FEATURES]
    song_vec_scaled = scaler.transform(song_vec)

    # request one extra neighbor to leave room for the query itself, but never
    # more than the number of fitted samples (kneighbors would raise otherwise)
    n_query = min(k + 1, knn_model.n_samples_fit_)
    distances, indices = knn_model.kneighbors(song_vec_scaled, n_neighbors=n_query)

    # drop the query row wherever it landed, then keep at most k neighbors
    keep = indices[0] != query_pos
    rec_indices = indices[0][keep][:k]
    rec_distances = distances[0][keep][:k]

    recs = df_clean.iloc[rec_indices].copy()
    recs["distance"] = rec_distances

    return recs[["track_name", "artist(s)_name", "distance"]].reset_index(drop=True)

In [10]:
# Example query using the default number of recommendations.
recommend(song_name="vampire")

Query: vampire — Olivia Rodrigo


Unnamed: 0,track_name,artist(s)_name,distance
0,Call Out My Name,The Weeknd,0.928466
1,Don't ever say love me (feat. RM of BTS),"RM, Colde",1.113209
2,Ã½Ã½Ã½98 Braves,Morgan Wallen,1.231885
3,Hold My Hand,Lady Gaga,1.244902
4,WORTH NOTHING,"Twisted, Oliver Tree",1.274878
5,Romantic Homicide,d4vd,1.308152
6,Brividi,"Mahmood, Blanco",1.318985
7,Do They Know It's Christmas? - 1984 Version,Band Aid,1.38514
8,Miss You,Southstar,1.448817
9,Cigarettes,Juice WRLD,1.513538
