In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

In [18]:
# Mount Google Drive in the Colab runtime so the CSV stored there becomes readable.
# (Colab-only import: this cell will not run outside Google Colab.)
from google.colab import drive
drive.mount('/content/drive')

# Read the cleaned songs table into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/songs_cleaned.csv')

# Preview the first five rows as the cell output.
df.head(5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,track_name,artist(s)_name,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",125,80,89,83,31,0,8,4
1,LALA,Myke Towers,92,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,138,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,170,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,144,65,23,80,14,63,11,6


In [19]:
# Audio-feature columns are picked by position: columns 2..9 of df
# (the cell output lists them: bpm through speechiness_%).
feature_cols = df.columns[2:10]

# Raw (unscaled) feature matrix as a NumPy array.
X = df.loc[:, feature_cols].to_numpy()

# Standardize every feature (zero mean, unit variance) so that no single
# feature dominates the similarity / distance computations because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.loc[:, feature_cols])

# Show the selected columns and the shape of the scaled matrix.
feature_cols, X_scaled.shape

(Index(['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
        'instrumentalness_%', 'liveness_%', 'speechiness_%'],
       dtype='object'),
 (953, 8))

In [20]:
#function for formatting user input
def format_input(input):
  """Normalize free-text user input for case-insensitive matching.

  Lower-cases the text, strips leading/trailing whitespace and collapses
  every internal run of whitespace into a single space.

  Args:
    input: the raw string typed by the user.

  Returns:
    The normalized string.
  """
  words = input.lower().split()
  return " ".join(words)

#function for getting user input song name
def get_song_name():
  """Prompt the user for a song title and return it normalized by format_input.

  Returns:
    The typed title after format_input has been applied to it.
  """
  return format_input(input("What is the song name are you interested in? "))

#function for getting user input singer
def get_singer():
  """Prompt the user for the singer's name and return it normalized by format_input.

  Returns:
    The typed name after format_input has been applied to it.
  """
  return format_input(input("Who is the singer of the song you are interested in? "))

In [21]:
#get the artist and track (both come back lower-cased and whitespace-normalized)
song_artist = get_singer()
song_track = get_song_name()

#ensure audio_features is a list
audio_features = list(feature_cols)

#find song using the track and artist name
#the track title must match exactly (case-insensitive)
mask = df["track_name"].str.lower() == song_track.lower()

#the artist only has to appear somewhere in the artist field, because that
#field can list several artists; an empty answer means "do not filter by artist"
if song_artist:
    #regex=False: treat the typed text literally, so names containing regex
    #metacharacters (e.g. "a$ap", "(", "+") cannot mis-match or raise
    #na=False: rows with a missing artist simply do not match instead of
    #putting NaN into the boolean mask
    mask &= df["artist(s)_name"].str.lower().str.contains(
        song_artist.lower(), regex=False, na=False
    )

song_df = df[mask]

#stop here with a clear message rather than failing later on an empty frame
if song_df.empty:
    raise ValueError("Song not found in data. Please check track name/artist.")

print(song_df)


Who is the singer of the song you are interested in? olivia rodrigo
What is the song name are you interested in? vampire
  track_name  artist(s)_name  bpm  danceability_%  valence_%  energy_%  \
2    vampire  Olivia Rodrigo  138              51         32        53   

   acousticness_%  instrumentalness_%  liveness_%  speechiness_%  
2              17                   0          31              6  


# Cosine Similarity

In [22]:
#index label of the song in the original dataset
#(if several rows matched the search, the first one is used)
song_index = song_df.index[0]

#X_scaled rows are addressed by position, not by index label, so translate the
#label into a row position; the two only coincide while df keeps its default
#0..n-1 index (assumes df index labels are unique)
song_position = df.index.get_loc(song_index)

#extract the song (slice instead of a single index to keep the 2-D vector shape)
song_vector = X_scaled[song_position : song_position + 1]

#returns the cosine similarity between samples (compares song vs all songs including itself)
similarities = cosine_similarity(song_vector, X_scaled)[0]

#add new column to a copy so the original dataset stays untouched
df_cosine = df.copy()
df_cosine["similarity"] = similarities

#exclude the song itself from results and rank the most similar songs first
recommendations_cosine = df_cosine[df_cosine.index != song_index].sort_values("similarity", ascending=False)

In [23]:
# Five best-ranked rows, reduced to the display columns and renumbered from 0.
top5_cosine = (
    recommendations_cosine
    .iloc[:5]
    .loc[:, ["track_name", "artist(s)_name", "similarity"]]
    .reset_index(drop=True)
)

print("Top 5 Cosine Similarity Recommendations:")
display(top5_cosine)

Top 5 Cosine Similarity Recommendations:


Unnamed: 0,track_name,artist(s)_name,similarity
0,Call Out My Name,The Weeknd,0.941887
1,Hold My Hand,Lady Gaga,0.918998
2,Don't ever say love me (feat. RM of BTS),"RM, Colde",0.913331
3,All For Us - from the HBO Original Series Euph...,"Labrinth, Zendaya",0.880452
4,"Here We Goï¿½ï¿½ï¿½ Again (feat. Tyler, the Cr","The Weeknd, Tyler, The Creator",0.816988


# Euclidean Distance

In [24]:
# Distance from the chosen song to every song (itself included); the result has
# a single row because song_vector holds one song, so take that row.
distances = euclidean_distances(song_vector, X_scaled)[0, :]

# Attach the distances as a new column on a copy of the dataset.
df_euclidean = df.assign(distance=distances)

# Drop the chosen song itself and rank the closest songs first.
not_the_song = df_euclidean.index != song_index
recommendations_euclidean = df_euclidean.loc[not_the_song].sort_values(by="distance", ascending=True)

In [25]:
# Five closest rows, reduced to the display columns and renumbered from 0.
top5_euclidean = (
    recommendations_euclidean
    .iloc[:5]
    .loc[:, ["track_name", "artist(s)_name", "distance"]]
    .reset_index(drop=True)
)

print("Top 5 Euclidean Distance Recommendations:")
display(top5_euclidean)

Top 5 Euclidean Distance Recommendations:


Unnamed: 0,track_name,artist(s)_name,distance
0,Call Out My Name,The Weeknd,0.928466
1,Don't ever say love me (feat. RM of BTS),"RM, Colde",1.113209
2,ýýý98 Braves,Morgan Wallen,1.231885
3,Hold My Hand,Lady Gaga,1.244902
4,WORTH NOTHING,"Twisted, Oliver Tree",1.274878
