In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
from google.colab import files
# Prompt for a file upload in Colab; the next cell looks the file up in
# `uploaded` by its file name.
uploaded = files.upload()

Saving data.csv to data.csv


In [4]:
import io

# Prefer the expected file name, but fall back to whatever was uploaded:
# re-uploading in the same Colab session can store the file under a different
# key (e.g. "data (1).csv"), which makes a hard-coded lookup raise KeyError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select data.csv.")
csv_name = 'data.csv' if 'data.csv' in uploaded else next(iter(uploaded))

# The uploaded content is raw bytes, so wrap it in a file-like object for pandas.
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
print(df.head())

   valence  year  acousticness  \
0   0.0594  1921         0.982   
1   0.9630  1921         0.732   
2   0.0394  1921         0.961   
3   0.1650  1921         0.967   
4   0.2530  1921         0.957   

                                             artists  danceability  \
0  ['Sergei Rachmaninoff', 'James Levine', 'Berli...         0.279   
1                                     ['Dennis Day']         0.819   
2  ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...         0.328   
3                                   ['Frank Parker']         0.275   
4                                     ['Phil Regan']         0.418   

   duration_ms  energy  explicit                      id  instrumentalness  \
0       831667   0.211         0  4BJqT0PrAfrxzMOxytFOIz          0.878000   
1       180533   0.341         0  7xPhfUan2yNtyFG0cUWkt8          0.000000   
2       500062   0.166         0  1o6I8BglA6ylDMrIELygv1          0.913000   
3       210000   0.309         0  3ftBPsC5vPBKxYSee08FDH      

Feature Scaling

In [18]:
from sklearn.preprocessing import StandardScaler
# Audio-feature columns used as model inputs by the cells below.
features = [
    'valence', 'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo',
]

# Keep only rows that have a value in every feature column, then standardise
# those columns with a scaler fitted on the same rows.
df_features = df.loc[:, features].dropna()
scaler = StandardScaler().fit(df_features)
X_scaled = scaler.transform(df_features)

Train test split

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the scaled rows; the fixed seed keeps the split repeatable.
# NOTE(review): X_train / X_test are not referenced by any later cell visible
# here — confirm whether this split is still needed.
X_train, X_test = train_test_split(
    X_scaled,
    test_size=0.2,
    random_state=42,
)

In [8]:
from sklearn.decomposition import PCA

# Project the standardised features onto 5 principal components.
# In the visible cells X_pca is only consumed by the K-Means inertia sweep;
# the nearest-neighbour model is fitted on X_scaled, not on X_pca.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

Elbow Method on K-Means inertia (to find optimal K value)

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts: 2 through 15 inclusive.
k_values = list(range(2, 16))
inertia = []

# Fit one K-Means model per candidate k on the PCA-reduced data and record
# the inertia of each fit, in the same order as k_values.
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_pca)
    inertia.append(kmeans.inertia_)

# Function to find elbow point
def find_elbow(k_vals, inertias):
    """Return the k whose (k, inertia) point lies farthest from the chord
    joining the first and last points of the curve.

    Args:
        k_vals: Sequence of candidate cluster counts.
        inertias: Inertia values, one per entry of ``k_vals``.

    Returns:
        The element of ``k_vals`` at the position of maximum distance from
        the chord. On ties the earliest position wins.
    """
    # Treat each (k, inertia) pair as a 2-D point.
    points = np.array(list(zip(k_vals, inertias)), dtype=float)

    # Chord from the first to the last point, and every point's offset from
    # the chord's start.
    p1 = points[0]
    p2 = points[-1]
    chord = p2 - p1
    offsets = points - p1

    # Magnitude of the 2-D cross product, written out explicitly because
    # np.cross on 2-element vectors is deprecated as of NumPy 2.0.
    cross_magnitudes = np.abs(chord[0] * offsets[:, 1] - chord[1] * offsets[:, 0])

    # The perpendicular distance is this magnitude divided by the chord
    # length. That divisor is the same for every point, so it cannot change
    # which point is the maximum; skipping it also avoids 0/0 when the first
    # and last points coincide.
    optimal_k_index = int(np.argmax(cross_magnitudes))
    return k_vals[optimal_k_index]

# Pick k with the elbow heuristic over the inertia curve and report it.
# NOTE(review): optimal_k is not referenced by any later cell visible here —
# confirm whether the clustering step still feeds the recommender.
optimal_k = find_elbow(k_values, inertia)
print(f"Optimal number of clusters (k): {optimal_k}")

Optimal number of clusters (k): 6


  return np.abs(np.cross(p2 - p1, point - p1)) / np.linalg.norm(p2 - p1)


Fit Nearest Neighbors model (used for recommendations)

In [12]:
from sklearn.neighbors import NearestNeighbors

# Neighbour index over the standardised feature matrix; 6 neighbours is the
# default query size unless a call to kneighbors overrides it.
nn_model = NearestNeighbors(n_neighbors=6).fit(X_scaled)

In [None]:
def get_song_recommendations(song_name, top_n=5):
    """Recommend songs whose audio features are closest to ``song_name``.

    Rows of ``df`` with a missing value in any ``features`` column are
    dropped, the remaining feature columns are standardised, and the nearest
    neighbours of the first row whose name matches ``song_name``
    (case-insensitively) are looked up.

    Args:
        song_name: Title to look up; compared case-insensitively with
            ``df['name']``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` dicts with keys ``'name'``, ``'artists'``
        and ``'year'``, in the order returned by the neighbour search. The
        list is empty when no row with complete features matches
        ``song_name``.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    query = song_name.lower()

    # Match inside the cleaned frame so the query row is guaranteed to have a
    # complete feature vector and its position lines up with the scaled matrix.
    df_clean = df.dropna(subset=features)
    is_match = (df_clean['name'].str.lower() == query).to_numpy()

    if not is_match.any():
        return []

    # NOTE: the scaler and the neighbour index are refit on every call, which
    # keeps this function independent of notebook state at the cost of speed.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean[features])

    # Every row sharing the queried title is filtered out below, so request
    # that many extra neighbours to still be able to return top_n results.
    n_query = min(len(df_clean), top_n + int(is_match.sum()))
    nn_model = NearestNeighbors(n_neighbors=n_query)
    nn_model.fit(X_scaled)

    # argmax on a boolean array gives the position of the first True, i.e. the
    # first matching row; its scaled vector is already available in X_scaled.
    input_scaled = X_scaled[is_match.argmax()].reshape(1, -1)

    _, indices = nn_model.kneighbors(input_scaled)

    recommendations = df_clean.iloc[indices[0]]
    recommendations = recommendations[recommendations['name'].str.lower() != query]
    recommendations = recommendations.head(top_n)

    result_table = recommendations[['name', 'artists', 'year']].reset_index(drop=True)

    return result_table.to_dict(orient="records")


In [32]:
# Example usage: returns a list of dicts with 'name', 'artists' and 'year'
# keys (an empty list when the title is not found).
# NOTE(review): the saved output below shows SONG NAME / ARTISTS / POPULARITY
# columns, which the function as written does not produce — re-run this cell.
get_song_recommendations("Boom clap")



Unnamed: 0,SONG NAME,ARTISTS,POPULARITY
0,Written in the Stars (feat. Eric Turner),"['Tinie Tempah', 'Eric Turner']",55
1,Hi High,['LOONA'],63
2,Taking My Ball,['Eminem'],47
3,FRIENDS,"['Marshmello', 'Anne-Marie']",81
4,AYA,['MAMAMOO'],76


Sanity check: average cosine similarity of recommendations (not a true accuracy metric)

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_similarity(song_name, top_n=5):
    """Print and return the mean cosine similarity between a song and its
    nearest neighbours.

    Relies on the notebook-level ``scaler`` and ``nn_model`` (both fitted on
    the rows of ``df_features``) and on ``df`` for song names.

    Args:
        song_name: Title to look up; compared case-insensitively with
            ``df['name']``.
        top_n: Maximum number of neighbours to average over.

    Returns:
        The mean cosine similarity as a NumPy float, or ``None`` when the song
        is not found or no neighbour with a different title remains.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    query = song_name.lower()

    # nn_model was fitted on df_features, so the positions it returns index
    # df_features — not df, which still holds any rows that dropna() removed.
    # Look names up through df_features' index to keep the two aligned.
    names = df.loc[df_features.index, 'name'].str.lower()
    is_match = (names == query).to_numpy()

    if not is_match.any():
        print("Song not found.")
        return None

    # Passing a one-row DataFrame keeps the column names the scaler was
    # fitted with.
    input_vector = scaler.transform(df_features.iloc[[is_match.argmax()]])

    # Rows sharing the queried title are excluded below, so request that many
    # extra neighbours.
    n_query = min(len(df_features), top_n + int(is_match.sum()))
    _, indices = nn_model.kneighbors(input_vector, n_neighbors=n_query)

    neighbour_positions = indices[0]
    rec_positions = neighbour_positions[~is_match[neighbour_positions]][:top_n]

    if rec_positions.size == 0:
        print("No recommendations to evaluate.")
        return None

    rec_vectors = scaler.transform(df_features.iloc[rec_positions])
    sim_scores = cosine_similarity(input_vector, rec_vectors)[0]

    avg_sim = np.mean(sim_scores)
    print(f"Average Cosine Similarity for Recommendations: {avg_sim:.4f}")
    return avg_sim



In [39]:
# Spot-check a single title; the function prints the mean similarity and also
# returns it, which is why the value appears twice in the output.
evaluate_similarity("What makes you beautiful")


Average Cosine Similarity for Recommendations: 0.9933




np.float64(0.9933275416015835)