In [1]:
import time
import pandas as pd
import numpy as np

from utils.pca_from_scratch import pca

# Load and preprocess data

In [2]:
# Load the track table used throughout the notebook. The path is relative to the
# kernel's working directory; presumably this file is the already-cleaned
# dataset (per its name) — TODO confirm which step produces it.
df = pd.read_csv('Data/Processed/spotify_clean.csv')

In [3]:
# Columns of df that form the feature matrix passed to pca().
features = ['danceability', 'energy', 'valence', 'tempo', 'loudness', 'popularity']

In [4]:
# Feature matrix as a float array: one row per df row, one column per entry in `features`.
# NOTE(review): X is handed to pca() as-is; standardize() is only applied to the
# PC scores later. Confirm that pca() centers/scales internally — otherwise the
# columns with the largest raw variance will dominate the components.
X = df[features].values.astype(float)

In [5]:
def standardize(X):
    """Z-score each column of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data; it is converted to a float array.

    Returns
    -------
    tuple
        ``(X_std, mean, std)`` where ``X_std`` is the column-wise standardized
        data and ``mean`` / ``std`` are the per-column mean and population
        standard deviation (``ddof=0``) of ``X``.

    Notes
    -----
    A constant column has ``std == 0``. Such a column is mapped to all zeros
    instead of being divided by zero (which would produce NaN/inf). The
    returned ``std`` still reports the true value 0 for that column.
    """
    X = np.asarray(X, dtype=float)
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # Divide by 1 where the column is constant: (X - mean) is already 0 there.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std, mean, std

# PCA

In [6]:
# Cumulative explained-variance ratio for every possible number of retained
# components (1 .. number of feature columns).
n_features = X.shape[1]

# The total variance comes from the full decomposition and does not depend on
# k, so it is computed once instead of once per loop iteration.
# (Assumes pca() is deterministic and returns the eigenvalues of the retained
# components as its second result, as the usage below implies.)
_, all_eigvals, _ = pca(X, n_components=n_features)
total_var = np.sum(all_eigvals)

ratios = []

for k in range(1, n_features + 1):
    X_pca, eigenvalues, eigenvectors = pca(X, n_components=k)

    # Share of the total variance captured by the first k components.
    explained_var = np.sum(eigenvalues)
    ratio = explained_var / total_var

    ratios.append(ratio)

df_ratio = pd.DataFrame({
    'n_components': range(1, n_features + 1),
    'explained_variance_ratio': ratios
})

print(df_ratio)

   n_components  explained_variance_ratio
0             1                  0.449603
1             2                  0.623584
2             3                  0.783213
3             4                  0.905889
4             5                  0.973267
5             6                  1.000000


In [7]:
# Project onto the first 4 principal components; according to the printed
# explained-variance table these retain about 90.6% of the total variance.
X_pca, eigenvalues, eigenvectors = pca(X, n_components=4)

In [8]:
# Attach the retained principal-component scores to df as columns PC1..PC4.
# Slicing a column out of an array avoids building a Python list row by row
# for every component. np.asarray also accepts a list-of-rows result.
pc_scores = np.asarray(X_pca)
for i in range(4):
    df[f'PC{i+1}'] = pc_scores[:, i]

In [9]:
# Rescale every PC column to zero mean and unit standard deviation with
# standardize(), so that no component dominates the cosine similarity merely
# because of its larger variance. The returned mean/std are discarded.
X_pca_norm, _, _ = standardize(df[['PC1', 'PC2', 'PC3', 'PC4']].values)

# Build recommendation system

In [10]:
def recommend_similar(song_index, top_k=5):
    """Return the ``top_k`` songs most similar to the song at ``song_index``.

    Similarity is the cosine similarity between rows of the module-level
    ``X_pca_norm`` array. Rows of ``X_pca_norm`` and rows of ``df`` are matched
    by position.

    Parameters
    ----------
    song_index : int
        Positional row index of the query song.
    top_k : int, default 5
        Maximum number of recommendations. Values ``<= 0`` yield an empty
        result.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` (columns ``track_name``, ``artist``, ``genre``) ordered
        from most to least similar. Each ``(track_name, artist)`` pair appears
        at most once, and the query song's own pair is never returned, even
        when the dataset holds duplicate rows for it.
    """
    cols = ['track_name', 'artist', 'genre']
    if top_k <= 0:
        return df.iloc[[]][cols]

    target = X_pca_norm[song_index]
    norms = np.linalg.norm(X_pca_norm, axis=1)
    denom = norms * np.linalg.norm(target)

    # Cosine similarity is undefined for zero-length vectors; leave those rows
    # at -inf so they rank last instead of poisoning the sort with NaN.
    similarities = np.full(len(denom), -np.inf)
    np.divide(np.dot(X_pca_norm, target), denom, out=similarities, where=denom > 0)

    # Plain arrays give fast positional lookups and keep the de-duplication
    # consistent with the positional df.iloc[...] used for the result.
    names = df['track_name'].to_numpy()
    artists = df['artist'].to_numpy()

    # Seed with the query's own key so duplicate rows of the same song are
    # not recommended back to the user.
    seen = {(names[song_index], artists[song_index])}
    unique_indices = []
    for i in np.argsort(-similarities):
        if i == song_index:
            continue
        key = (names[i], artists[i])
        if key in seen:
            continue
        seen.add(key)
        unique_indices.append(i)
        if len(unique_indices) >= top_k:
            break

    return df.iloc[unique_indices][cols]

In [11]:
# Row of the query song used for this demo (presumably an arbitrary example).
idx = 1000
# Heading; Vietnamese for "Recommendations for the song:".
print("Gợi ý cho bài hát:")
print(f"→ {df.loc[idx, 'track_name']} - {df.loc[idx, 'artist']}\n")

# Heading; Vietnamese for "Similar songs:".
print("Các bài hát tương tự:")
print(recommend_similar(idx, top_k=5))

Gợi ý cho bài hát:
→ Can't Stay Mad - Danielle Bradbery

Các bài hát tương tự:
                     track_name       artist  genre
4834    Because He Lives (Amen)   Matt Maher      1
103381             Pure Morning      Placebo      5
101959                 It's On!         Korn      5
78595              Stomp Me Out    Bryce Fox      5
24131              Another Life  Yellow Claw     10


# Evaluate

In [12]:
def evaluate_performance(n_tests=100, top_k=5, seed=None):
    """Print the average wall-clock time of one ``recommend_similar`` call.

    Parameters
    ----------
    n_tests : int, default 100
        Number of distinct query songs to time. Capped at the number of rows
        in ``X_pca_norm``, because queries are sampled without replacement.
    top_k : int, default 5
        Forwarded to ``recommend_similar``.
    seed : int or None, default None
        When given, the query songs are drawn from a dedicated
        ``np.random.default_rng(seed)`` so the run is repeatable. When None,
        the global NumPy random state is used, as before.

    Raises
    ------
    ValueError
        If no query can be drawn (``n_tests <= 0`` or an empty dataset).
    """
    n_songs = len(X_pca_norm)
    n_tests = min(n_tests, n_songs)
    if n_tests <= 0:
        raise ValueError("n_tests must be positive and the dataset must be non-empty")

    rng = np.random if seed is None else np.random.default_rng(seed)
    indices = rng.choice(n_songs, n_tests, replace=False)

    # perf_counter is monotonic and high-resolution, unlike time.time(), which
    # can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    for i in indices:
        recommend_similar(i, top_k=top_k)
    elapsed = time.perf_counter() - start_time

    avg_time = elapsed / n_tests
    print(f"Trung bình mỗi lần gợi ý mất: {avg_time:.4f} giây")

In [13]:
# Time 100 randomly chosen queries and print the mean latency per recommendation.
evaluate_performance(n_tests=100, top_k=5)

Trung bình mỗi lần gợi ý mất: 0.0488 giây


In [14]:
def evaluate_similarity_quality(n_tests=100, top_k=5, seed=None):
    """Print the mean cosine similarity of each query's ``top_k`` nearest rows.

    For ``n_tests`` randomly chosen rows of the module-level ``X_pca_norm``,
    the cosine similarity to every other row is computed, the ``top_k``
    largest values are averaged, and the mean of those averages is printed.

    NOTE: this looks at raw nearest neighbours. It does not apply the
    ``(track_name, artist)`` de-duplication that ``recommend_similar``
    performs, so duplicate rows of the same song can inflate the score.

    Parameters
    ----------
    n_tests : int, default 100
        Number of distinct query rows. Capped at the number of rows available,
        because queries are sampled without replacement.
    top_k : int, default 5
        Number of nearest neighbours per query. Capped at ``n_rows - 1``.
    seed : int or None, default None
        When given, queries are drawn from ``np.random.default_rng(seed)``.
        When None, the global NumPy random state is used, as before.

    Raises
    ------
    ValueError
        If there is nothing to evaluate (``n_tests <= 0``, ``top_k <= 0`` or
        fewer than two rows).
    """
    n_songs = len(X_pca_norm)
    n_tests = min(n_tests, n_songs)
    # Each query is compared against the n_songs - 1 other rows only.
    k = min(top_k, n_songs - 1)
    if n_tests <= 0 or k <= 0:
        raise ValueError("n_tests and top_k must be positive and the dataset needs at least two rows")

    rng = np.random if seed is None else np.random.default_rng(seed)
    indices = rng.choice(n_songs, n_tests, replace=False)

    # Row norms do not depend on the query, so compute them once.
    norms = np.linalg.norm(X_pca_norm, axis=1)

    total_avg_sim = 0.0
    for i in indices:
        target = X_pca_norm[i]
        denom = norms * np.linalg.norm(target)
        # Zero-length vectors have no defined cosine; score them as -1
        # (the minimum) rather than letting NaN reach the average.
        similarities = np.full(n_songs, -1.0)
        np.divide(np.dot(X_pca_norm, target), denom, out=similarities, where=denom > 0)
        # Drop the query itself, then take the k largest without a full sort.
        others = np.delete(similarities, i)
        top_sims = np.partition(others, -k)[-k:]
        total_avg_sim += np.mean(top_sims)
    print(f"Trung bình độ tương đồng cosine top-{top_k}: {total_avg_sim / n_tests:.4f}")

In [15]:
# Average top-5 cosine similarity over 100 randomly chosen query rows.
evaluate_similarity_quality(n_tests=100, top_k=5)

Trung bình độ tương đồng cosine top-5: 0.9994
