In [5]:
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np

In [8]:
# Load the cleaned song table; the path is relative to the notebook's working directory.
df_normalized = pd.read_csv(filepath_or_buffer="../../data/cleaned_data.csv")

# Select audio features

In [6]:
# Column names fed to t-SNE, in the order they appear in the feature matrix.
features = [
    'danceability_%',
    'energy_%',
    'valence_%',
    'acousticness_%',
    'speechiness_%',
    'liveness_%',
    'instrumentalness_%',
    'bpm',
]


# Extract feature matrix from normalized data

In [7]:
# Pull the selected columns out as a plain NumPy array (one row per song,
# one column per entry in `features`).
X = df_normalized[features].to_numpy()

# Apply t-SNE

In [8]:
# Embed the feature matrix into 2-D with t-SNE; random_state pins the run so
# the layout is reproducible.
# The iteration cap is left at the library default of 1000 on purpose: the
# keyword for it was renamed from `n_iter` to `max_iter` in scikit-learn 1.5
# and `n_iter` was removed in 1.7, so spelling it out would break on one side
# of that rename while the default gives the same 1000 iterations on both.
# NOTE(review): t-SNE works on pairwise distances, so this assumes the columns
# of X are already on comparable scales — confirm upstream normalization.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, verbose=1)
X_tsne = tsne.fit_transform(X)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 953 samples in 0.011s...
[t-SNE] Computed neighbors for 953 samples in 0.060s...
[t-SNE] Computed conditional probabilities for sample 953 / 953
[t-SNE] Mean sigma: 15.225412
[t-SNE] KL divergence after 250 iterations with early exaggeration: 67.997009
[t-SNE] KL divergence after 1000 iterations: 1.132855


# Build visualization dataset

In [9]:
# Put the two t-SNE coordinates side by side with the original columns.
# reset_index(drop=True) gives the source frame a 0..n-1 index so its rows
# line up with the freshly built coordinate frame.
df_viz = pd.concat(
    [
        pd.DataFrame(X_tsne, columns=['tsne_1', 'tsne_2']),
        df_normalized.reset_index(drop=True),
    ],
    axis=1,
)

In [10]:
# Save for dashboard: write the combined frame as CSV (without the index
# column) and as a JSON array of per-row records, into the working directory.
df_viz.to_csv(path_or_buf='data_with_tsne.csv', index=False)
df_viz.to_json(path_or_buf='visualization_data.json', orient='records', indent=2)

# Report the row count and the names of the files just written.
print(f"✅ Done! Created dataset with {len(df_viz)} songs")
print("Files: data_with_tsne.csv, visualization_data.json")

✅ Done! Created dataset with 953 songs
Files: data_with_tsne.csv, visualization_data.json
