In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from bs4 import BeautifulSoup
import requests
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('tracks_features.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204025 entries, 0 to 1204024
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1204025 non-null  object 
 1   name              1204025 non-null  object 
 2   album             1204025 non-null  object 
 3   album_id          1204025 non-null  object 
 4   artists           1204025 non-null  object 
 5   artist_ids        1204025 non-null  object 
 6   track_number      1204025 non-null  int64  
 7   disc_number       1204025 non-null  int64  
 8   explicit          1204025 non-null  bool   
 9   danceability      1204025 non-null  float64
 10  energy            1204025 non-null  float64
 11  key               1204025 non-null  int64  
 12  loudness          1204025 non-null  float64
 13  mode              1204025 non-null  int64  
 14  speechiness       1204025 non-null  float64
 15  acousticness      1204025 non-null  float64
 16  

In [14]:
track_counts = df.groupby(['id', 'name', 'artists']).size().reset_index(name='count')

min_count = track_counts['count'].min()
max_count = track_counts['count'].max()
track_counts['popularity_score'] = (
    ((track_counts['count'] - min_count) / (max_count - min_count) * 9 + 1
).astype(int))

df_with_popularity = df.merge(
    track_counts[['id', 'popularity_score']],
    on='id',
    how='left'
)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [5]:
df_first_1000 = df.head(1000)

In [7]:
X = df_first_1000[['tempo', 'danceability']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

eps = 0.7
min_samples = 100

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(X_scaled)

df_first_1000['cluster'] = dbscan.labels_

print(df_first_1000.head())

n_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
n_noise = list(dbscan.labels_).count(-1)

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")

print(df_first_1000['cluster'].value_counts())

                       id                   name                      album  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  The Battle Of Los Angeles   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  The Battle Of Los Angeles   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  The Battle Of Los Angeles   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  The Battle Of Los Angeles   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  The Battle Of Los Angeles   

                 album_id                       artists  \
0  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
1  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
2  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
3  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
4  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   

                   artist_ids  track_number  disc_number  explicit  \
0  ['2d0hyoQ5ynDBnkvAbJKORj']             1            1     False   
1  ['2d0hyoQ5ynDBnkvAbJKORj'] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first_1000['cluster'] = dbscan.labels_


In [8]:
features = ['tempo', 'danceability']
X = df_first_1000[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

best_score = -1
best_params = {'eps': None, 'min_samples': None}

for eps in np.arange(0.1, 1.1, 0.1):  
    for min_samples in range(1, 101):  
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X_scaled)
        
        if len(np.unique(labels)) > 1:  
            score = silhouette_score(X_scaled, labels)
            print(f"eps={eps:.1f}, min_samples={min_samples}, Silhouette Score={score:.3f}")

            if score > best_score:
                best_score = score
                best_params['eps'] = eps
                best_params['min_samples'] = min_samples
        else:
            print(f"eps={eps:.1f}, min_samples={min_samples}, No valid clusters found.")

print("\nBest Parameters:")
print(f"eps={best_params['eps']}, min_samples={best_params['min_samples']}, Best Silhouette Score={best_score:.3f}")

final_dbscan = DBSCAN(eps=best_params['eps'], min_samples=best_params['min_samples'])
df['cluster'] = final_dbscan.fit_predict(X_scaled)

print("\nCluster Distribution:")
print(df['cluster'].value_counts())


eps=0.1, min_samples=1, Silhouette Score=-0.094
eps=0.1, min_samples=2, Silhouette Score=-0.125
eps=0.1, min_samples=3, Silhouette Score=-0.184
eps=0.1, min_samples=4, Silhouette Score=-0.248
eps=0.1, min_samples=5, Silhouette Score=-0.243
eps=0.1, min_samples=6, Silhouette Score=-0.310
eps=0.1, min_samples=7, Silhouette Score=-0.386
eps=0.1, min_samples=8, Silhouette Score=-0.450
eps=0.1, min_samples=9, Silhouette Score=-0.458
eps=0.1, min_samples=10, Silhouette Score=-0.434
eps=0.1, min_samples=11, No valid clusters found.
eps=0.1, min_samples=12, No valid clusters found.
eps=0.1, min_samples=13, No valid clusters found.
eps=0.1, min_samples=14, No valid clusters found.
eps=0.1, min_samples=15, No valid clusters found.
eps=0.1, min_samples=16, No valid clusters found.
eps=0.1, min_samples=17, No valid clusters found.
eps=0.1, min_samples=18, No valid clusters found.
eps=0.1, min_samples=19, No valid clusters found.
eps=0.1, min_samples=20, No valid clusters found.
eps=0.1, min_sample

eps=0.3, min_samples=2, Silhouette Score=-0.104
eps=0.3, min_samples=3, Silhouette Score=0.055
eps=0.3, min_samples=4, Silhouette Score=0.081
eps=0.3, min_samples=5, Silhouette Score=0.077
eps=0.3, min_samples=6, Silhouette Score=0.296
eps=0.3, min_samples=7, Silhouette Score=0.382
eps=0.3, min_samples=8, Silhouette Score=0.382
eps=0.3, min_samples=9, Silhouette Score=0.153
eps=0.3, min_samples=10, Silhouette Score=0.354
eps=0.3, min_samples=11, Silhouette Score=0.153
eps=0.3, min_samples=12, Silhouette Score=0.335
eps=0.3, min_samples=13, Silhouette Score=0.333
eps=0.3, min_samples=14, Silhouette Score=0.329
eps=0.3, min_samples=15, Silhouette Score=0.202
eps=0.3, min_samples=16, Silhouette Score=0.317
eps=0.3, min_samples=17, Silhouette Score=0.316
eps=0.3, min_samples=18, Silhouette Score=0.144
eps=0.3, min_samples=19, Silhouette Score=0.308
eps=0.3, min_samples=20, Silhouette Score=0.303
eps=0.3, min_samples=21, Silhouette Score=0.112
eps=0.3, min_samples=22, Silhouette Score=0.123

eps=0.4, min_samples=77, Silhouette Score=-0.000
eps=0.4, min_samples=78, Silhouette Score=-0.040
eps=0.4, min_samples=79, Silhouette Score=-0.091
eps=0.4, min_samples=80, Silhouette Score=-0.091
eps=0.4, min_samples=81, Silhouette Score=-0.106
eps=0.4, min_samples=82, Silhouette Score=-0.113
eps=0.4, min_samples=83, Silhouette Score=-0.145
eps=0.4, min_samples=84, Silhouette Score=-0.151
eps=0.4, min_samples=85, Silhouette Score=-0.151
eps=0.4, min_samples=86, Silhouette Score=-0.154
eps=0.4, min_samples=87, Silhouette Score=-0.182
eps=0.4, min_samples=88, Silhouette Score=-0.189
eps=0.4, min_samples=89, No valid clusters found.
eps=0.4, min_samples=90, No valid clusters found.
eps=0.4, min_samples=91, No valid clusters found.
eps=0.4, min_samples=92, No valid clusters found.
eps=0.4, min_samples=93, No valid clusters found.
eps=0.4, min_samples=94, No valid clusters found.
eps=0.4, min_samples=95, No valid clusters found.
eps=0.4, min_samples=96, No valid clusters found.
eps=0.4, min

eps=0.6, min_samples=52, Silhouette Score=0.384
eps=0.6, min_samples=53, Silhouette Score=0.384
eps=0.6, min_samples=54, Silhouette Score=0.384
eps=0.6, min_samples=55, Silhouette Score=0.377
eps=0.6, min_samples=56, Silhouette Score=0.376
eps=0.6, min_samples=57, Silhouette Score=0.372
eps=0.6, min_samples=58, Silhouette Score=0.372
eps=0.6, min_samples=59, Silhouette Score=0.371
eps=0.6, min_samples=60, Silhouette Score=0.371
eps=0.6, min_samples=61, Silhouette Score=0.370
eps=0.6, min_samples=62, Silhouette Score=0.371
eps=0.6, min_samples=63, Silhouette Score=0.369
eps=0.6, min_samples=64, Silhouette Score=0.369
eps=0.6, min_samples=65, Silhouette Score=0.368
eps=0.6, min_samples=66, Silhouette Score=0.368
eps=0.6, min_samples=67, Silhouette Score=0.368
eps=0.6, min_samples=68, Silhouette Score=0.366
eps=0.6, min_samples=69, Silhouette Score=0.366
eps=0.6, min_samples=70, Silhouette Score=0.365
eps=0.6, min_samples=71, Silhouette Score=0.363
eps=0.6, min_samples=72, Silhouette Scor

eps=0.8, min_samples=30, Silhouette Score=0.435
eps=0.8, min_samples=31, Silhouette Score=0.435
eps=0.8, min_samples=32, Silhouette Score=0.435
eps=0.8, min_samples=33, Silhouette Score=0.452
eps=0.8, min_samples=34, Silhouette Score=0.452
eps=0.8, min_samples=35, Silhouette Score=0.452
eps=0.8, min_samples=36, Silhouette Score=0.452
eps=0.8, min_samples=37, Silhouette Score=0.452
eps=0.8, min_samples=38, Silhouette Score=0.450
eps=0.8, min_samples=39, Silhouette Score=0.450
eps=0.8, min_samples=40, Silhouette Score=0.450
eps=0.8, min_samples=41, Silhouette Score=0.451
eps=0.8, min_samples=42, Silhouette Score=0.451
eps=0.8, min_samples=43, Silhouette Score=0.452
eps=0.8, min_samples=44, Silhouette Score=0.452
eps=0.8, min_samples=45, Silhouette Score=0.452
eps=0.8, min_samples=46, Silhouette Score=0.452
eps=0.8, min_samples=47, Silhouette Score=0.452
eps=0.8, min_samples=48, Silhouette Score=0.452
eps=0.8, min_samples=49, Silhouette Score=0.452
eps=0.8, min_samples=50, Silhouette Scor

eps=1.0, min_samples=13, No valid clusters found.
eps=1.0, min_samples=14, No valid clusters found.
eps=1.0, min_samples=15, No valid clusters found.
eps=1.0, min_samples=16, No valid clusters found.
eps=1.0, min_samples=17, No valid clusters found.
eps=1.0, min_samples=18, No valid clusters found.
eps=1.0, min_samples=19, No valid clusters found.
eps=1.0, min_samples=20, No valid clusters found.
eps=1.0, min_samples=21, No valid clusters found.
eps=1.0, min_samples=22, No valid clusters found.
eps=1.0, min_samples=23, No valid clusters found.
eps=1.0, min_samples=24, No valid clusters found.
eps=1.0, min_samples=25, No valid clusters found.
eps=1.0, min_samples=26, No valid clusters found.
eps=1.0, min_samples=27, No valid clusters found.
eps=1.0, min_samples=28, No valid clusters found.
eps=1.0, min_samples=29, No valid clusters found.
eps=1.0, min_samples=30, No valid clusters found.
eps=1.0, min_samples=31, No valid clusters found.
eps=1.0, min_samples=32, No valid clusters found.


ValueError: Length of values (1000) does not match length of index (1204025)

In [10]:
df2 = pd.read_csv("tracks.csv")
df2
jsonFile = json.load(open("vetting_playlist.json"))
print(jsonFile.keys())
tracks_data = [item['track'] for item in jsonFile['tracks']['items']]
artists_data = [artist['name'] for item in tracks_data for artist in item['artists']]
ss = dataF
filtered_df = df2[df2['name'].isin(ss['name'])]
fineldata = filtered_df[filtered_df['popularity'].isin(ss["popularity"])]
fineldata
sns.scatterplot(data=fineldata, x="tempo",y= "danceability")
from sklearn.cluster import DBSCAN
X = fineldata[["tempo", "danceability"]]
cluster = DBSCAN(eps=4, min_samples=5).fit(X)
sns.scatterplot(data=fineldata, x="tempo", y="danceability", hue=cluster.labels_)

dict_keys(['collaborative', 'description', 'external_urls', 'followers', 'href', 'id', 'images', 'name', 'owner', 'primary_color', 'public', 'snapshot_id', 'tracks', 'type', 'uri'])


NameError: name 'dataF' is not defined