In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from bs4 import BeautifulSoup
import requests
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [2]:
file_paths = [
    'mpd.slice.15000-15999.json',
]

all_data = []

for file_path in file_paths:
    with open(file_path, 'r') as file:
        data = json.load(file)
        for playlist in data['playlists']:
            playlist_name = playlist['name']
            for track in playlist['tracks']:
                all_data.append({
                    "playlist_name": playlist_name,
                    "position_in_playlist": track["pos"],
                    "track_id": track['track_uri'],
                    "track_name": track['track_name'],
                    "artist_name": track['artist_name'],
                    "duration_ms": track['duration_ms'],
                    "album_name": track['album_name']
                })
                # Add and remove for DBSCAN
                if len(all_data) >= 1000:
                    break
            if len(all_data) >= 1000:
                break
        if len(all_data) >= 1000:
            break
df = pd.DataFrame(all_data)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   playlist_name         1000 non-null   object
 1   position_in_playlist  1000 non-null   int64 
 2   track_id              1000 non-null   object
 3   track_name            1000 non-null   object
 4   artist_name           1000 non-null   object
 5   duration_ms           1000 non-null   int64 
 6   album_name            1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [4]:
track_counts = df.groupby(['track_id', 'track_name', 'artist_name']).size().reset_index(name='count')

min_count = track_counts['count'].min()
max_count = track_counts['count'].max()
track_counts['popularity_score'] = (
    ((track_counts['count'] - min_count) / (max_count - min_count) * 9 + 1
).astype(int))

df_with_popularity = df.merge(
    track_counts[['track_id', 'popularity_score']],
    on='track_id',
    how='left'
)

In [26]:
X = df_with_popularity[['duration_ms', 'popularity_score']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

eps = 0.7
min_samples = 100

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(X_scaled)

df_with_popularity['cluster'] = dbscan.labels_

print(df_with_popularity.head())

n_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
n_noise = list(dbscan.labels_).count(-1)

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")

print(df_with_popularity['cluster'].value_counts())

   playlist_name  position_in_playlist                              track_id  \
0  Wedding Music                     0  spotify:track:2rb4cO7RczQFSvpjTJ4C2P   
1  Wedding Music                     1  spotify:track:017nSBNU2XHwMV0NCWZCqg   
2  Wedding Music                     2  spotify:track:0W5TB5VNs0J16suh3r67P1   
3  Wedding Music                     3  spotify:track:1gBnG1MiTNBBVzmuwP7Wii   
4  Wedding Music                     4  spotify:track:0b99xsUKkETGwZGzpX987r   

                   track_name      artist_name  duration_ms  \
0              Always Forever     Phil Wickham       281880   
1              Divine Romance     Phil Wickham       298026   
2  Messiah / You're Beautiful     Phil Wickham       293720   
3         Love Is Not A Fight  Warren Barfield       222986   
4             When I Say I Do     Matthew West       246000   

           album_name  popularity_score  cluster  
0        Phil Wickham                 1        0  
1        Phil Wickham                 

In [39]:
features = ['duration_ms', 'popularity_score']
X = df_with_popularity[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

best_score = -1
best_params = {'eps': None, 'min_samples': None}

for eps in np.arange(0.1, 1.1, 0.1):  
    for min_samples in range(1, 101):  
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X_scaled)
        
        if len(np.unique(labels)) > 1:  
            score = silhouette_score(X_scaled, labels)
            print(f"eps={eps:.1f}, min_samples={min_samples}, Silhouette Score={score:.3f}")

            if score > best_score:
                best_score = score
                best_params['eps'] = eps
                best_params['min_samples'] = min_samples
        else:
            print(f"eps={eps:.1f}, min_samples={min_samples}, No valid clusters found.")

print("\nBest Parameters:")
print(f"eps={best_params['eps']}, min_samples={best_params['min_samples']}, Best Silhouette Score={best_score:.3f}")

final_dbscan = DBSCAN(eps=best_params['eps'], min_samples=best_params['min_samples'])
df['cluster'] = final_dbscan.fit_predict(X_scaled)

print("\nCluster Distribution:")
print(df['cluster'].value_counts())


eps=0.1, min_samples=1, Silhouette Score=0.431
eps=0.1, min_samples=2, Silhouette Score=0.427
eps=0.1, min_samples=3, Silhouette Score=0.399
eps=0.1, min_samples=4, Silhouette Score=0.368
eps=0.1, min_samples=5, Silhouette Score=0.436
eps=0.1, min_samples=6, Silhouette Score=0.513
eps=0.1, min_samples=7, Silhouette Score=0.500
eps=0.1, min_samples=8, Silhouette Score=0.501
eps=0.1, min_samples=9, Silhouette Score=0.498
eps=0.1, min_samples=10, Silhouette Score=0.455
eps=0.1, min_samples=11, Silhouette Score=0.429
eps=0.1, min_samples=12, Silhouette Score=0.429
eps=0.1, min_samples=13, Silhouette Score=0.426
eps=0.1, min_samples=14, Silhouette Score=0.414
eps=0.1, min_samples=15, Silhouette Score=0.449
eps=0.1, min_samples=16, Silhouette Score=0.449
eps=0.1, min_samples=17, Silhouette Score=0.449
eps=0.1, min_samples=18, Silhouette Score=0.362
eps=0.1, min_samples=19, Silhouette Score=0.341
eps=0.1, min_samples=20, Silhouette Score=0.343
eps=0.1, min_samples=21, Silhouette Score=0.605
e

eps=0.2, min_samples=72, Silhouette Score=0.563
eps=0.2, min_samples=73, Silhouette Score=0.561
eps=0.2, min_samples=74, Silhouette Score=0.561
eps=0.2, min_samples=75, Silhouette Score=0.561
eps=0.2, min_samples=76, Silhouette Score=0.561
eps=0.2, min_samples=77, Silhouette Score=0.561
eps=0.2, min_samples=78, Silhouette Score=0.557
eps=0.2, min_samples=79, Silhouette Score=0.557
eps=0.2, min_samples=80, Silhouette Score=0.554
eps=0.2, min_samples=81, Silhouette Score=0.551
eps=0.2, min_samples=82, Silhouette Score=0.551
eps=0.2, min_samples=83, Silhouette Score=0.549
eps=0.2, min_samples=84, Silhouette Score=0.545
eps=0.2, min_samples=85, Silhouette Score=0.545
eps=0.2, min_samples=86, Silhouette Score=0.543
eps=0.2, min_samples=87, Silhouette Score=0.530
eps=0.2, min_samples=88, Silhouette Score=0.530
eps=0.2, min_samples=89, Silhouette Score=0.530
eps=0.2, min_samples=90, Silhouette Score=0.528
eps=0.2, min_samples=91, Silhouette Score=0.521
eps=0.2, min_samples=92, Silhouette Scor

eps=0.4, min_samples=50, Silhouette Score=0.715
eps=0.4, min_samples=51, Silhouette Score=0.715
eps=0.4, min_samples=52, Silhouette Score=0.715
eps=0.4, min_samples=53, Silhouette Score=0.715
eps=0.4, min_samples=54, Silhouette Score=0.715
eps=0.4, min_samples=55, Silhouette Score=0.715
eps=0.4, min_samples=56, Silhouette Score=0.715
eps=0.4, min_samples=57, Silhouette Score=0.715
eps=0.4, min_samples=58, Silhouette Score=0.715
eps=0.4, min_samples=59, Silhouette Score=0.715
eps=0.4, min_samples=60, Silhouette Score=0.710
eps=0.4, min_samples=61, Silhouette Score=0.710
eps=0.4, min_samples=62, Silhouette Score=0.707
eps=0.4, min_samples=63, Silhouette Score=0.705
eps=0.4, min_samples=64, Silhouette Score=0.705
eps=0.4, min_samples=65, Silhouette Score=0.705
eps=0.4, min_samples=66, Silhouette Score=0.705
eps=0.4, min_samples=67, Silhouette Score=0.705
eps=0.4, min_samples=68, Silhouette Score=0.705
eps=0.4, min_samples=69, Silhouette Score=0.705
eps=0.4, min_samples=70, Silhouette Scor

eps=0.6, min_samples=26, Silhouette Score=0.760
eps=0.6, min_samples=27, Silhouette Score=0.760
eps=0.6, min_samples=28, Silhouette Score=0.760
eps=0.6, min_samples=29, Silhouette Score=0.760
eps=0.6, min_samples=30, Silhouette Score=0.760
eps=0.6, min_samples=31, Silhouette Score=0.755
eps=0.6, min_samples=32, Silhouette Score=0.755
eps=0.6, min_samples=33, Silhouette Score=0.755
eps=0.6, min_samples=34, Silhouette Score=0.755
eps=0.6, min_samples=35, Silhouette Score=0.755
eps=0.6, min_samples=36, Silhouette Score=0.755
eps=0.6, min_samples=37, Silhouette Score=0.751
eps=0.6, min_samples=38, Silhouette Score=0.751
eps=0.6, min_samples=39, Silhouette Score=0.764
eps=0.6, min_samples=40, Silhouette Score=0.764
eps=0.6, min_samples=41, Silhouette Score=0.764
eps=0.6, min_samples=42, Silhouette Score=0.761
eps=0.6, min_samples=43, Silhouette Score=0.761
eps=0.6, min_samples=44, Silhouette Score=0.761
eps=0.6, min_samples=45, Silhouette Score=0.761
eps=0.6, min_samples=46, Silhouette Scor

eps=0.7, min_samples=99, Silhouette Score=0.757
eps=0.7, min_samples=100, Silhouette Score=0.757
eps=0.8, min_samples=1, Silhouette Score=0.765
eps=0.8, min_samples=2, Silhouette Score=0.765
eps=0.8, min_samples=3, Silhouette Score=0.774
eps=0.8, min_samples=4, Silhouette Score=0.774
eps=0.8, min_samples=5, Silhouette Score=0.774
eps=0.8, min_samples=6, Silhouette Score=0.774
eps=0.8, min_samples=7, Silhouette Score=0.774
eps=0.8, min_samples=8, Silhouette Score=0.774
eps=0.8, min_samples=9, Silhouette Score=0.774
eps=0.8, min_samples=10, Silhouette Score=0.772
eps=0.8, min_samples=11, Silhouette Score=0.772
eps=0.8, min_samples=12, Silhouette Score=0.771
eps=0.8, min_samples=13, Silhouette Score=0.771
eps=0.8, min_samples=14, Silhouette Score=0.771
eps=0.8, min_samples=15, Silhouette Score=0.771
eps=0.8, min_samples=16, Silhouette Score=0.771
eps=0.8, min_samples=17, Silhouette Score=0.770
eps=0.8, min_samples=18, Silhouette Score=0.770
eps=0.8, min_samples=19, Silhouette Score=0.769


eps=0.9, min_samples=75, Silhouette Score=0.771
eps=0.9, min_samples=76, Silhouette Score=0.771
eps=0.9, min_samples=77, Silhouette Score=0.767
eps=0.9, min_samples=78, Silhouette Score=0.767
eps=0.9, min_samples=79, Silhouette Score=0.767
eps=0.9, min_samples=80, Silhouette Score=0.767
eps=0.9, min_samples=81, Silhouette Score=0.767
eps=0.9, min_samples=82, Silhouette Score=0.767
eps=0.9, min_samples=83, Silhouette Score=0.767
eps=0.9, min_samples=84, Silhouette Score=0.767
eps=0.9, min_samples=85, Silhouette Score=0.767
eps=0.9, min_samples=86, Silhouette Score=0.767
eps=0.9, min_samples=87, Silhouette Score=0.767
eps=0.9, min_samples=88, Silhouette Score=0.767
eps=0.9, min_samples=89, Silhouette Score=0.767
eps=0.9, min_samples=90, Silhouette Score=0.767
eps=0.9, min_samples=91, Silhouette Score=0.767
eps=0.9, min_samples=92, Silhouette Score=0.767
eps=0.9, min_samples=93, Silhouette Score=0.767
eps=0.9, min_samples=94, Silhouette Score=0.767
eps=0.9, min_samples=95, Silhouette Scor

In [40]:
data.keys()

dict_keys(['info', 'playlists'])