In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans


In [2]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the dataset
# JSON text is UTF-8, and the playlist names include non-ASCII characters
# (emoji, for instance), so the encoding is stated explicitly instead of
# relying on the platform's locale default.
with open('mpd.slice.0-999.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Collect the name of every playlist; these short strings are the
# documents that get vectorized and clustered below
texts = [
    entry['name']
    for entry in data['playlists']
]

In [5]:
# Turn the playlist names into a TF-IDF document-term matrix
vectorizer = TfidfVectorizer(
    stop_words='english',  # drop common English stop words
    min_df=2,              # skip terms found in fewer than 2 names
    max_df=0.5,            # skip terms found in more than half of the names
)
X = vectorizer.fit_transform(texts)

In [6]:
# Cluster the TF-IDF matrix with Mini-Batch K-Means
num_clusters = 5  # how many clusters to form; tune as required
batch_size = 100  # size of each mini-batch; tune as required
kmeans = MiniBatchKMeans(
    random_state=42,  # fixed seed so repeated runs give the same clusters
    batch_size=batch_size,
    n_clusters=num_clusters,
)
# Left as the final expression so the cell displays the fitted estimator
kmeans.fit(X)

MiniBatchKMeans(batch_size=100, n_clusters=5, random_state=42)

In [7]:
# Print the top terms per cluster
print("Top terms per cluster:")
# argsort() is ascending; reversing each row puts the indices of the
# highest-weighted centroid terms first.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    # Headings are 1-based for readability, whereas the ids in kmeans.labels_
    # are 0-based: "Cluster 1" here corresponds to label 0.
    print(f"Cluster {i+1}:")
    # Slicing to :10 shows at most ten terms and is safe for smaller vocabularies
    for ind in order_centroids[i, :10]:
        print(f"   {terms[ind]}")


Top terms per cluster:
Cluster 1:
   songs
   rock
   oldies
   classic
   dance
   lit
   workout
   gym
   running
   favorite
Cluster 2:
   summer
   playlist
   new
   rap
   christmas
   good
   mix
   jams
   feels
   disney
Cluster 3:
   music
   work
   christmas
   studying
   christian
   dance
   wedding
   new
   electronic
   relaxing
Cluster 4:
   country
   old
   mix
   favorites
   summer
   2017
   classic
   day
   deep
   disney
Cluster 5:
   chill
   party
   hits
   slow
   gospel
   throwback
   roadtrip
   mixtape
   vibin
   sleep




In [8]:
# Assign cluster labels to each playlist.
# labels_ holds one 0-based cluster id per row of the matrix passed to fit();
# those rows follow the order of data['playlists'], since texts was built from it.
cluster_labels = kmeans.labels_

In [9]:
# Report the cluster assigned to each playlist, pairing every playlist with
# the label at the same position (the raw 0-based cluster id is printed)
for playlist, label in zip(data['playlists'], cluster_labels):
    print(f"Playlist: {playlist['name']}, Cluster: {label}")

Playlist: Throwbacks, Cluster: 1
Playlist: Awesome Playlist, Cluster: 1
Playlist: korean , Cluster: 4
Playlist: mat, Cluster: 4
Playlist: 90s, Cluster: 4
Playlist: Wedding, Cluster: 1
Playlist: I Put A Spell On You, Cluster: 4
Playlist: 2017, Cluster: 4
Playlist: BOP, Cluster: 4
Playlist: old country , Cluster: 3
Playlist: abby , Cluster: 4
Playlist: VIBE, Cluster: 4
Playlist: relax, Cluster: 4
Playlist: sleep, Cluster: 4
Playlist: 90's , Cluster: 4
Playlist: New Songs, Cluster: 0
Playlist: slow hands, Cluster: 4
Playlist: Mom's playlist, Cluster: 1
Playlist: SARAH, Cluster: 4
Playlist: melancholy, Cluster: 4
Playlist: mixtape, Cluster: 4
Playlist: Sad Songs, Cluster: 0
Playlist: fall '17, Cluster: 1
Playlist: ✔️, Cluster: 4
Playlist: Twenty one pilots, Cluster: 4
Playlist: run it, Cluster: 1
Playlist: Winter 2014, Cluster: 4
Playlist: smooth , Cluster: 4
Playlist: Yeet, Cluster: 4
Playlist: groovy, Cluster: 4
Playlist: Garage Rock, Cluster: 0
Playlist: Running 2.0, Cluster: 0
Playlist