In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

df = pd.read_csv('spotify-2023.csv')

# 'key' is categorical, so fill its gaps with the most frequent value.
# fit_transform returns a 2-D array of shape (n_rows, 1); flatten it before
# assigning it back to a single column.
imputer = SimpleImputer(strategy='most_frequent')
df['key'] = imputer.fit_transform(df[['key']]).ravel()

# 'streams' is stored as text and contains at least one corrupted entry (a long
# 'BPM110KeyA...' string). errors='coerce' turns anything unparseable into NaN.
df['streams (int)'] = pd.to_numeric(df['streams'], errors='coerce')

# Features used for clustering. The raw text column 'streams' is deliberately
# NOT listed: StandardScaler cannot convert it to float. Its numeric twin
# 'streams (int)' carries the same information.
numeric_cols = ['artist_count', 'released_year', 'released_month', 'released_day', 'in_spotify_playlists',
                'in_spotify_charts', 'in_apple_playlists', 'in_apple_charts', 'in_deezer_charts',
                'in_shazam_charts', 'bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
                'instrumentalness_%', 'liveness_%', 'speechiness_%', 'streams (int)']


def _coerce_numeric(col):
    """Return `col` as a numeric Series; entries that cannot be parsed become NaN."""
    if pd.api.types.is_numeric_dtype(col):
        return col
    # NOTE(review): some count columns may be stored as text with thousands
    # separators (e.g. '1,021') - confirm against the CSV. Stripping commas is
    # harmless when there are none.
    return pd.to_numeric(col.astype(str).str.replace(',', '', regex=False), errors='coerce')


df_numeric = df[numeric_cols].apply(_coerce_numeric)

# Fill remaining NaN with the column median instead of dropping rows: the row
# count must stay equal to len(df) so cluster labels can be written back to df,
# and KMeans does not accept NaN.
df_numeric = df_numeric.fillna(df_numeric.median())

# Standardise every feature to zero mean / unit variance so that large-valued
# columns (e.g. stream counts) do not dominate the distance computation.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

ValueError: could not convert string to float: 'BPM110KeyAModeMajorDanceability53Valence75Energy69Acousticness7Instrumentalness0Liveness17Speechiness3'

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Tunable settings for this cell.
K_VALUES = range(1, 11)  # candidate cluster counts for the elbow plot
N_CLUSTERS = 4           # chosen k; revisit after inspecting the elbow plot
N_INIT = 10              # pinned explicitly: the library default differs between scikit-learn releases
RANDOM_STATE = 42        # fixed seed so centroid initialisation is reproducible

# Elbow method: fit one model per candidate k and record its inertia
# (within-cluster sum of squared distances).
inertia = [
    KMeans(n_clusters=k, n_init=N_INIT, random_state=RANDOM_STATE).fit(df_scaled).inertia_
    for k in K_VALUES
]

# Plot inertia against k; the "elbow" is where adding clusters stops paying off.
fig, ax = plt.subplots()
ax.plot(list(K_VALUES), inertia, marker='o')
ax.set_xticks(list(K_VALUES))
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal K')
plt.show()

# Fit the final model and store each row's cluster label on the DataFrame.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=N_INIT, random_state=RANDOM_STATE)
df['cluster'] = kmeans.fit_predict(df_scaled)

# Visualise the clusters on two of the features (a 2-D slice of the full
# feature space, so some overlap between clusters is expected).
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='danceability_%', y='energy_%', hue='cluster', palette='Set1', ax=ax)
ax.set_title('Clustering Results')
plt.show()


In [None]:
# Tabulate the fitted centroids: one row per cluster, one column per feature.
# The model was fitted on the standardised matrix, so these values are in
# scaled units rather than the original column units.
cluster_centers = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=numeric_cols,
)
print(cluster_centers)

# Count how many rows were assigned to each cluster label.
cluster_distribution = df.loc[:, 'cluster'].value_counts()
print(cluster_distribution)
