In [36]:
# Import required libraries
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score
import holoviews as hv
from holoviews import dim, opts

## Uploading Dataframe

In [37]:
# Load the data
df_spotify = pd.read_csv("../Spotify/Spotify_data.csv")

# Shift index from 0-based to 1-based
df_spotify.index = df_spotify.index + 1
df_spotify.index.name = "spotify_user"
df_spotify.head()

Unnamed: 0_level_0,Age,Gender,spotify_usage_in_years,spotify_listening_device,spotify_subscription_plan,premium_sub_willingness,preffered_premium_plan,preferred_content,fav_genre,music_time,music_Influencial_mood,music_lis_frequency,music_expl_method,music_recc_rating,pod_lis_frequency,fav_pod_genre,preffered_pod_format,pod_host_preference,preffered_pod_duration,pod_variety_satisfaction
spotify_user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,20-35,Female,2+,Smart speakers or voice assistants,Free,Yes,Family Plan 179/month,Podcast,Melody,Night,Sadness or Melancholy,Leisure Time,Playlists,3,Daily,Comedy,Interview,Both,Both,Ok
2,12-20,Male,2+,Computer or laptop,Free,Yes,Individual Plan 119/month,Podcast,Rap,Afternoon,Social Gatherings or Parties,Workout Session,Playlists,2,Several times a week,Comedy,Interview,Both,,Satisfied
3,35-60,Other,0.5-1,Smart speakers or voice assistants,Free,Yes,Student Plan 59/month,Podcast,Pop,Night,Relaxation and Stress Relief,"Study Hours, While Traveling",Playlists,4,Once a week,Sports,Interview,,Both,Satisfied
4,20-35,Female,1+,"Smartphone, Smart speakers or voice assistants",Free,No,,Music,Melody,Night,"Relaxation and Stress Relief, Social Gathering...","Office Hours, Workout Session, Leisure Time","Recommendations, Playlists",4,Never,,,,,Ok
5,20-35,Female,1+,Smartphone,Free,No,,Music,Melody,Night,Relaxation and Stress Relief,Leisure Time,"Recommendations, Playlists",4,Rarely,Lifestyle and Health,Story telling,Well known individuals,Both,Ok


## Data Processing

In [38]:
# Data Preprocessing
# Create a binary column for subscription type (Free = 0, Premium = 1)
df_spotify['is_premium'] = df_spotify['spotify_subscription_plan'].apply(
    lambda x: 0 if x == 'Free' else 1
)

In [39]:
# Select relevant features for clustering
features = [
    'Age', 'Gender', 'spotify_usage_in_years', 'spotify_listening_device',
    'premium_sub_willingness', 'preffered_premium_plan', 'preferred_content',
    'fav_genre', 'music_time', 'music_Influencial_mood', 'music_lis_frequency',
    'music_expl_method', 'music_recc_rating', 'pod_lis_frequency', 'fav_pod_genre',
    'preffered_pod_format', 'pod_host_preference', 'preffered_pod_duration',
    'pod_variety_satisfaction'
]

In [40]:
# Create a copy for clustering
df_cluster = df_spotify[features].copy()

# Convert categorical variables to numerical using Label Encoding
label_encoders = {}
for column in df_cluster.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_cluster[column] = le.fit_transform(df_cluster[column].astype(str))
    label_encoders[column] = le

In [41]:
# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_cluster)

# Determine optimal number of clusters using Elbow Method and Silhouette Score
inertia = []
silhouette_scores = []
k_values = range(2, 8)

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)
    
    if k > 1:  # Silhouette score requires at least 2 clusters
        silhouette_scores.append(silhouette_score(scaled_data, kmeans.labels_))

In [42]:
# Plot Elbow Curve
elbow_plot = pd.DataFrame({'k': k_values, 'inertia': inertia}).hvplot.line(
    x='k', y='inertia', 
    title='Elbow Method for Optimal k',
    xticks=k_values,
    width=600, height=400
)

In [43]:
# Plot Silhouette Scores
silhouette_plot = pd.DataFrame({'k': k_values[1:], 'score': silhouette_scores}).hvplot.line(
    x='k', y='score',
    title='Silhouette Scores for Optimal k',
    xticks=k_values[1:],
    width=600, height=400
)

ValueError: All arrays must be of the same length

In [45]:
# Display the plots
(elbow_plot + silhouette_plot).cols(1)

NameError: name 'silhouette_plot' is not defined

In [None]:


# Based on the plots, select the optimal number of clusters
optimal_k = 3  # Adjust this based on your elbow and silhouette plots

# Perform K-means clustering with optimal k
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Add cluster labels to the original dataframe
df_spotify['cluster'] = clusters

# Reduce dimensions for visualization using PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
df_pca = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
df_pca['cluster'] = clusters
df_pca['subscription'] = df_spotify['spotify_subscription_plan']
df_pca['is_premium'] = df_spotify['is_premium']

# Create cluster visualization
cluster_plot = df_pca.hvplot.scatter(
    x='PC1', y='PC2', 
    by='cluster',
    title='Spotify User Clusters (PCA-reduced)',
    hover_cols=['subscription'],
    width=700, height=500,
    size=100, alpha=0.7
)

# Create visualization colored by subscription type
subscription_plot = df_pca.hvplot.scatter(
    x='PC1', y='PC2', 
    by='subscription',
    title='Spotify Users by Subscription Type (PCA-reduced)',
    width=700, height=500,
    size=100, alpha=0.7
)

# Display the plots
(cluster_plot + subscription_plot).cols(1)

# Analyze cluster characteristics by subscription type
cluster_summary = df_spotify.groupby(['cluster', 'spotify_subscription_plan']).size().unstack()
cluster_percentages = cluster_summary.div(cluster_summary.sum(axis=1), axis=0) * 100

# Plot cluster distribution by subscription type
cluster_dist_plot = cluster_percentages.hvplot.bar(
    title='Cluster Distribution by Subscription Type',
    ylabel='Percentage of Cluster',
    stacked=True,
    width=600, height=400,
    rot=45
)

# Compare premium vs free user characteristics within clusters
def compare_features(df, feature):
    return df.groupby(['cluster', 'spotify_subscription_plan'])[feature].value_counts(normalize=True).unstack().unstack()

# Example comparison for music listening frequency
music_freq_comparison = compare_features(df_spotify, 'music_lis_frequency')
music_freq_plot = music_freq_comparison.hvplot.bar(
    title='Music Listening Frequency by Cluster and Subscription',
    ylabel='Proportion',
    stacked=True,
    width=800, height=500,
    rot=45
)

# Example comparison for preferred content
content_comparison = compare_features(df_spotify, 'preferred_content')
content_plot = content_comparison.hvplot.bar(
    title='Preferred Content by Cluster and Subscription',
    ylabel='Proportion',
    stacked=True,
    width=800, height=500,
    rot=45
)

# Display comparison plots
(cluster_dist_plot + music_freq_plot + content_plot).cols(1)

# Statistical comparison of premium vs free users in each cluster
for cluster in range(optimal_k):
    print(f"\nCluster {cluster} Analysis:")
    cluster_data = df_spotify[df_spotify['cluster'] == cluster]
    
    # Compare premium vs free counts
    premium_count = cluster_data['is_premium'].sum()
    free_count = len(cluster_data) - premium_count
    print(f"Premium users: {premium_count} ({premium_count/len(cluster_data)*100:.1f}%)")
    print(f"Free users: {free_count} ({free_count/len(cluster_data)*100:.1f}%)")
    
    # Compare willingness to pay for premium
    if 'premium_sub_willingness' in cluster_data.columns:
        willingness = cluster_data.groupby('is_premium')['premium_sub_willingness'].value_counts(normalize=True)
        print("\nWillingness to pay for premium:")
        print(willingness)
    
    # Compare top features
    print("\nTop features for this cluster:")
    for feature in ['music_Influencial_mood', 'fav_genre', 'music_time']:
        if feature in cluster_data.columns:
            print(f"\n{feature}:")
            print(cluster_data[feature].value_counts().head(3))

ValueError: All arrays must be of the same length