In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
import random
import os

In [2]:
# Path of the input CSV; the final cell derives the output filename from it.
dataset = "sample.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer = dataset)

In [3]:
# Column labels of every numeric-dtype column in df
features = df.select_dtypes(include = np.number).columns

In [4]:
# Standardize the numeric columns so that features on larger scales
# do not dominate the model.
scaler = StandardScaler().fit(df[features])
scaled_features = scaler.transform(df[features])

In [5]:
# One-hot encode the artist column as a dense array
# (one indicator column per distinct artist value).
encoder = OneHotEncoder(sparse_output = False)

# Weighting the artist feature: every indicator is multiplied by weight_factor.
weight_factor = 3
encoded_data = encoder.fit_transform(df[['artist']]) * weight_factor

# Changing to DataFrame. The frame reuses df's index because pd.concat(axis = 1)
# aligns rows on index labels; a fresh 0..n-1 index would misalign the rows
# (and introduce NaNs) whenever df's index is not the default one.
encoded_df = pd.DataFrame(
    encoded_data,
    index = df.index,
    columns = encoder.get_feature_names_out(['artist']),
)

# Concatenating the encoded columns next to the original columns
df_encoded = pd.concat([df, encoded_df], axis = 1)

# Updating features to include the artist indicator columns
features = df_encoded.select_dtypes(np.number).columns

In [6]:
# Model Training
k = 10 # high number of clusters for more tailored recommendations

# Cluster on the scaled numeric features together with the weighted artist
# indicators. Fitting on scaled_features alone would leave the artist encoding
# computed in the previous cell unused.
clustering_input = np.hstack([scaled_features, encoded_data])

# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4, so pinning it keeps results stable across versions.
kmeans = KMeans(n_clusters = k, n_init = 10, max_iter = 300, random_state = 0)
y_kmeans = kmeans.fit_predict(clustering_input)



In [7]:
# Store the cluster label predicted for each row as a 'Cluster' column on df
df['Cluster'] = y_kmeans

In [8]:
# Saving the cluster data next to the input file as 'clustered_<name><ext>'.
# The prefix is applied to the basename only, so a dataset path that contains
# a directory (e.g. 'data/sample.csv') yields 'data/clustered_sample.csv'
# instead of the invalid 'clustered_data/sample.csv'.
directory = os.path.dirname(dataset)
file, ext = os.path.splitext(os.path.basename(dataset))
clustered_file = os.path.join(directory, f'clustered_{file}{ext}')
df.to_csv(clustered_file, index = False)