## Importation ##

In [47]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
from sklearn import manifold
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
import joblib

## Data loading ##

In [48]:
# Read the raw dataset, shuffle the rows reproducibly (seed 42) and keep the
# first 20000 rows of the shuffled frame as the working sample.
data = shuffle(pd.read_csv("data/data.csv"), random_state=42).head(20000)

## Preprocessing ##

In [49]:
# Columns selected from the raw dataset (metadata plus audio features).
columns = ['valence', 'year', 'acousticness', 'artists', 'danceability', 'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date', 'speechiness', 'tempo']

df = pd.DataFrame(data, columns=columns)

# Alternative, narrower feature set kept for reference: dropping these leaves
# only acousticness, danceability, energy and instrumentalness.
#drop_columns = ['valence', 'year', 'artists', 'duration_ms', 'explicit', 'id', 'key', 'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date', 'speechiness', 'tempo']

# Columns removed before clustering.
drop_columns = ['artists', 'id', 'name', 'release_date','year']

# Feature columns that are min-max scaled. 'year' is intentionally not scaled
# any more: it is listed in drop_columns, so scaling it was wasted work.
scaled_columns = ['duration_ms', 'key', 'loudness', 'popularity', 'tempo']

# Fit ONE scaler over all scaled columns in a single call. MinMaxScaler works
# per feature, so the resulting values are the same as scaling each column
# separately, but the fitted scaler now remembers the range of every column
# (previously each fit_transform call overwrote the last, leaving a scaler
# that only knew about 'tempo'). This makes it possible to apply the same
# scaling to a new data point with scaler.transform(...).
scaler = MinMaxScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# Keep the track ids (in row order) before the 'id' column is dropped, so
# cluster labels can be mapped back to tracks later.
ids = df['id'].tolist()
df = df.drop(columns=drop_columns, errors='ignore')




print(df)

        valence  acousticness  danceability  duration_ms  energy  explicit  \
116368    0.910        0.5000         0.894     0.075566  0.5210         0   
161935    0.583        0.9300         0.585     0.049941  0.2720         0   
135703    0.647        0.1570         0.878     0.049301  0.5200         1   
112288    0.196        0.9370         0.463     0.259681  0.2870         0   
22        0.422        0.9950         0.648     0.042056  0.0995         0   
...         ...           ...           ...          ...     ...       ...   
9320      0.690        0.8070         0.691     0.040307  0.4300         0   
49182     0.284        0.8770         0.730     0.091832  0.2850         0   
151787    0.691        0.0459         0.555     0.055694  0.6690         0   
126399    0.538        0.9710         0.500     0.054804  0.3230         0   
5448      0.941        0.2640         0.601     0.039104  0.6580         0   

        instrumentalness       key  liveness  loudness  mode  p

## Variables ##

In [50]:
# Shared settings and accumulators for the cluster-count search.
random_state = 42            # seed passed to every KMeans fit
K = [*range(2, 40)]          # candidate numbers of clusters: 2..39
inertia, silhouette_scores = [], []   # one entry per evaluated k
X = df                       # feature matrix (same object as df, not a copy)

## Determine Optimal K ##

In [51]:
print("Determine Optimal K")

# Reset the histories here so that re-running this cell on its own does not
# append to results left over from a previous run; the stopping rule below
# depends on how many entries the lists hold.
inertia = []
silhouette_scores = []
evaluated_k = []
optimal_k = 0

# A step counts as "flat" when inertia changes by less than this fraction of
# the newer inertia value.
PLATEAU_FRACTION = 0.05

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X, kmeans.labels_))
    evaluated_k.append(k)
    print(f"value of k: {k}")

    # Stop at the first k where the last two steps are both flat (or the
    # single available step, when only two values of k have been tried).
    recent = inertia[-3:]
    steps = list(zip(recent, recent[1:]))
    if steps and all(abs(prev - curr) < curr * PLATEAU_FRACTION for prev, curr in steps):
        optimal_k = k
        break
else:
    # The candidates ran out without the inertia curve flattening. Previously
    # optimal_k stayed 0 here and KMeans(n_clusters=0) failed further down;
    # fall back to the candidate with the best silhouette score instead.
    if not evaluated_k:
        raise ValueError("K contains no candidate number of clusters")
    optimal_k = evaluated_k[silhouette_scores.index(max(silhouette_scores))]
    print(f"No inertia plateau found; using best silhouette score instead")

print(f"optimal_k: {optimal_k}")


# The plots use evaluated_k rather than K for the x-axis: after an early break
# the histories are shorter than K and plotting against K would fail.
# # #Plot the Elbow Method
# plt.figure(figsize=(7, 4))
# plt.plot(evaluated_k, inertia, 'bo-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Inertia')
# plt.title('Elbow Method For Optimal k')
# plt.show()
# #
# # #Plot the Silhouette Method
# plt.figure(figsize=(7, 4))
# plt.plot(evaluated_k, silhouette_scores, 'bo-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Silhouette Score')
# plt.title('Silhouette Method For Optimal k')
# plt.show()

# Determine the optimal number of clusters

print(f"The optimal number of clusters is {optimal_k}")


# Fit the final model with the selected number of clusters and persist it.
kmean = KMeans(n_clusters=optimal_k, random_state=random_state)
kmean.fit(X)

joblib.dump(kmean, 'kmean_model.pkl')

# Cluster label for every row of X, in the same order as `ids`.
clusters = kmean.predict(X)

# One "<id> <cluster>" line per track.
with open('ids.txt', 'w') as file:
    for track_id, cluster_label in zip(ids, clusters):
        file.write(f"{track_id} {cluster_label}\n")

# #Plot the data points with their cluster
# plt.figure(figsize=(10, 7))
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=clusters, cmap='viridis', marker='o', edgecolor='k', s=50)
# plt.title('Data points and their clusters')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.show()
#
# #%matplotlib notebook
# fig = plt.figure(figsize=(10, 7))
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(X.iloc[:, 0], X.iloc[:, 1], X.iloc[:, 2], c=clusters, cmap='viridis', marker='o', edgecolor='k', s=50)
# ax.set_title('Data points and their clusters')
# ax.set_xlabel('Feature 1')
# ax.set_ylabel('Feature 2')
# ax.set_zlabel('Feature 3')
# plt.show()


Determine Optimal K
value of k: 2
value of k: 3
value of k: 4
value of k: 5
value of k: 6
value of k: 7
value of k: 8
value of k: 9
value of k: 10
value of k: 11
value of k: 12
optimal_k: 12
The optimal number of clusters is 12


## My value ##

In [52]:
# Sample data point to classify. The values are matched positionally to
# df.columns when the point is turned into a DataFrame for prediction.
# Per-position feature names below are presumed from the selected columns
# minus the dropped ones — confirm against df.columns.
# NOTE(review): 'explicit' is 0/1 in the printed training frame, so 0.74 in
# that position looks unusual — confirm the intended order of the values.
my_value = [
    0.85,  # valence
    0.51,  # acousticness
    0.9,   # danceability
    0.14,  # duration_ms
    0,     # energy
    0.74,  # explicit
    0.56,  # instrumentalness
    0.1,   # key
    0,     # liveness
    0.3,   # loudness
    0.75,  # mode
    0,     # popularity
    0.8,   # speechiness
    0.1,   # tempo
]

## Prediction ##

In [53]:
elements_in_cluster = []


def get_cluster_info(data_point, elements_in_cluster):
    """Predict the cluster of ``data_point`` and collect the ids in that cluster.

    Relies on the notebook globals ``df`` (for the column names), ``kmean``
    (the fitted model), ``clusters`` (label per training row) and ``ids``
    (id per training row, indexed in the same order as ``clusters``).

    Args:
        data_point: Sequence of feature values, one per column of ``df``.
        elements_in_cluster: List that is extended in place with the id of
            every training row assigned to the predicted cluster.

    Returns:
        The cluster label predicted for ``data_point``.
    """
    # Wrap the point in a one-row frame carrying the same column names as the
    # frame the model was fitted on.
    row = pd.DataFrame([data_point], columns=df.columns)
    data_point_cluster = kmean.predict(row)[0]

    # Walk the training labels and keep every row that landed in that cluster.
    for index, label in enumerate(clusters):
        if label != data_point_cluster:
            continue
        track_id = ids[index]
        print(f"index: {index} et id: {track_id}")
        elements_in_cluster.append(track_id)

    return data_point_cluster


# Classify the sample point and report which tracks share its cluster.
data_point = my_value
cluster = get_cluster_info(data_point, elements_in_cluster)
print(f"Elements in cluster {cluster}:\n{elements_in_cluster}")

index: 2 et id: 1X07ZfF5KQt8dyJ5kAkVAQ
index: 5 et id: 16aNJYinJv9AAetWD5yJen
index: 9 et id: 0XGcXc6VkB5dx6RNWxV0rF
index: 15 et id: 6i4Gu9ogL4kClgBzj3NZ6g
index: 48 et id: 1m8cwYJxSWNGxscZQOzQ7F
index: 78 et id: 3t4dLWhNwGWIQsL01fdo6x
index: 110 et id: 2QgfDF0fQ4sskDthP8MG5w
index: 126 et id: 67XyC1GdSkKb9sbRRkUDXS
index: 196 et id: 0E8R8Ro8cRouDHCJFbsbP1
index: 199 et id: 04rVHE9pBDS0Cg1svLn3qX
index: 219 et id: 5rjHtZLcvsvBBcgwHsGYzG
index: 239 et id: 71mDjdeVK9sqXmamMHaAfX
index: 265 et id: 2LyYvjkKkchY8JXKu6m5Ri
index: 267 et id: 4gfwOcBc4H5cl6wVBGu7h2
index: 300 et id: 4rpfA2rFOC7FMGxFNktB0i
index: 321 et id: 0bhnUFDQFCEHCBlsQfHnj5
index: 325 et id: 67HRSCGK3ZQIxb3Spr1cHQ
index: 352 et id: 5VuxWXbt7XENQCtE9TzpTv
index: 379 et id: 2lRe5wBRm4xaSKTbDn2vLD
index: 387 et id: 6br4Pc0BzU6dTD5fEmks2i
index: 397 et id: 5DI9jxTHrEiFAhStG7VA8E
index: 403 et id: 561jH07mF1jHuk7KlaeF0s
index: 405 et id: 4faiJXyBflUVVOOE9fxbeg
index: 406 et id: 0tP7W6M6JHtnhX92Mk9RAS
index: 413 et id: 1JJW0dK