## Importation ##

In [70]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
from sklearn import manifold
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
import joblib

## Data loading ##

In [71]:
data = pd.read_csv("data/data.csv")
data = shuffle(data, random_state=42)
data = data.head(5000)

## Preprocessing ##

In [72]:
columns = ['valence', 'year', 'acousticness', 'artists', 'danceability', 'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date', 'speechiness', 'tempo']

df = pd.DataFrame(data, columns=columns)

#drop_columns = ['valence', 'year', 'artists', 'duration_ms', 'explicit', 'id', 'key', 'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date', 'speechiness', 'tempo']

drop_columns = ['artists', 'id', 'name', 'release_date']

scaler = MinMaxScaler()
df['year_scaled'] = scaler.fit_transform(df[['year']]) 
df['duration_ms'] = scaler.fit_transform(df[['duration_ms']])
df['key'] = scaler.fit_transform(df[['key']])
df['loudness'] = scaler.fit_transform(df[['loudness']])
df['popularity'] = scaler.fit_transform(df[['popularity']])
df['tempo'] = scaler.fit_transform(df[['tempo']])

ids = df['id'].tolist()
df = df.drop(columns=['id']+drop_columns, errors='ignore')




print(df)

        valence  year  acousticness  danceability  duration_ms  energy  \
116368    0.910  1977        0.5000         0.894     0.071539  0.5210   
161935    0.583  1976        0.9300         0.585     0.045802  0.2720   
135703    0.647  1994        0.1570         0.878     0.045160  0.5200   
112288    0.196  1957        0.9370         0.463     0.256456  0.2870   
22        0.422  1921        0.9950         0.648     0.037883  0.0995   
...         ...   ...           ...           ...          ...     ...   
121224    0.612  2001        0.1840         0.533     0.046905  0.7030   
166328    0.919  1999        0.0819         0.714     0.050976  0.8520   
121731    0.345  2012        0.8820         0.215     0.041707  0.3190   
59641     0.286  1942        0.9900         0.393     0.047238  0.0747   
110754    0.633  1949        0.6410         0.449     0.019469  0.4510   

        explicit  instrumentalness       key  liveness  loudness  mode  \
116368         0          0.772000  0

## Variables ##

In [73]:
inertia = []
silhouette_scores = []
K = list(range(2, 30))
random_state = 42

#X = shuffle(df, random_state=random_state)
# X = X[0:5000]
# print(X)
X = df

## Determine Optimal K ##

In [74]:
print("Determine Optimal K")
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X, kmeans.labels_))
    print(f"value of k: {k}")

# #Plot the Elbow Method
# plt.figure(figsize=(10, 5))
# plt.plot(K, inertia, 'bo-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Inertia')
# plt.title('Elbow Method For Optimal k')
# plt.show()
# 
# #Plot the Silhouette Method
# plt.figure(figsize=(10, 5))
# plt.plot(K, silhouette_scores, 'bo-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Silhouette Score')
# plt.title('Silhouette Method For Optimal k')
# plt.show()

# Determine the optimal number of clusters
optimal_k = 0
for i in range(len(inertia)):
    if i == len(inertia) - 1 and optimal_k == 0:
        optimal_k = K[i]
    elif i != 0:
        print(f"dif = {abs(inertia[i-1] - inertia[i])}")
        print(f"inertia[{i}]*0.1 = {inertia[i] *0.1}")
        if abs(inertia[i-1] - inertia[i]) < inertia[i]*0.1:
            optimal_k = K[i]
            break

print(f"The optimal number of clusters is {optimal_k}")


kmean = KMeans(n_clusters=optimal_k, random_state=random_state)
kmean.fit(X)

joblib.dump(kmean, 'kmean_model.pkl')

clusters = kmean.predict(X)

with open('ids.txt', 'w') as file:
    for i in range(len(ids)):
        file.write(f"{ids[i]} {clusters[i]}\n")

# Plot the data points with their cluster
# plt.figure(figsize=(10, 7))
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=clusters, cmap='viridis', marker='o', edgecolor='k', s=50)
# plt.title('Data points and their clusters')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.show()

# %matplotlib notebook
# fig = plt.figure(figsize=(10, 7))
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(X.iloc[:, 0], X.iloc[:, 1], X.iloc[:, 2], c=clusters, cmap='viridis', marker='o', edgecolor='k', s=50)
# ax.set_title('Data points and their clusters')
# ax.set_xlabel('Feature 1')
# ax.set_ylabel('Feature 2')
# ax.set_zlabel('Feature 3')
# plt.show()


Determine Optimal K
value of k: 2
value of k: 3
value of k: 4
value of k: 5
value of k: 6
value of k: 7
value of k: 8
value of k: 9
value of k: 10
value of k: 11
value of k: 12
value of k: 13
value of k: 14
value of k: 15
value of k: 16
value of k: 17
value of k: 18
value of k: 19
value of k: 20
value of k: 21
value of k: 22
value of k: 23
value of k: 24
value of k: 25
value of k: 26
value of k: 27
value of k: 28
value of k: 29
dif = 493088.67954788473
inertia[1]*0.1 = 41925.943967740845
dif = 165534.65045790264
inertia[2]*0.1 = 25372.47892195058
dif = 93518.77063569566
inertia[3]*0.1 = 16020.601858381015
dif = 50298.086436833706
inertia[4]*0.1 = 10990.793214697644
dif = 25175.118717286925
inertia[5]*0.1 = 8473.28134296895
dif = 20755.84555165325
inertia[6]*0.1 = 6397.696787803626
dif = 11134.858835336017
inertia[7]*0.1 = 5284.210904270024
dif = 10142.165385625434
inertia[8]*0.1 = 4269.99436570748
dif = 5707.813061608453
inertia[9]*0.1 = 3699.213059546635
dif = 6666.421982184682
inerti

## ma valeur ##

In [75]:
my_value = [0.85, 0.51, 0.10, 0.9, 0.14,0,0.74,0.56,0.1,0,0.3,0.75,0.6,0,0.8, 0.1]

## Preciction ##

In [76]:
elements_in_cluster = []

def get_cluster_info(data_point, elements_in_cluster):
    # Ensure data_point is a DataFrame with the same columns as the original data
    
    data_point_df = pd.DataFrame([data_point], columns=df.columns)    

    # Predict the cluster for the given data point
    data_point_cluster = kmean.predict(data_point_df)[0]
    
    #elements_in_cluster = X[clusters == data_point_cluster]
    for index in range(len(clusters)):
        if clusters[index] == data_point_cluster:
            print(f"index: {index} et id: {ids[index]}")
            elements_in_cluster.append(ids[index])


    return data_point_cluster

data_point = my_value
cluster = get_cluster_info(data_point, elements_in_cluster)
print(f"Elements in cluster {cluster}:\n{elements_in_cluster}")

index: 4 et id: 0SK1upzAP6NvIgF0uGh6z2
index: 72 et id: 10NokhLWqxzi6Qg10vcGQ5
index: 75 et id: 6Sza4f5kWjsfIAxzThgPRa
index: 120 et id: 6wNFn9EiP2oa9fxV6FgARk
index: 268 et id: 4U4z1JcAL4f3E4YYitRyMX
index: 438 et id: 13ZrfHNfVnzIZlXyetXke3
index: 455 et id: 7JqEsyJDSlGjg95DDk1Q71
index: 514 et id: 6tdZSytSJjP1hxkRBD8gVA
index: 605 et id: 5cIES4mu3rdDSNQ7aluQKz
index: 664 et id: 7agPeo1jNiA5OEXGFDzl7I
index: 667 et id: 5waV2tHn9zLn6ot0wzIw9c
index: 701 et id: 5jzNqmHq9b7xxr6GGFpFwm
index: 752 et id: 0m2zzAhHz4DZHXYh6m0h0e
index: 762 et id: 0YFDGOIR9ubDqWCvsUQKY0
index: 790 et id: 5YZpAH2jWtOOvbe1jv936Q
index: 835 et id: 0yblbywxeP28xmOxSrcUYk
index: 844 et id: 1Lbm5IC3QN56LtoyvQMUpH
index: 866 et id: 6XSm7HngoGvLlOViuMQfAW
index: 951 et id: 34rdVyVGI2u6I0tNWc8v0Y
index: 1044 et id: 5xoKOYyS2kCzxb9docXLaY
index: 1075 et id: 75FDlSK7dHKU6TtMGrBFXK
index: 1155 et id: 6q9IP7wbfpocUiOEGvQqCZ
index: 1318 et id: 2paUNK969mjN9A8cXLnHfp
index: 1371 et id: 4ywiiUqjEvx8RufPznNY8K
index: 1397 et 