## Importation ##

In [11]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
import joblib

## Data loading ##

In [12]:
# Load the appended track dataset and shuffle its rows with a fixed seed so
# every run sees the rows in the same order.
data = shuffle(pd.read_csv("data/data_appended.csv"), random_state=42)

## Preprocessing ##

In [13]:
# Columns carried over from the raw data. pandas creates an all-NaN column for
# any name listed here that `data` does not contain.
columns = [
    'valence', 'year', 'acousticness', 'artists', 'danceability',
    'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode', 'name', 'popularity', 'speechiness',
    'tempo', 'genre',
]

df = pd.DataFrame(data, columns=columns)

# Columns removed before clustering. 'release_date' is not part of `columns`;
# errors='ignore' in the drop below tolerates the missing name.
drop_columns = ['artists', 'id', 'name', 'release_date', 'year', 'genre']

# Features rescaled with MinMaxScaler (default feature_range of 0..1).
# 'year' is intentionally absent: it is dropped below, so scaling it was dead work.
scaled_columns = ['duration_ms', 'key', 'loudness', 'popularity', 'tempo']

# Fit ONE scaler over all scaled features. MinMaxScaler works per column, so the
# values are the same as fitting column by column, but the fitted `scaler` now
# remembers every column's range and can transform new points consistently
# via scaler.transform(new_rows[scaled_columns]).
scaler = MinMaxScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# Capture the ids in row order before the column is dropped, so positional
# cluster labels can be mapped back to tracks later.
ids = df['id'].tolist()
df = df.drop(columns=drop_columns, errors='ignore')

print(df)

        valence  acousticness  danceability  duration_ms  energy  explicit  \
260762   0.2260        0.1290         0.591     0.039399  0.6660         1   
115940   0.3410        0.9370         0.594     0.041753  0.2250         0   
169562   0.4830        0.0497         0.391     0.058361  0.7450         0   
154918   0.0000        0.9960         0.000     0.035672  0.0197         0   
61875    0.5390        0.7060         0.686     0.069536  0.3880         0   
...         ...           ...           ...          ...     ...       ...   
119879   0.1620        0.4940         0.260     0.058681  0.3280         0   
259178   0.3330        0.9850         0.298     0.034751  0.1850         0   
131932   0.6060        0.9770         0.494     0.076763  0.7430         0   
146867   0.0402        0.9740         0.273     0.025543  0.0315         0   
121958   0.4120        0.0853         0.479     0.037332  0.6890         1   

        instrumentalness       key  liveness  loudness  mode  p

## Variables ##

In [14]:
# Shared settings and accumulators for the cluster-count search.
random_state = 42
K = [*range(2, 40)]  # candidate cluster counts: 2 .. 39

# Filled per candidate k by the search cell.
inertia, silhouette_scores = [], []

# Feature matrix handed to KMeans (same object as `df`, not a copy).
X = df
print(X)


        valence  acousticness  danceability  duration_ms  energy  explicit  \
260762   0.2260        0.1290         0.591     0.039399  0.6660         1   
115940   0.3410        0.9370         0.594     0.041753  0.2250         0   
169562   0.4830        0.0497         0.391     0.058361  0.7450         0   
154918   0.0000        0.9960         0.000     0.035672  0.0197         0   
61875    0.5390        0.7060         0.686     0.069536  0.3880         0   
...         ...           ...           ...          ...     ...       ...   
119879   0.1620        0.4940         0.260     0.058681  0.3280         0   
259178   0.3330        0.9850         0.298     0.034751  0.1850         0   
131932   0.6060        0.9770         0.494     0.076763  0.7430         0   
146867   0.0402        0.9740         0.273     0.025543  0.0315         0   
121958   0.4120        0.0853         0.479     0.037332  0.6890         1   

        instrumentalness       key  liveness  loudness  mode  p

## Determine Optimal K ##

In [None]:
import matplotlib.pyplot as plt  # `plt` was used below without ever being imported
from tqdm import tqdm

# silhouette_score computes pairwise distances, which is quadratic in the number
# of rows. Leave at None to score every row (original behaviour), or set an
# integer (e.g. 10_000) to score a random subsample seeded with `random_state`.
silhouette_sample_size = None

# Start from empty lists so re-running this cell never appends to the results
# of a previous run (which would make len(inertia) != len(K) and break the plots).
inertia = []
silhouette_scores = []

print("Determine Optimal K")
for k in tqdm(K, desc="Calculating optimal k"):
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(
        silhouette_score(
            X,
            kmeans.labels_,
            sample_size=silhouette_sample_size,
            random_state=random_state,
        )
    )
    print(f"value of k: {k}")

# Elbow method: inertia for each candidate k.
fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(K, inertia, 'bo-')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method For Optimal k')
plt.show()

# Silhouette method: silhouette score for each candidate k.
fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(K, silhouette_scores, 'bo-')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Method For Optimal k')
plt.show()

# The cluster count is read off these two plots by hand; no automatic
# selection is performed in this cell.




In [16]:
# Cluster count used for the final model (hard-coded).
optimal_k = 13

# Fit the final K-Means model on the full feature matrix; fit() returns the
# estimator itself, so construction and fitting are chained.
kmean = KMeans(n_clusters=optimal_k, random_state=random_state).fit(X)

# Persist the fitted estimator; the file name records the cluster count.
joblib.dump(kmean, f'kmean_model_k_{optimal_k}.pkl')

# Cluster label for every row of X, in row order.
clusters = kmean.predict(X)

# with open('ids.txt', 'w') as file:
#     for i in range(len(ids)):
#         file.write(f"{ids[i]} {clusters[i]}\n")

# #Plot the data points with their cluster
# plt.figure(figsize=(10, 7))
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=clusters, cmap='viridis', marker='o', edgecolor='k', s=50)
# plt.title('Data points and their clusters')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.show()
# 
# #%matplotlib notebook
# fig = plt.figure(figsize=(10, 7))
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(X.iloc[:, 0], X.iloc[:, 1], X.iloc[:, 2], c=clusters, cmap='viridis', marker='o', edgecolor='k', s=50)
# ax.set_title('Data points and their clusters')
# ax.set_xlabel('Feature 1')
# ax.set_ylabel('Feature 2')
# ax.set_zlabel('Feature 3')
# plt.show()


## My value ##

In [16]:
# One value per feature column, in the order the prediction cell assigns them
# (positional mapping onto df.columns).
# NOTE(review): 'explicit' shows up as 0/1 in the printed frame, so 0.74 here may
# mean the values were entered in a different order — confirm.
my_value = [
    0.85,  # valence
    0.51,  # acousticness
    0.9,   # danceability
    0.14,  # duration_ms
    0,     # energy
    0.74,  # explicit
    0.56,  # instrumentalness
    0.1,   # key
    0,     # liveness
    0.3,   # loudness
    0.75,  # mode
    0,     # popularity
    0.8,   # speechiness
    0.1,   # tempo
]

## Prediction ##

In [17]:
def get_cluster_info(data_point, elements_in_cluster):
    """Predict the cluster of ``data_point`` and collect the ids of its members.

    Reads the notebook-level ``df``, ``kmean``, ``clusters`` and ``ids``.

    Args:
        data_point: Feature values, one per column of ``df``, in column order.
        elements_in_cluster: List extended in place with the id of every row
            whose label in ``clusters`` equals the predicted cluster.

    Returns:
        The cluster label predicted by ``kmean`` for ``data_point``.
    """
    # Wrap the point in a one-row frame carrying the training column names.
    point_frame = pd.DataFrame([data_point], columns=df.columns)
    data_point_cluster = kmean.predict(point_frame)[0]

    # Walk the labels positionally; `ids` is indexed by the same position.
    for position, label in enumerate(clusters):
        if label != data_point_cluster:
            continue
        track_id = ids[position]
        print(f"index: {position} et id: {track_id}")
        elements_in_cluster.append(track_id)

    return data_point_cluster


# Fresh accumulator on every run of this cell, then query the model.
elements_in_cluster = []
data_point = my_value
cluster = get_cluster_info(data_point, elements_in_cluster)
print(f"Elements in cluster {cluster}:\n{elements_in_cluster}")

index: 0 et id: 4qheoQdPn1nJ7QLGDGCSdb
index: 23 et id: 046k0tkM8kI1D1UbmSsIsW
index: 24 et id: 3wCSVS7KM9CIIHrChpabQ3
index: 28 et id: 1XTGyfJeMiZXrZ1W3NolcB
index: 33 et id: 3qJo0sD5ssJaeHIKAGVObz
index: 34 et id: 3bJVpDpPWDygulO0moI7bg
index: 38 et id: 6MxGvnJWqdGS0chQypGXhB
index: 79 et id: 41oPBI6uXXTh004VizvPnm
index: 84 et id: 5jbDih9bLGmI8ycUKkN5XA
index: 102 et id: 3xJj1mU7B83yop2dA03Smk
index: 126 et id: 4dk6zfG0ohEToKP7h7yQkO
index: 146 et id: 6s1uObcOTyiLlm22qRf12y
index: 152 et id: 6XaqawNmeeScjVLAv4Fx6l
index: 171 et id: 6ZUBqOrjrdmkP4eLQLBeaq
index: 172 et id: 0JofASa5UhVbHrWQv8N48N
index: 190 et id: 4ScVI88WynvZ1avtN2rScK
index: 208 et id: 4JGed8NOXDTHmcwvGuqDiw
index: 210 et id: 19v6qcpbiccxoX8oxv8LlJ
index: 212 et id: 7rSnKi58HPIVC0FVyh04Q1
index: 226 et id: 2b3gxK2X5nI7c0NSRFFDBm
index: 229 et id: 2vBx2Ji6nVNbNalzpxCDWm
index: 241 et id: 0jx8zY5JQsS4YEQcfkoc5C
index: 253 et id: 14Ai4sTxWbQpNymhpQKuES
index: 289 et id: 2T8bL4YC49Ska1yl74qcS0
index: 292 et id: 6P7YTkNb