# DATA SCIENCE PROJECT - Unsupervised Machine Learning
## Antoine GOULARD - Reda FALAKI - Capucine FOUCHER

### Importing Libraries

In [None]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage

### Step 1 - Load the Data

In [None]:
# Importing the dataset
# The table is read once and kept as a pandas DataFrame; the cells below pass
# it directly to KMeans and to scipy's linkage.
# NOTE(review): presumably every column is numeric already ("cleaned") - confirm.
DATA_PATH = "cleaned_data.csv"
X = pd.read_csv(DATA_PATH)

### Step 2 - Instantiate the chosen models (K-means and hierarchical clustering)

In [None]:
# Instantiate the KMeans model with the desired number of clusters.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning in 1.2-1.3 when it is left
# unset), so an implicit value makes the clustering depend on the installed
# version. random_state fixes the centroid initialisation so reruns match.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

In [None]:
# Perform hierarchical clustering
# Build the agglomerative merge history over the rows of X. The result is
# consumed later by dendrogram() for plotting.
linkage_matrix = linkage(
    X,
    method="ward",
    metric="euclidean",
)

### Step 3 - Fit the model to the data

In [None]:
# Fit the K-means model on the full feature table X.
# After this call the fitted estimator exposes labels_ and cluster_centers_,
# which the following step reads.
kmeans.fit(X)

### Step 4 - Get the cluster information

In [None]:
# Pull the fitted results off the estimator in one go:
#   cluster_labels  - the cluster label assigned to each data point
#   cluster_centers - the coordinates of the cluster centers
cluster_labels, cluster_centers = kmeans.labels_, kmeans.cluster_centers_

### Step 5 - Plot the clusters

In [None]:
# Visualize the clusters
# X is a pandas DataFrame (it comes from pd.read_csv), and DataFrame[...] does
# not support NumPy-style positional indexing such as X[:, 0] - it raises
# instead of selecting a column. Take a plain array view once so the first two
# feature columns can be addressed by position.
features = np.asarray(X)
# Only the first two columns are drawn; the clustering itself used every column.
plt.scatter(features[:, 0], features[:, 1], c=cluster_labels)
# Overlay the cluster centers (same two columns) as red crosses.
plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], marker='x', color='red')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('K-means Clustering')
plt.show()

In [None]:
# Plot the dendrogram
# Open a dedicated 10x6 figure, let scipy draw the merge tree into its current
# axes, then label those axes in a single call before displaying.
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix)
plt.gca().set(
    title='Hierarchical Clustering Dendrogram',
    xlabel='Data points',
    ylabel='Distance',
)
plt.show()

### Step 6 - Interpret the results obtained

In [None]:
# Interpret the clusters
# For each cluster print its 1-based number, how many rows of X were assigned
# to it, and the coordinates of its center, followed by a separator line.
for i, center in enumerate(cluster_centers):
    # Boolean mask over the labels keeps only the rows belonging to cluster i.
    cluster_data = X[cluster_labels == i]
    print('Cluster', i + 1)
    print('Number of data points:', cluster_data.shape[0])
    print('Cluster center coordinates:', center)
    print('-------------------------------------')