In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset into a DataFrame. The path is relative, so country_data.csv
# must be in the working directory the notebook is run from.
country_data = pd.read_csv('country_data.csv')


# In[3]:


# Preview the first rows of the dataset to see which columns it contains
country_data.head()


# The dataset consists of 10 variables, with 9 being numeric and 1 categorical. To effectively cluster the data, I will first remove the country column, as it is a categorical variable and does not align with the requirements for clustering. Following this, I will scale the numeric features to optimize the performance of the clustering algorithm. Scaling is crucial as it removes potential biases caused by differing units of measurement across variables, thereby enhancing the consistency and accuracy of the clustering results.

# In[4]:


# Show column names, non-null counts and dtypes for the dataset
country_data.info()


# In[5]:


# Count fully duplicated rows (0 means every row is unique)
country_data.duplicated().sum()


# The dataset consists of 167 non-null observations, with each variable correctly assigned to its appropriate data type. Additionally, there are no duplicate entries present, ensuring the dataset's integrity and consistency for analysis.

# ### Data Preprocessing

# In[6]:


# Remove the 'country' column for clustering as it is a categorical identifier;
# every remaining column of country_data is kept as a clustering feature.
# drop() returns a new frame, so country_data itself is left unchanged here.
X = country_data.drop(columns=['country'])
X.head()


# In[7]:


# Standardize the features so that no column dominates the distance
# computations just because of its unit of measurement.
# The scaler is fitted on X first and then applied to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


# In[8]:


# Preview only the first rows of the scaled matrix instead of dumping the
# whole array into the notebook output.
X_scaled[:5]


# ## Build KMeans Clustering Model

# I will apply KMeans clustering to categorize the observations in the dataset into different groups. KMeans is a widely-used unsupervised learning technique that organizes data points into clusters based on their similarities. The goal is to divide the data into a predetermined number of clusters, with each data point assigned to the cluster whose centroid is closest.
# 
# Each value of k (the number of clusters) is a candidate for how many groups the countries fall into. To assess the quality of each candidate, I will compute the silhouette score of the clustering it produces. The silhouette score is a measure used to evaluate the quality of clusters produced by algorithms such as K-means, hierarchical clustering, or DBSCAN. It quantifies how well each data point fits into its assigned cluster (cohesion) compared to neighboring clusters (separation). Higher silhouette scores imply better clustering.
# 
# 
# The Silhouette Coefficient ranges from -1 to +1. A value close to +1 indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters; a value close to 0 indicates that the object lies on or very close to the decision boundary between two neighboring clusters; and a value close to -1 indicates that the object may have been assigned to the wrong cluster. For a single point $i$ it is calculated as
# 
# $$s(i) = \frac{b(i) - a(i)}{\max(a(i), b(i))}$$
# 
# where:
# - $a(i)$ is the average distance between point $i$ and all other points in the same cluster (intra-cluster distance).
# - $b(i)$ is the average distance between point $i$ and all points in the nearest cluster that it is not a part of (nearest-cluster distance).
# 
# The silhouette score of a whole clustering, which is what is reported below, is the mean of $s(i)$ over all points.
# 
# 
# Once I have computed the silhouette score for each candidate number of clusters, I will store the scores in a list called `silhouette_scores`. Then, I will create a line plot of the silhouette scores against the number of clusters. The number of clusters with the highest silhouette score is taken as the optimal number of country categories.

# ### Define the necessary functions

# In[29]:


def calculate_silhouette_score(X_data, max_clusters):
    """Score KMeans clusterings of ``X_data`` for k = 2 .. ``max_clusters``.

    Parameters
    ----------
    X_data : array-like
        Feature matrix, passed unchanged to ``KMeans.fit_predict`` and to
        ``silhouette_score``.
    max_clusters : int
        Largest number of clusters to try (inclusive).

    Returns
    -------
    list
        One silhouette score per candidate k, in order: position 0 belongs
        to k=2, position 1 to k=3, and so on. The list is empty when
        ``max_clusters`` is smaller than 2.
    """

    def _score_for(k):
        # random_state is pinned to 42 and n_init to 10 for every fit
        model = KMeans(n_clusters=k, random_state=42, n_init=10)
        assignments = model.fit_predict(X_data)  # one label per row
        return silhouette_score(X_data, assignments)

    return [_score_for(k) for k in range(2, max_clusters + 1)]


# In[30]:


def plot_silhouette_score(silhouette_scores):
    """Draw a line plot of silhouette score against the number of clusters.

    Parameters
    ----------
    silhouette_scores : sequence of float
        Scores ordered by cluster count starting at k=2 (position 0 is the
        score for 2 clusters, position 1 for 3 clusters, ...).

    Returns
    -------
    None
        The line is drawn on the current matplotlib figure.
    """
    # Derive the x-axis from the scores themselves instead of a notebook-level
    # ``max_clusters`` variable: the function then works on a fresh kernel and
    # the x and y lengths can never disagree.
    cluster_counts = range(2, len(silhouette_scores) + 2)

    plt.plot(cluster_counts, silhouette_scores)
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score for each Cluster')


# In[31]:


def print_optimal_cluster(silhouette_scores):
    """Print the cluster count whose silhouette score is the highest.

    ``silhouette_scores`` is a list ordered by cluster count starting at
    k=2 (position 0 -> 2 clusters, position 1 -> 3 clusters, ...). When
    several counts tie for the best score, the smallest one is reported.
    Nothing is returned; the result is only written to standard output.
    """
    first_cluster_count = 2  # position 0 of the list corresponds to k=2
    top_score = max(silhouette_scores)
    best_position = silhouette_scores.index(top_score)
    print("Optimal number of clusters:", best_position + first_cluster_count)


# In[32]:


def cluster_dataset(df, X_data, silhouette_scores):
    """Cluster ``X_data`` with the best-scoring k and label a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per row of ``X_data``. It is not modified.
    X_data : array-like
        Feature matrix handed to ``KMeans.fit_predict``.
    silhouette_scores : list of float
        Scores ordered by cluster count starting at k=2 (position 0 is the
        score for 2 clusters). The count with the highest score is used;
        ties resolve to the smallest count.

    Returns
    -------
    tuple
        ``(labelled_df, cluster_labels, kmeans)`` where ``labelled_df`` is a
        copy of ``df`` with an added ``'cluster'`` column, ``cluster_labels``
        is the output of ``fit_predict`` and ``kmeans`` is the fitted model.

    Raises
    ------
    ValueError
        If ``silhouette_scores`` is empty.
    """
    if len(silhouette_scores) == 0:
        raise ValueError("silhouette_scores must contain at least one score")

    # Choose the number of clusters with the highest silhouette score
    optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2

    # Perform K-means clustering with the chosen number of clusters
    kmeans = KMeans(n_clusters=optimal_num_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_data)

    # Label a copy rather than the caller's frame: writing into ``df`` itself
    # would leave a 'cluster' column behind in the source data, where a later
    # run overwrites it and a re-run of the feature-selection step would pick
    # it up as if it were an input feature.
    labelled = df.copy()
    labelled['cluster'] = cluster_labels

    return labelled, cluster_labels, kmeans


# In[110]:


def visualise_clusters(X_data, cluster_labels, kmeans):
    """Scatter-plot the clustered points together with the cluster centres.

    Only columns 0 and 1 of ``X_data`` (and of ``kmeans.cluster_centers_``)
    are drawn, whatever the number of columns used for clustering. Points
    are coloured by ``cluster_labels``; centres are drawn as red crosses.
    A new 10x10 figure is opened for the plot; nothing is returned.
    """
    plt.figure(figsize=(10, 10))

    # Data points, coloured by the cluster they were assigned to
    first_axis, second_axis = X_data[:, 0], X_data[:, 1]
    plt.scatter(first_axis, second_axis, s=20, cmap='Accent_r', c=cluster_labels)

    # Cluster centres, drawn after the points
    centres = kmeans.cluster_centers_
    plt.scatter(centres[:, 0], centres[:, 1], marker='x', c='red', s=200)

    plt.title('K-means Clustering')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')


# In[111]:


def analyse_data_cluster(df):
    """Print the per-cluster mean of every numeric column of ``df``.

    ``df`` must contain a ``'cluster'`` column; rows are grouped by it and
    non-numeric columns are left out of the averages. The table is written
    to standard output under a "Cluster Summary:" heading and nothing is
    returned.
    """
    per_cluster = df.groupby('cluster')
    cluster_summary = per_cluster.mean(numeric_only=True)
    print("Cluster Summary:", cluster_summary, sep="\n")


# ### Cluster Based on Two Features

# In[112]:


# number of leading columns of the scaled matrix to cluster on
num_feature = 2

# largest number of clusters to evaluate (inclusive; the search starts at 2)
max_clusters = 10

# keep only the first num_feature columns of the scaled data
X_data = X_scaled[:, :num_feature]

# compute one silhouette score per candidate number of clusters
sil_score = calculate_silhouette_score(X_data, max_clusters = max_clusters)


# In[113]:


# plot the silhouette scores to see which number of clusters scores best
plot_silhouette_score(silhouette_scores = sil_score)


# In[114]:


# print the number of clusters with the highest silhouette score
print_optimal_cluster(sil_score)


# In[115]:


# fit KMeans with the best-scoring number of clusters; df carries a 'cluster'
# column holding the label of each row
df, cluster_labels, kmeans = cluster_dataset(df = country_data, X_data = X_data, silhouette_scores = sil_score)


# In[116]:


# visualize the clusters using the first two features
visualise_clusters(X_data = X_data, cluster_labels= cluster_labels, kmeans= kmeans)


# In[117]:


# print the mean of each numeric column per cluster to see how the clusters compare
analyse_data_cluster(df)


# ### Cluster based on three features

# In[119]:


# number of leading columns of the scaled matrix to cluster on
num_feature = 3

# largest number of clusters to evaluate (inclusive; the search starts at 2)
max_clusters = 10

# keep only the first num_feature columns of the scaled data
X_data = X_scaled[:, :num_feature]

# compute one silhouette score per candidate number of clusters
sil_score = calculate_silhouette_score(X_data, max_clusters = max_clusters)

# plot the silhouette scores to see which number of clusters scores best
plot_silhouette_score(silhouette_scores = sil_score)

# print the number of clusters with the highest silhouette score
print_optimal_cluster(sil_score)

# fit KMeans with the best-scoring number of clusters; df carries a 'cluster'
# column holding the label of each row
df, cluster_labels, kmeans = cluster_dataset(df = country_data, X_data = X_data, silhouette_scores = sil_score)


# In[120]:


# visualize the clusters; only columns 0 and 1 of X_data are drawn, so this is
# a 2-D view of a clustering that was fitted on three features
visualise_clusters(X_data = X_data, cluster_labels= cluster_labels, kmeans= kmeans)


# In[121]:


# print the mean of each numeric column per cluster to see how the clusters compare
analyse_data_cluster(df)


# ### Cluster based on four features

# In[122]:


# number of leading columns of the scaled matrix to cluster on
num_feature = 4

# largest number of clusters to evaluate (inclusive; the search starts at 2)
max_clusters = 10

# keep only the first num_feature columns of the scaled data
X_data = X_scaled[:, :num_feature]

# compute one silhouette score per candidate number of clusters
sil_score = calculate_silhouette_score(X_data, max_clusters = max_clusters)

# plot the silhouette scores to see which number of clusters scores best
plot_silhouette_score(silhouette_scores = sil_score)

# print the number of clusters with the highest silhouette score
print_optimal_cluster(sil_score)

# fit KMeans with the best-scoring number of clusters; df carries a 'cluster'
# column holding the label of each row
df, cluster_labels, kmeans = cluster_dataset(df = country_data, X_data = X_data, silhouette_scores = sil_score)


# In[123]:


# visualize the clusters; only columns 0 and 1 of X_data are drawn, so this is
# a 2-D view of a clustering that was fitted on four features
visualise_clusters(X_data = X_data, cluster_labels= cluster_labels, kmeans= kmeans)


# In[124]:


# print the mean of each numeric column per cluster to see how the clusters compare
analyse_data_cluster(df)


# ### Cluster by 7 features

# In[128]:


# number of leading columns of the scaled matrix to cluster on
num_feature = 7

# largest number of clusters to evaluate (inclusive; the search starts at 2)
max_clusters = 10

# keep only the first num_feature columns of the scaled data
X_data = X_scaled[:, :num_feature]

# compute one silhouette score per candidate number of clusters
sil_score = calculate_silhouette_score(X_data, max_clusters = max_clusters)

# plot the silhouette scores to see which number of clusters scores best
plot_silhouette_score(silhouette_scores = sil_score)

# print the number of clusters with the highest silhouette score
print_optimal_cluster(sil_score)

# fit KMeans with the best-scoring number of clusters; df carries a 'cluster'
# column holding the label of each row
df, cluster_labels, kmeans = cluster_dataset(df = country_data, X_data = X_data, silhouette_scores = sil_score)


# In[129]:


# visualize the clusters; only columns 0 and 1 of X_data are drawn, so this is
# a 2-D view of a clustering that was fitted on seven features
visualise_clusters(X_data = X_data, cluster_labels= cluster_labels, kmeans= kmeans)


# In[130]:


# print the mean of each numeric column per cluster to see how the clusters compare
analyse_data_cluster(df)


# ### Cluster by all the features

# In[125]:


# cluster on every column of the scaled matrix. The count is read from the
# array itself: a hard-coded 10 only worked because NumPy slicing silently
# clamps to the columns that actually exist.
num_feature = X_scaled.shape[1]

# largest number of clusters to evaluate (inclusive; the search starts at 2)
max_clusters = 10

# keep the first num_feature columns of the scaled data (here: all of them)
X_data = X_scaled[:, :num_feature]

# compute one silhouette score per candidate number of clusters
sil_score = calculate_silhouette_score(X_data, max_clusters = max_clusters)

# plot the silhouette scores to see which number of clusters scores best
plot_silhouette_score(silhouette_scores = sil_score)

# print the number of clusters with the highest silhouette score
print_optimal_cluster(sil_score)

# fit KMeans with the best-scoring number of clusters; df carries a 'cluster'
# column holding the label of each row
df, cluster_labels, kmeans = cluster_dataset(df = country_data, X_data = X_data, silhouette_scores = sil_score)


# In[126]:


# visualize the clusters; only columns 0 and 1 of X_data are drawn, so this is
# a 2-D view of a clustering that was fitted on all features
visualise_clusters(X_data = X_data, cluster_labels= cluster_labels, kmeans= kmeans)


# In[127]:


# print the mean of each numeric column per cluster to see how the clusters compare
analyse_data_cluster(df)


# In[ ]:





# In[ ]: