IMPORTS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm

FUNCTIONS

In [2]:
def load_and_prepare_data(filepath):
    apples_df = pd.read_csv(filepath)
    apples_df = apples_df.drop(columns=['A_id'])
    apples_df = pd.get_dummies(apples_df)
    return apples_df

def standardize_data(df):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df.values)
    return X_scaled

def plot_elbow(X, max_clusters=20):
    inertia = []
    K = range(1, max_clusters + 1)
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42)
def apply_kmeans(X, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    return kmeans, labels

def plot_clusters(X, labels, kmeans):
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    
    plt.figure(figsize=(10, 7))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title('K-means Clustering Visualization')
    plt.colorbar(label='Cluster')
    plt.show()

def display_cluster_info(df, kmeans, labels):
    centroids = kmeans.cluster_centers_
    centroids_df = pd.DataFrame(centroids, columns=df.columns[:-1])  # Exclude the 'Cluster' column
    cluster_summary = df.groupby('Cluster').mean()
    silhouette_avg = silhouette_score(X_scaled, labels)
    print(f'The silhouette score for the clustering is: {silhouette_avg:.4f}')
    return centroids_df, cluster_summary, silhouette_avg


def plot_silhouette(X, labels, n_clusters):
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(10, 7)
    
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
    
    sample_silhouette_values = silhouette_samples(X, labels)
    
    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
        ith_cluster_silhouette_values.sort()
        
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7)
        
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10
    
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks(np.arange(-0.1, 1.1, 0.2))
    
    plt.show()

PREPROCESS DATA

In [4]:
# Load and prepare data
apples_df = load_and_prepare_data('../datasets/apple_quality.csv')

# Standardize the data
X_scaled = standardize_data(apples_df)

KeyError: "['A_id'] not found in axis"

ELBOW PLOT (check optimal number of K)

In [None]:
# Plot the elbow graph to find the optimal number of clusters
plot_elbow(X_scaled)

APPLYING K MEANS 

In [None]:
k_optimal=2
    
# Apply K-means clustering
kmeans, labels = apply_kmeans(X_scaled, k_optimal)
apples_df['Cluster'] = labels

RESULTS

In [None]:
# Plot the clusters
plot_clusters(X_scaled, labels, kmeans)

# Display cluster info
centroids_df, cluster_summary, silhouette_avg = display_cluster_info(apples_df, kmeans, labels)

# Show the first few rows of the dataframe with the cluster labels
print(apples_df.head())

# Plot silhouette analysis
plot_silhouette(X_scaled, labels, k_optimal)

# Group the original data by the cluster labels and calculate mean values for each cluster
cluster_summary = apples_df.groupby('Cluster').mean()
print(cluster_summary)

In [None]:
cluster_summary

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_scaled)

plt.figure(figsize=(10, 7))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels, cmap='viridis')
plt.xlabel('TSNE Component 1')
plt.ylabel('TSNE Component 2')
plt.title('Kmeans Clustering Visualization with t-SNE')
plt.colorbar(label='Cluster')
plt.show()