# K-Means-klusteranalys av egenskapspar

I den här notebooken analyserar vi ett dataset med fyra egenskaper. Vi utför K-Means-klusteranalys för varje par av egenskaper och väljer det antal kluster (mellan 2 och 10) som ger högst genomsnittlig silhuettpoäng. För varje par visas sedan klustren och deras centrum i ett spridningsdiagram.


In [None]:
import pandas as pd

# Location of the CSV file holding the samples to cluster.
# NOTE(review): the file name is spelled 'smal_...' — confirm it matches the
# file on disk before "correcting" it.
data_path = 'smal_synthetic_cluster.csv'

# Load the whole file into a DataFrame; later cells select columns from it.
data = pd.read_csv(data_path)

# Preview the first five rows (the default for head()) as the cell output.
data.head(n=5)


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

def optimal_kmeans(data, max_clusters=10):
    """Pick the K-Means cluster count with the highest mean silhouette score.

    Every k from 2 up to ``max_clusters`` is tried, but never more than
    ``len(data) - 1`` clusters, because the silhouette score is only defined
    for 2 <= n_labels <= n_samples - 1.

    Args:
        data: Samples to cluster (rows are samples). Passed unchanged to
            ``KMeans.fit_predict`` and ``silhouette_score``.
        max_clusters: Largest number of clusters to try.

    Returns:
        A tuple ``(optimal_k, silhouette_scores)``. ``silhouette_scores[i]``
        is the score obtained with ``k = i + 2``. On ties the smallest k wins.

    Raises:
        ValueError: If no candidate k exists, i.e. ``max_clusters < 2`` or
            ``data`` has fewer than three samples.
    """
    # Cap the search so small datasets do not crash inside scikit-learn.
    upper_k = min(max_clusters, len(data) - 1)
    k_values = range(2, upper_k + 1)
    if not k_values:
        raise ValueError(
            "optimal_kmeans needs max_clusters >= 2 and at least 3 samples; "
            f"got max_clusters={max_clusters} and {len(data)} samples"
        )

    silhouette_scores = []
    for k in k_values:
        # Fixed seed keeps the chosen k reproducible between runs.
        kmeans = KMeans(n_clusters=k, random_state=42)
        clusters = kmeans.fit_predict(data)
        silhouette_scores.append(silhouette_score(data, clusters))

    # list.index returns the first occurrence, so ties go to the smallest k.
    optimal_k = k_values[silhouette_scores.index(max(silhouette_scores))]
    return optimal_k, silhouette_scores

def plot_clusters(data, title, model=None, xlabel='Feature 1', ylabel='Feature 2'):
    """Scatter-plot 2-D samples coloured by cluster, with the cluster centres.

    Args:
        data: 2-D array; column 0 is drawn on the x axis, column 1 on the y axis.
        title: Title shown above the figure.
        model: Fitted clustering model exposing ``labels_`` and
            ``cluster_centers_``. When omitted, the module-level ``kmeans``
            variable is used so that existing two-argument calls keep working.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
    """
    if model is None:
        # Backward-compatible fallback: earlier versions always read the
        # global. Prefer passing the model explicitly.
        model = kmeans
    plt.figure(figsize=(8, 6))
    plt.scatter(data[:, 0], data[:, 1], c=model.labels_, cmap='viridis')
    centers = model.cluster_centers_
    # Large, semi-transparent red markers make the centres stand out.
    plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

# All six unordered pairs of the four feature columns.
features_pairs = [('Feature1', 'Feature2'), ('Feature1', 'Feature3'), ('Feature1', 'Feature4'),
                  ('Feature2', 'Feature3'), ('Feature2', 'Feature4'), ('Feature3', 'Feature4')]

for pair in features_pairs:
    # Restrict the data to the two columns of this pair, as a NumPy array.
    subset_data = data[list(pair)].values
    optimal_k, silhouette_scores = optimal_kmeans(subset_data)
    # Refit with the chosen k; same seed as the search, so the result matches.
    kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(subset_data)
    plot_title = f"Optimal K = {optimal_k} for {pair[0]} and {pair[1]}"
    # Pass the model and the real column names so each plot is labelled with
    # the features it actually shows.
    plot_clusters(subset_data, plot_title, model=kmeans,
                  xlabel=pair[0], ylabel=pair[1])
