Реалізувати ієрархічний алгоритм категоризації знизу-догори. На вхід додатку передається CSV файл з даними. Результати категоризації записуються у файл.

Порівняти якість категоризації з алгоритмом k-means.

In [4]:
import pandas as pd
import math
import random

# Load the salary table and drop rows with missing values. The index is
# rebuilt afterwards so that row labels are exactly 0..n-1 and therefore
# coincide with row positions in any list derived from the frame.
df = pd.read_csv('./data/salaries_by_college_major.csv')
df = df.dropna().reset_index(drop=True)

In [5]:
# Salary columns used as the clustering features.
feature_cols = [
    'Starting Median Salary',
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
]

# Number of clusters; k_n mirrors k.
k_n = k = 3

In [6]:

# Min-max scale every feature column into [0, 1] so that no single column
# dominates the Euclidean distance.
df_norm = df.copy()
for col in feature_cols:
    col_min = df[col].min()
    col_range = df[col].max() - col_min
    if col_range == 0:
        # A constant column carries no information; dividing by its zero
        # range would produce NaN and poison every distance computation.
        df_norm[col] = 0.0
    else:
        df_norm[col] = (df[col] - col_min) / col_range

# Plain list of rows (one list of feature values per row), in frame order.
data_points = df_norm[feature_cols].values.tolist()


def euclidean_distance(p1, p2):
    """Return the Euclidean distance between points ``p1`` and ``p2``.

    Coordinates are paired with ``zip``, so when the points differ in
    length the surplus coordinates of the longer one are ignored.
    """
    squared_diffs = [(u - v) ** 2 for u, v in zip(p1, p2)]
    return math.sqrt(sum(squared_diffs))

def calculate_centroid(indices, all_data):
    """Return the coordinate-wise mean of the points selected by ``indices``.

    Args:
        indices: Positions into ``all_data`` of the points to average.
        all_data: Sequence of points; the dimensionality is taken from
            the first point.

    Returns:
        List with the mean coordinates, or an empty list when ``indices``
        is empty.
    """
    if not indices:
        return []

    width = len(all_data[0])
    count = len(indices)

    # Accumulate coordinate sums point by point, starting from zeros.
    totals = [0.0] * width
    for position in indices:
        row = all_data[position]
        totals = [running + row[axis] for axis, running in enumerate(totals)]

    return [total / count for total in totals]

def calculate_wcss(clusters, all_data):
    """Return the within-cluster sum of squares for a clustering.

    Args:
        clusters: Dict whose values are lists of positions into
            ``all_data``; the keys are not used.
        all_data: Sequence of points.

    Returns:
        Sum, over every cluster and every member point, of the squared
        Euclidean distance from the point to its cluster centroid.
    """
    total = 0.0
    for member_ids in clusters.values():
        center = calculate_centroid(member_ids, all_data)
        for member in member_ids:
            deviation = euclidean_distance(all_data[member], center)
            total += deviation ** 2
    return total

def hierarchical_clustering(data, k):
    """Agglomerative (bottom-up) clustering using centroid distance.

    Every point starts as its own cluster. The two clusters whose centroids
    are closest (Euclidean distance) are merged repeatedly until only ``k``
    clusters remain. Ties are resolved in favour of the first pair found.

    Args:
        data: Sequence of points, each a sequence of numeric coordinates.
        k: Target number of clusters; must be at least 1.

    Returns:
        Dict mapping a cluster id to the list of positions in ``data`` that
        belong to the cluster. When ``data`` has ``k`` or fewer points each
        point is returned as its own cluster.

    Raises:
        ValueError: If ``k`` is less than 1, or if no pair of clusters has
            a finite centroid distance (e.g. the data contains NaN).
    """
    if k < 1:
        # Without this check the merge loop would run down to one cluster,
        # find no pair to merge and fail with an obscure KeyError.
        raise ValueError(f"k must be at least 1, got {k}")

    current_clusters = {i: [i] for i in range(len(data))}
    cluster_centroids = {i: data[i] for i in range(len(data))}
    # Ids 0..n-1 are taken by the initial singletons; merged clusters get
    # fresh increasing ids after them.
    next_id = len(data)

    while len(current_clusters) > k:
        min_dist = float('inf')
        pair_to_merge = None

        cluster_ids = list(current_clusters)
        for pos, id1 in enumerate(cluster_ids):
            c1 = cluster_centroids[id1]
            for id2 in cluster_ids[pos + 1:]:
                dist = euclidean_distance(c1, cluster_centroids[id2])
                if dist < min_dist:
                    min_dist = dist
                    pair_to_merge = (id1, id2)

        if pair_to_merge is None:
            raise ValueError(
                "no pair of clusters has a finite centroid distance"
            )

        id1, id2 = pair_to_merge
        new_indices = current_clusters.pop(id1) + current_clusters.pop(id2)
        del cluster_centroids[id1]
        del cluster_centroids[id2]

        current_clusters[next_id] = new_indices
        cluster_centroids[next_id] = calculate_centroid(new_indices, data)
        next_id += 1

    return current_clusters

def k_means_clustering(data, k, max_iters=100):
    """Cluster ``data`` into ``k`` groups with the k-means algorithm.

    Initial centroids are ``k`` distinct points sampled with a fixed seed
    (42), so repeated calls on the same input give the same result. A
    private generator is used so the process-wide ``random`` state is left
    untouched.

    Args:
        data: Sequence of points, each a sequence of numeric coordinates.
        k: Number of clusters; must be between 1 and ``len(data)``.
        max_iters: Maximum number of assign/update iterations.

    Returns:
        Dict mapping a cluster index in ``0..k-1`` to the list of positions
        in ``data`` assigned to it (an empty dict when ``max_iters`` is 0).

    Raises:
        ValueError: If ``k`` is less than 1 or greater than ``len(data)``.
    """
    if k < 1:
        # With no centroids every point would fall through to the -1
        # sentinel and fail with an obscure KeyError.
        raise ValueError(f"k must be at least 1, got {k}")

    # Same sequence as random.seed(42) + random.sample, minus the global
    # reseeding side effect. Raises ValueError when k > len(data).
    rng = random.Random(42)
    initial_indices = rng.sample(range(len(data)), k)
    centroids = [data[i] for i in initial_indices]
    cluster_assignments = {}

    for _ in range(max_iters):
        # Assignment step: attach every point to its nearest centroid
        # (the first one wins on ties).
        new_assignments = {i: [] for i in range(k)}
        for data_idx, point in enumerate(data):
            min_dist = float('inf')
            closest = -1
            for c_idx, centroid in enumerate(centroids):
                dist = euclidean_distance(point, centroid)
                if dist < min_dist:
                    min_dist = dist
                    closest = c_idx
            new_assignments[closest].append(data_idx)

        # Update step: recompute centroids and track the largest movement.
        new_centroids = []
        max_shift = 0.0
        for c_idx in range(k):
            indices = new_assignments[c_idx]
            if indices:
                new_c = calculate_centroid(indices, data)
                new_centroids.append(new_c)
                max_shift = max(
                    max_shift, euclidean_distance(centroids[c_idx], new_c)
                )
            else:
                # An empty cluster keeps its previous centroid.
                new_centroids.append(centroids[c_idx])

        centroids = new_centroids
        cluster_assignments = new_assignments
        # Converged once no centroid moved by a noticeable amount.
        if max_shift < 1e-6:
            break

    return cluster_assignments

# Run both algorithms on the normalized feature rows.
h_clusters = hierarchical_clustering(data_points, k_n)
km_clusters = k_means_clustering(data_points, k_n)

# The clusterings refer to rows by position in data_points, so the label
# columns are built positionally and assigned as whole columns. Writing
# through df.at would treat positions as index labels, which is wrong
# whenever the frame's index is not exactly 0..n-1.
h_labels = [-1] * len(data_points)
for label, indices in enumerate(h_clusters.values()):
    # Hierarchical cluster ids are arbitrary; renumber them 0, 1, 2, ...
    for pos in indices:
        h_labels[pos] = label

km_labels = [-1] * len(data_points)
for label, indices in km_clusters.items():
    for pos in indices:
        km_labels[pos] = label

df['Cluster_Hierarchical'] = h_labels
df['Cluster_KMeans'] = km_labels

In [7]:
# Persist the labelled table (the index is not written).
df.to_csv('categorization_results.csv', index=False)

# Report clustering quality of each algorithm as within-cluster sum of
# squares over the normalized features.
hierarchical_wcss = calculate_wcss(h_clusters, data_points)
print(f"Hierarchical WCSS: {hierarchical_wcss}")
kmeans_wcss = calculate_wcss(km_clusters, data_points)
print(f"K-Means WCSS: {kmeans_wcss}")

Hierarchical WCSS: 4.471136443435773
K-Means WCSS: 3.080148612900569
