Реалізувати алгоритм k-means. На вхід застосунку передається CSV-файл з даними. Результати кластеризації записуються у файл.

In [5]:
import pandas as pd
import math

# Load the salary table and discard every row that contains a missing value.
df = pd.read_csv('./data/salaries_by_college_major.csv').dropna()

In [6]:
# Number of clusters to form.
k = 3

# Salary columns used as the clustering features.
feature_cols = [
    'Starting Median Salary',
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
]

# Nested plain list: one inner list per DataFrame row, one value per
# feature column, in the order given by ``feature_cols``.
data_points = df.loc[:, feature_cols].values.tolist()

In [7]:
def euclidean_dist(p1, p2):
    """Return the Euclidean distance between two points.

    Both points are indexable sequences of numbers. The dimensionality is
    taken from ``p1``; ``p2`` is read at the same indices.
    """
    total = 0
    for idx, coord in enumerate(p1):
        delta = coord - p2[idx]
        total += delta ** 2
    return math.sqrt(total)

def calculate_centroid(cluster_points):
    """Return the coordinate-wise mean of the points in a cluster.

    The dimensionality is taken from the first point. An empty cluster
    yields an empty list.
    """
    if not cluster_points:
        return []

    count = len(cluster_points)
    dims = len(cluster_points[0])

    # Accumulate per-axis sums first, then divide once per axis.
    totals = [0] * dims
    for row in cluster_points:
        for axis in range(dims):
            totals[axis] = totals[axis] + row[axis]

    return [axis_total / count for axis_total in totals]

def k_means(data, k, max_iterations=100, tol=1e-4):
    """Cluster ``data`` into ``k`` groups with the standard k-means iteration.

    The first ``k`` points of ``data`` serve as the initial centroids, so the
    result is deterministic for a given input order. Each round assigns
    every point to its nearest centroid (ties go to the lowest cluster index)
    and then moves each centroid to the mean of its members. A cluster that
    ends up with no members keeps its previous centroid.

    Args:
        data: Sequence of points; each point is an indexable sequence of
            numbers, and all points are expected to share one dimensionality.
        k: Number of clusters; must satisfy ``1 <= k <= len(data)``.
        max_iterations: Upper bound on assignment/update rounds; at least 1.
        tol: Iteration stops once no centroid moves farther than this
            Euclidean distance in a round.

    Returns:
        A list holding one cluster index (``0 .. k-1``) per point, in the
        order of ``data``, taken from the last assignment step that ran.
        Empty ``data`` yields an empty list.

    Raises:
        ValueError: If ``k`` is outside ``1 .. len(data)`` or
            ``max_iterations`` is less than 1.
    """
    n_points = len(data)
    if n_points == 0:
        # Nothing to label; avoids indexing into an empty centroid list.
        return []
    if not 1 <= k <= n_points:
        raise ValueError(
            f"k must be between 1 and the number of points ({n_points}), got {k}"
        )
    if max_iterations < 1:
        # Without at least one round there would be no labels to return.
        raise ValueError(f"max_iterations must be at least 1, got {max_iterations}")

    centroids = data[:k]
    labels = []

    for _ in range(max_iterations):
        # Assignment step: bucket every point under its nearest centroid.
        clusters = [[] for _ in range(k)]
        labels = []
        for point in data:
            distances = [euclidean_dist(point, centroid) for centroid in centroids]
            closest_idx = distances.index(min(distances))
            clusters[closest_idx].append(point)
            labels.append(closest_idx)

        # Update step: an empty cluster keeps its old centroid so that the
        # centroid list always has exactly k entries.
        new_centroids = [
            calculate_centroid(members) if members else centroids[i]
            for i, members in enumerate(clusters)
        ]

        is_converged = not any(
            euclidean_dist(old, new) > tol
            for old, new in zip(centroids, new_centroids)
        )

        centroids = new_centroids
        if is_converged:
            break

    return labels

In [8]:
# Cluster the feature rows and attach each row's cluster index as a new column.
cluster_labels = k_means(data=data_points, k=k)
df['Cluster'] = cluster_labels

# Write the labelled table to disk without the DataFrame index column.
df.to_csv(path_or_buf="salaries_clustered.csv", index=False)