In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
features = pd.read_csv('features.csv', index_col=0)
features.drop("model_name", inplace=True, axis=1)
features

Unnamed: 0,model,type,hist_type,n_bins,values
0,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,8,"[0, 0, 0, 0, 0.3373978709670946, 0.31850100387..."
1,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,16,"[0, 0, 0, 0, 0, 0, 0, 0, 0.10627104869753085, ..."
2,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,32,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,64,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,128,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
31975,255e1024-5967-4247-9de9-b595403f6bc5,Cone,model_bounding_sphere_strict_outer_absolute,8,"[0.0002740011500329959, 0.008458476145636298, ..."
31976,255e1024-5967-4247-9de9-b595403f6bc5,Cone,model_bounding_sphere_strict_outer_absolute,16,"[2.7014197890577063e-05, 0.0002469869521424189..."
31977,255e1024-5967-4247-9de9-b595403f6bc5,Cone,model_bounding_sphere_strict_outer_absolute,32,"[0, 2.7014197890577063e-05, 7.745907762502199e..."
31978,255e1024-5967-4247-9de9-b595403f6bc5,Cone,model_bounding_sphere_strict_outer_absolute,64,"[0, 0, 5.237446529805757e-06, 2.17767513607713..."


In [3]:
features["hists"] = features["values"].apply(lambda list_str: list(map(float, list_str[1:-1].split(","))))
features.drop("values", inplace=True, axis=1)
features

Unnamed: 0,model,type,hist_type,n_bins,hists
0,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,8,"[0.0, 0.0, 0.0, 0.0, 0.3373978709670946, 0.318..."
1,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,16,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.106..."
2,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,32,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,64,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere,model_bounding_sphere_concentric_sphere,128,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
31975,255e1024-5967-4247-9de9-b595403f6bc5,Cone,model_bounding_sphere_strict_outer_absolute,8,"[0.0002740011500329959, 0.008458476145636298, ..."
31976,255e1024-5967-4247-9de9-b595403f6bc5,Cone,model_bounding_sphere_strict_outer_absolute,16,"[2.7014197890577063e-05, 0.0002469869521424189..."
31977,255e1024-5967-4247-9de9-b595403f6bc5,Cone,model_bounding_sphere_strict_outer_absolute,32,"[0.0, 2.7014197890577063e-05, 7.74590776250219..."
31978,255e1024-5967-4247-9de9-b595403f6bc5,Cone,model_bounding_sphere_strict_outer_absolute,64,"[0.0, 0.0, 5.237446529805757e-06, 2.1776751360..."


Давайте сгруппируем гистограммы для моделей следующим образом: возьмём по гистограмме определенного размера каждого типа и сконкатенируем их

In [4]:
def concat_hists(df, hist_len: int):
    return df.groupby(["model", "type"])["hists"].apply(lambda lists: [x for l in lists if len(l) == hist_len for x in l]).reset_index()

In [5]:
# 64 показывает себя лучше всего
prepared = concat_hists(features, hist_len=64)
prepared

Unnamed: 0,model,type,hists
0,0000f8f3-1d1f-4edd-8f25-9304e10090b0,Cone,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0017709016653..."
1,00080096-4d79-4d76-93de-ac9104d999b9,Sphere,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,001352ce-6756-4705-b25b-d621ea1e7b71,Cylinder,"[0.0, 0.0, 0.0003640661005972985, 0.0025229862..."
3,0013b5b7-037b-4f26-84b3-dc06c4ae617a,Cube,"[0.0, 0.0, 0.0003400531747549458, 0.0023572393..."
4,00222dd1-5bee-4af5-8f26-6332331fa31d,Cube,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
1594,2789a3b8-fce2-4dee-916c-9ecfcae913f3,Cylinder,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1595,278abcce-26d3-424e-9743-a592a459c49d,Sphere,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1596,278c5c66-8c5d-4884-946c-491d023a92ed,Cone,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1597,2794f6ff-e550-4442-93a2-df7f92b13e74,Torus,"[0.0, 0.00011790809599862798, 0.00626788719410..."


Теперь сделаем кластеризацию

In [6]:
prepared["hists"].map(np.array)

0       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0017709016653...
1       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2       [0.0, 0.0, 0.0003640661005972985, 0.0025229862...
3       [0.0, 0.0, 0.0003400531747549458, 0.0023572393...
4       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                              ...                        
1594    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1595    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1596    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1597    [0.0, 0.00011790809599862798, 0.00626788719410...
1598    [0.0, 0.0, 0.0028675578616125584, 0.0044289506...
Name: hists, Length: 1599, dtype: object

In [7]:
x = np.array([row for row in prepared["hists"].values])
x.shape

(1599, 256)

In [8]:
y = prepared["type"]
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 3, 2, ..., 0, 4, 0])

In [9]:
# number of clusters
num_clusters = len(np.unique(y))
num_clusters

5

In [10]:
cluster_algo = AgglomerativeClustering(n_clusters=num_clusters)
y_pred = cluster_algo.fit_predict(x)

In [11]:
print(f"Accuracy on train set is: {accuracy_score(y, y_pred)}")

Accuracy on train set is: 0.33771106941838647
