# Mini-project: Agglomerative clustering

## <u>Preprocessing</u>

In [None]:
# before executing:
# pip install C

In [1]:
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from scipy.stats import mode

This algorithm has been implemented using the library sklearn.tree

In [2]:
dataset_ids =   {
                "parkinsons": 174,
                "page-blocks": 78,
                "optical": 80,
                "musk2": 75,
                "bc-wisc-diag": 17,
                "students": 697,
                "wine": 109,
                "magic": 159,
                "balance-scale": 12,
                "glass": 42,
                "zoo": 111,
                "waveform": 107,
                "image-segmentation": 50,
                "blood": 176,
                "spect": 95,
                "yeast": 110,
                "monk": 70,
                "ecoli": 39,
                "iris": 53,
                "contraception": 30,
                "fertility": 244,
                "conn-bench-sonar":  151,
                "landsat": 146,
                "ionosphere": 52,
                "letter": 59,
                }

In [3]:
def load_dataset(id):  
    # fetch dataset 
    dataset = fetch_ucirepo(id=id) 
    
    # data (as pandas dataframes) 
    X = dataset.data.features 
    y =dataset.data.targets 
    
    # dictionary gthering infos about the metadata (url, abstract, ... etc.)
    metadata_infos_dict = dataset.metadata
    print('data url:\n', metadata_infos_dict['data_url'])
    
    # variable information
    var_infos = dataset.variables.to_numpy()
    
    data_vectors = X.to_numpy() #instance vectors with features
    features_names = X.columns.to_numpy() #getting the names of each feature
    
    data_labels = y.to_numpy() #output labels for each instance
    label_name = y.columns.to_numpy() # name of the output label
    
    return data_vectors, features_names, data_labels, label_name

In [4]:
def print_useful_data(X, X_names, y, y_name, index = 0):
    n_instances = len(X)
    n_features = len(X_names)
    
    print("number of instances: ", n_instances)
    print("number of features: ", n_features)

    print("names of the features:\n", X_names)
    print("name of the output label: ", y_name)

    print(f"instance {index} feature vector:\n", X[index])
    print(f"instance {index} output label: ", y[index])

In [5]:
def preprocessing(X, y, split=0.7):
    scaler = StandardScaler()
    try:
        #standardization
        X = scaler.fit_transform(X)
    except ValueError:
        #If non numerical data is detected, data is encoded
        X = np.array(X, dtype=object)
        encoder = OneHotEncoder()
        X_encoded = encoder.fit_transform(X).toarray()
        X = scaler.fit_transform(X_encoded)
    
    #data is split among training and testing sets
    return train_test_split(X, y, train_size=split, random_state=42)

## <u>Accuracy, F1 and confusion matrice</u>

In [6]:
def calculate_metrics(cluster_labels, true_labels, compute_f1_cm = True):
    unique_labels = np.unique(true_labels)
    label_to_int = {label: idx for idx, label in enumerate(unique_labels)}
    int_labels = np.array([label_to_int[label] for label in true_labels.ravel()])

    #map clusters to most frequent true labels
    cluster_to_label = {}
    for cluster in np.unique(cluster_labels):
        mask = cluster_labels == cluster
        if np.any(mask):
            most_common_label, _ = mode(int_labels[mask], axis=None)
            cluster_to_label[cluster] = most_common_label.item()  
        else:
            cluster_to_label[cluster] = -1  #avoid empty clusters

    #predicted labels
    predicted_labels = np.array([cluster_to_label[cluster] for cluster in cluster_labels])

    #compute accuracy
    accuracy = np.mean(predicted_labels == int_labels)
    
    f1 = None
    cm = None
    if compute_f1_cm == True: 
        #compute F1 score
        f1 = f1_score(int_labels, predicted_labels, average='macro')
    
        #compute confusion matrix
        cm = confusion_matrix(int_labels, predicted_labels)

    return accuracy, f1, cm

## <u>Results on 25 datasets</u>

### Predict Students' Dropout and Academic Success

In [16]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["students"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split=split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')
print(y.shape)

data url:
 https://archive.ics.uci.edu/static/public/697/data.csv
training set size: (3096, 36)
testing set size: (1328, 36)
(4424, 1)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [17]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 49.93%
F1 score: 0.22
confusion matrix:
[[   0    0 1421]
 [   0    0  794]
 [   0    0 2209]]


### Wine

In [18]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["wine"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/109/data.csv
training set size: (124, 13)
testing set size: (54, 13)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [19]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 69.66%
F1 score: 0.69
confusion matrix:
[[46  0 13]
 [ 2 51 18]
 [ 0 21 27]]


### MAGIC gamma telescope

In [20]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["magic"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/159/data.csv
training set size: (13314, 10)
testing set size: (5706, 10)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [21]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 66.61%
F1 score: 0.58
confusion matrix:
[[10611  1721]
 [ 4629  2059]]


### Parkinsons

In [22]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["parkinsons"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/174/data.csv
training set size: (136, 22)
testing set size: (59, 22)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [23]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 75.38%
F1 score: 0.43
confusion matrix:
[[  0  48]
 [  0 147]]


### Page-blocks

In [24]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["page-blocks"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/78/data.csv
training set size: (3831, 10)
testing set size: (1642, 10)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [25]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 90.02%
F1 score: 0.30
confusion matrix:
[[4911    0    2    0    0]
 [ 329    0    0    0    0]
 [  16    0   12    0    0]
 [  88    0    0    0    0]
 [ 104    0    7    0    4]]


### Optical

In [40]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["optical"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)

num_sample = 6000
X_train, X_test = X_train[:num_sample*2,:], X_test[:num_sample,:]
Y_train, Y_test = Y_train[:num_sample*2], Y_test[:num_sample]

print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/80/data.csv
training set size: (3933, 64)
testing set size: (1687, 64)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [41]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 79.89%
F1 score: 0.76
confusion matrix:
[[553   0   0   0   0   0   1   0   0   0]
 [  0 194  66   1   2   0   0   0 308   0]
 [  0   0 499   0   0   0   1   0  57   0]
 [  0   0   6 528  20   1   0   2  15   0]
 [  0  10   0   0 553   0   4   1   0   0]
 [  0   0   3  32  16 506   0   0   1   0]
 [  0   0   0   0   0   2 554   0   2   0]
 [  0   2   0   1  11   0   0 552   0   0]
 [  0   1   0   0   1   0   0   1 551   0]
 [  0   0   0 366 171   4   0  11  10   0]]


### Ionosphere

In [28]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["ionosphere"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/52/data.csv
training set size: (245, 34)
testing set size: (106, 34)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [29]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 71.79%
F1 score: 0.71
confusion matrix:
[[ 92  34]
 [ 65 160]]


### Glass Identification

In [30]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["glass"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/42/data.csv
training set size: (149, 9)
testing set size: (65, 9)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [31]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 54.21%
F1 score: 0.40
confusion matrix:
[[16 54  0  0  0  0]
 [ 4 67  0  5  0  0]
 [ 3 14  0  0  0  0]
 [ 2  0  0 11  0  0]
 [ 4  0  0  3  0  2]
 [ 3  1  0  3  0 22]]


### Letter Recognition

In [32]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["letter"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/59/data.csv
training set size: (14000, 16)
testing set size: (6000, 16)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [33]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 29.87%
F1 score: 0.24
confusion matrix:
[[623   4   0   0   0   0   0   0   0   0   0   0  24   0   0   0 138   0
    0   0   0   0   0   0   0   0]
 [  0 136   0   0   0   0   0   0   0 112   0   0  43   0   0  15 296   0
    0   0  12   0   0   0   0 152]
 [  0   0 202   0   0   0   0   0   0  51   0   0  16   0  26  71 205   0
    0   0 165   0   0   0   0   0]
 [  3 126   0   0   0   0   0   0   0 125   0   0  12   0 286  17 209   0
    0   0  27   0   0   0   0   0]
 [  0  38 159   0   0   0   0   0   0 102   0   0  27   0   0 145 218   0
    0   0  21   0   0   0   0  58]
 [  0   7   0   0   0 228   0   0   0   0   0   0  12   0   1 198  82   0
    0  76   1 170   0   0   0   0]
 [  0   2 168   0   0   0   0   0   0   0   0   0  38   0 201  35 301   0
    0   0  28   0   0   0   0   0]
 [  0 106   0   0   0   0   0   0   0  85   0   0  47 160  63  39 152   0
    0   0  68   0  10   0   0   4]
 [  0  16   0   0   0   0   0   0   0 233   0 203   0   0   0  34 224   0
    

### Connectionist Bench (Sonar, Mines vs. Rocks)

In [34]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["conn-bench-sonar"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/151/data.csv
training set size: (145, 60)
testing set size: (63, 60)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [35]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 53.37%
F1 score: 0.35
confusion matrix:
[[111   0]
 [ 97   0]]


### Musk (Version 2)

In [36]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["musk2"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/75/data.csv
training set size: (4618, 166)
testing set size: (1980, 166)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [37]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 84.59%
F1 score: 0.46
confusion matrix:
[[5581    0]
 [1017    0]]


### Breast Cancer Wisconsin (Diagnostic)

In [42]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["bc-wisc-diag"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/17/data.csv
training set size: (398, 30)
testing set size: (171, 30)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [43]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 77.86%
F1 score: 0.71
confusion matrix:
[[357   0]
 [126  86]]


### Balance-scale

In [44]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["balance-scale"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/12/data.csv
training set size: (437, 4)
testing set size: (188, 4)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [46]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 68.48%
F1 score: 0.47
confusion matrix:
[[  0  15  34]
 [  0 175 113]
 [  0  35 253]]


### Contraception

In [47]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["contraception"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/30/data.csv
training set size: (1031, 9)
testing set size: (442, 9)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [48]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 44.40%
F1 score: 0.33
confusion matrix:
[[393   0 236]
 [168   0 165]
 [250   0 261]]


### Fertility

In [49]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["fertility"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/244/data.csv
training set size: (70, 9)
testing set size: (30, 9)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [50]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 88.00%
F1 score: 0.47
confusion matrix:
[[88  0]
 [12  0]]


### Waveform (Version 1)

In [51]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["waveform"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/107/data.csv
training set size: (3500, 21)
testing set size: (1500, 21)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [52]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 60.04%
F1 score: 0.48
confusion matrix:
[[   0 1225  432]
 [   0 1344  303]
 [   0   38 1658]]


### Image Segmentation

In [53]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["image-segmentation"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/50/data.csv
training set size: (147, 19)
testing set size: (63, 19)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [54]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 57.62%
F1 score: 0.55
confusion matrix:
[[27  0  0  0  0  0  3]
 [ 4 13  0  0 13  0  0]
 [24  2  1  0  0  0  3]
 [ 0  0  0 16 14  0  0]
 [ 0 10  0  4 16  0  0]
 [ 0  0  0  0  0 30  0]
 [12  0  0  0  0  0 18]]


### Blood Transfusion Service

In [55]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["blood"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/176/data.csv
training set size: (523, 4)
testing set size: (225, 4)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [56]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 76.20%
F1 score: 0.43
confusion matrix:
[[570   0]
 [178   0]]


### SPECT Heart

In [57]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["spect"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/95/data.csv
training set size: (186, 22)
testing set size: (81, 22)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [58]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 79.40%
F1 score: 0.44
confusion matrix:
[[  0  55]
 [  0 212]]


### Statlog (Landsat Satellite)

In [59]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["landsat"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/146/data.csv
training set size: (4504, 36)
testing set size: (1931, 36)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [60]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 67.15%
F1 score: 0.52
confusion matrix:
[[1270    3   40    0    0  220]
 [  31  644    0    0    0   28]
 [  96    0 1262    0    0    0]
 [ 353    1  184    0    0   88]
 [ 168    7    0    0    0  532]
 [ 315    0   48    0    0 1145]]


### Yeast

In [61]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["yeast"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/110/data.csv
training set size: (1038, 8)
testing set size: (446, 8)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [62]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 48.58%
F1 score: 0.36
confusion matrix:
[[342   4   0   7   0  36  31  42   1   0]
 [  0   5   0   0   0   0   0   0   0   0]
 [  5   0   0  27   0   1   2   0   0   0]
 [  1   0   0  41   0   0   2   0   0   0]
 [ 11   1   0  34   0   2   2   1   0   0]
 [ 64   1   0   3   0  84   6   5   0   0]
 [101   0   0  10   0   8 121   1   3   0]
 [246   3   0   4   0  29  30 117   0   0]
 [  6   0   0   2   0   0   1   0  11   0]
 [ 16   0   0   8   0   4   2   0   0   0]]


### Monk

In [63]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["monk"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/70/data.csv
training set size: (302, 6)
testing set size: (130, 6)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [64]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 66.67%
F1 score: 0.67
confusion matrix:
[[144  72]
 [ 72 144]]


### Ecoli

In [65]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["ecoli"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/39/data.csv
training set size: (235, 7)
testing set size: (101, 7)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [66]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 82.74%
F1 score: 0.58
confusion matrix:
[[133   0   0   0   0   0   0  10]
 [  2  47   0   0  27   0   1   0]
 [  0   0   0   0   0   0   2   0]
 [  0   0   0   0   1   0   0   1]
 [  0   3   0   0  31   0   1   0]
 [  0   0   0   0   0  17   1   2]
 [  0   0   0   0   0   0   5   0]
 [  3   0   0   0   1   3   0  45]]


### Iris

In [67]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["iris"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/53/data.csv
training set size: (105, 4)
testing set size: (45, 4)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [68]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 89.33%
F1 score: 0.89
confusion matrix:
[[50  0  0]
 [ 0 49  1]
 [ 0 15 35]]


### Zoo

In [69]:
X, X_names, y, y_name = load_dataset(id = dataset_ids["zoo"])
split_size = 0.7
X_train, X_test, Y_train, Y_test = preprocessing(X, y, split_size)
print(f'training set size: {X_train.shape}')
print(f'testing set size: {X_test.shape}')

data url:
 https://archive.ics.uci.edu/static/public/111/data.csv
training set size: (70, 16)
testing set size: (31, 16)


In [None]:
print_useful_data(X, X_names, y, y_name)

In [70]:
#model training
model = AgglomerativeClustering(n_clusters=len(np.unique(y)))
model.fit(X_train)

#labels prediction
cluster_labels = model.fit_predict(X)

accuracy, f1, cm = calculate_metrics(cluster_labels, y)
print(f"accuracy: {accuracy * 100:.2f}%")
print(f"F1 score: {f1:.2f}")
print(f"confusion matrix:\n{cm}")

accuracy: 85.15%
F1 score: 0.68
confusion matrix:
[[37  0  0  3  1  0  0]
 [ 0 20  0  0  0  0  0]
 [ 0  0  0  0  2  0  3]
 [ 0  0  0 13  0  0  0]
 [ 0  0  0  0  4  0  0]
 [ 0  0  0  0  0  8  0]
 [ 0  0  0  0  0  6  4]]
