# Imports

In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA, KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import Isomap, LocallyLinearEmbedding, SpectralEmbedding, TSNE
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    silhouette_score,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn_extra.cluster import KMedoids


# Datasets
Train, Valid, Test

In [2]:
train_df = pd.read_csv('../generated dataset/train.csv')
valid_df = pd.read_csv('../generated dataset/valid.csv')
test_df = pd.read_csv('../generated dataset/test.csv')

In [3]:
train_df.head(5)

Unnamed: 0,Brightness,Contrast,Mean,Variance,Skewness,Kurtosis,Entropy,Energy,Absolute Moment k=1,Absolute Moment k=2,...,Information Measure I,Information Measure II,Maximal Correlation Coefficient,Short-run Emphasis,Long-run Emphasis,Gray-level Nonuniformity,Difference of Entropy,Second Largest Eigenvalue,Label,Image
0,146.146736,81.409801,146.146736,6627.555637,-0.186254,-1.79875,6.974621,0.011098,146.146736,117.858576,...,1.0,0.0,0.010016,1.0,0.0,6e-06,4.469887,0.015243,Hibiscus,IMG_20241029_131633_jpg.rf.a3f2d7fa633ae0536e8...
1,147.130032,67.900449,147.130032,4610.471012,-0.399389,-1.412432,7.17911,0.01007,147.130032,117.151824,...,1.0,0.0,0.014951,1.0,0.0,7e-06,4.732864,0.020229,Hibiscus,IMG_20241029_131640_jpg.rf.30a12f9b36a51bb91c4...
2,148.789408,73.502056,148.789408,5402.552291,-0.262159,-1.708594,6.882111,0.011922,148.789408,114.736928,...,1.0,0.0,0.012129,1.0,0.0,7e-06,3.966379,0.018935,Hibiscus,IMG_20241029_131646_jpg.rf.c743c72ae1c3b403d16...
3,131.630976,76.952406,131.630976,5921.672749,-0.136606,-1.796559,6.932319,0.010645,131.630976,116.349472,...,1.0,0.0,0.008145,1.0,0.0,5e-06,4.478143,0.012706,Hibiscus,IMG_20241029_131651_jpg.rf.203f579e95e3f696a72...
4,150.084368,60.223102,150.084368,3626.821986,-0.411602,-1.39163,6.861676,0.013588,150.084368,116.505328,...,1.0,0.0,0.017838,1.0,0.0,9e-06,4.438945,0.019402,Hibiscus,IMG_20241029_131711_jpg.rf.c3aee7f34bec54a8830...


In [4]:
X_train, y_train = train_df.drop(columns=['Image', 'Label']), train_df['Label']
X_valid, y_valid = valid_df.drop(columns=['Image', 'Label']), valid_df['Label']
X_test, y_test = test_df.drop(columns=['Image', 'Label']), test_df['Label']

## Scaling to Normalize the Data

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

Given a model it will evaluate with the help of the earlier dataset and store the best model on the basis of F1-Score.

In [6]:
best_model = None
best_model_name = None
best_model_score = float('-inf')  

In [7]:
def evaluate_model(name, model, X_train, y_train, X_valid, y_valid, X_test, y_test):
    global best_model, best_model_name, best_model_score

    print(f"\nTraining {name} model...")
    model.fit(X_train, y_train)

    # Validation performance
    y_valid_pred = model.predict(X_valid)
    print(f"\nValidation Performance of {name}:")
    print(confusion_matrix(y_valid, y_valid_pred))
    
    valid_accuracy = accuracy_score(y_valid, y_valid_pred)
    valid_precision = precision_score(y_valid, y_valid_pred, average='weighted', zero_division=0)
    valid_recall = recall_score(y_valid, y_valid_pred, average='weighted', zero_division=0)
    valid_f1_score = f1_score(y_valid, y_valid_pred, average='weighted', zero_division=0)
    
    print(f'The weighted average of accuracy for validation dataset is: {valid_accuracy:.4f}')
    print(f'The weighted average of precision for validation dataset is: {valid_precision:.4f}')
    print(f'The weighted average of recall for validation dataset is: {valid_recall:.4f}')
    print(f'The weighted average of F1-score for validation dataset is: {valid_f1_score:.4f}')
    
    # Test performance
    y_test_pred = model.predict(X_test)
    print(f"\nTest Performance of {name}:")
    print(confusion_matrix(y_test, y_test_pred))

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
    test_f1_score = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)
    
    print(f'The weighted average of accuracy for test dataset is: {test_accuracy:.4f}')
    print(f'The weighted average of precision for test dataset is: {test_precision:.4f}')
    print(f'The weighted average of recall for test dataset is: {test_recall:.4f}')
    print(f'The weighted average of F1-score for test dataset is: {test_f1_score:.4f}')
    
    # Update the best model if the current model is better (using test F1 score)
    if test_f1_score > best_model_score:
        best_model_score = test_f1_score
        best_model = model
        best_model_name = name
        print(f"\n{name} is the new best model with test F1 score: {test_f1_score:.4f}")

# Classification Algorithms

## 1. Linear SVM

In [8]:
svm_linear = SVC(kernel='linear', C=1, random_state=42)
evaluate_model("Linear SVM", svm_linear, X_train, y_train, X_valid, y_valid, X_test, y_test)


Training Linear SVM model...

Validation Performance of Linear SVM:
[[ 1  0  0  0  0  0  0  0  0  1]
 [ 0  5  0  0  0  1  0  1  2  0]
 [ 0  0  4  0  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  1  0  1]
 [ 0  0  0  0  3  0  0  0  1  0]
 [ 0  0  0  0  1  1  0  1  0  0]
 [ 1  1  0  0  1  0  3  1  0  1]
 [ 0  0  0  0  0  0  0  3  0  0]
 [ 1  0  0  0  0  1  0  0 59  0]
 [ 0  0  3  0  0  4  0  0  0  0]]
The weighted average of accuracy for validation dataset is: 0.7596
The weighted average of precision for validation dataset is: 0.7718
The weighted average of recall for validation dataset is: 0.7596
The weighted average of F1-score for validation dataset is: 0.7463

Test Performance of Linear SVM:
[[ 0  0  0  0  0  0  1  0  0]
 [ 0  1  0  0  2  0  0  0  0]
 [ 0  0  2  0  2  0  0  0  1]
 [ 0  0  0  1  0  0  0  0  0]
 [ 1  2  1  0  2  0  0  0  3]
 [ 0  0  1  0  0  2  0  1  0]
 [ 0  1  0  0  0  0  1  1  0]
 [ 0  0  0  0  1  0  0  1  0]
 [ 0  0  0  0  0  0  0  0 10]]
The weighted average of accur

## 2. Polynomial SVM

In [9]:
svm_poly = SVC(kernel='poly', degree=3, C=1, random_state=42)
evaluate_model("Polynomial SVM", svm_poly, X_train, y_train, X_valid, y_valid, X_test, y_test)


Training Polynomial SVM model...

Validation Performance of Polynomial SVM:
[[ 0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  9  0]
 [ 0  0  1  0  0  0  0  0  4  0]
 [ 0  1  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  2  0  0  0  2  0]
 [ 0  0  0  0  1  0  0  0  2  0]
 [ 0  0  0  0  2  0  2  0  4  0]
 [ 0  0  0  0  0  0  0  1  2  0]
 [ 0  0  0  0  0  0  0  0 61  0]
 [ 0  0  1  0  0  0  0  0  6  0]]
The weighted average of accuracy for validation dataset is: 0.6442
The weighted average of precision for validation dataset is: 0.5299
The weighted average of recall for validation dataset is: 0.6442
The weighted average of F1-score for validation dataset is: 0.5407

Test Performance of Polynomial SVM:
[[ 0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  2  0  0  0  1]
 [ 0  0  1  0  0  0  0  0  4]
 [ 0  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  4  0  0  0  5]
 [ 0  0  0  0  0  0  0  0  4]
 [ 0  1  0  0  0  0  0  0  2]
 [ 0  0  0  0  1  0  0  0  1]
 [ 0  0  0  0  0  0  0  0 10]]
The weighted aver

## 3. RBF SVM

In [10]:
svm_rbf = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
evaluate_model("RBF SVM", svm_rbf, X_train, y_train, X_valid, y_valid, X_test, y_test)


Training RBF SVM model...

Validation Performance of RBF SVM:
[[ 0  1  0  0  1  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  8  0]
 [ 0  0  3  0  0  0  0  0  1  1]
 [ 0  1  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  0  1  2  0  0  0  0]
 [ 1  0  0  0  3  0  2  0  2  0]
 [ 0  0  0  0  0  1  0  2  0  0]
 [ 0  0  0  0  0  0  0  0 60  1]
 [ 0  0  5  0  0  1  0  0  1  0]]
The weighted average of accuracy for validation dataset is: 0.7115
The weighted average of precision for validation dataset is: 0.6662
The weighted average of recall for validation dataset is: 0.7115
The weighted average of F1-score for validation dataset is: 0.6559

Test Performance of RBF SVM:
[[0 0 0 0 0 0 1 0 0]
 [0 1 0 0 2 0 0 0 0]
 [0 0 3 0 0 1 0 0 1]
 [0 0 0 1 0 0 0 0 0]
 [0 1 0 0 4 1 0 0 3]
 [0 0 1 0 0 2 0 0 1]
 [1 1 0 0 1 0 0 0 0]
 [0 0 0 0 2 0 0 0 0]
 [0 0 0 0 1 0 0 1 8]]
The weighted average of accuracy for test dataset is: 0.5000
The weighted average of precision for test dataset is: 0.4

## 4. Logistic Regression

In [11]:
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
evaluate_model("Logistic Regression", log_reg, X_train, y_train, X_valid, y_valid, X_test, y_test)


Training Logistic Regression model...

Validation Performance of Logistic Regression:
[[ 1  0  0  0  0  0  0  0  0  1]
 [ 0  5  0  0  0  0  0  1  2  1]
 [ 0  0  3  0  0  0  0  0  1  1]
 [ 0  0  0  0  0  0  0  1  1  0]
 [ 0  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  0  1  2  0  0  0  0]
 [ 2  0  0  0  2  0  3  1  0  0]
 [ 0  0  0  0  1  1  0  1  0  0]
 [ 0  0  0  0  0  1  0  0 60  0]
 [ 0  0  4  0  0  2  0  0  0  1]]
The weighted average of accuracy for validation dataset is: 0.7692
The weighted average of precision for validation dataset is: 0.7932
The weighted average of recall for validation dataset is: 0.7692
The weighted average of F1-score for validation dataset is: 0.7575

Test Performance of Logistic Regression:
[[0 0 0 0 0 0 1 0 0]
 [0 1 0 0 2 0 0 0 0]
 [0 0 3 0 0 1 0 0 1]
 [0 0 0 1 0 0 0 0 0]
 [0 2 0 0 5 1 0 0 1]
 [1 0 1 0 0 1 0 1 0]
 [0 0 0 0 0 0 1 2 0]
 [0 0 0 0 1 0 0 1 0]
 [0 0 0 0 1 0 0 0 9]]
The weighted average of accuracy for test dataset is: 0.5789
The weighted average o

## 5. Parzen Window

In [12]:
parzen_model = GaussianNB()
evaluate_model("Parzen Windows (Gaussian Naive Bayes)", parzen_model, X_train, y_train, X_valid, y_valid, X_test, y_test)


Training Parzen Windows (Gaussian Naive Bayes) model...

Validation Performance of Parzen Windows (Gaussian Naive Bayes):
[[ 1  0  0  0  0  1  0  0  0  0]
 [ 2  0  0  0  2  0  1  3  1  0]
 [ 0  0  3  0  1  1  0  0  0  0]
 [ 0  1  0  0  1  0  0  0  0  0]
 [ 0  1  0  0  3  0  0  0  0  0]
 [ 0  0  0  0  1  2  0  0  0  0]
 [ 3  0  0  0  2  0  3  0  0  0]
 [ 0  0  0  0  0  0  1  2  0  0]
 [ 1  0  2  5  1  3  3 33 13  0]
 [ 2  0  2  0  0  2  0  1  0  0]]
The weighted average of accuracy for validation dataset is: 0.2596
The weighted average of precision for validation dataset is: 0.6146
The weighted average of recall for validation dataset is: 0.2596
The weighted average of F1-score for validation dataset is: 0.2875

Test Performance of Parzen Windows (Gaussian Naive Bayes):
[[0 0 0 0 0 0 1 0 0]
 [0 0 1 0 2 0 0 0 0]
 [1 0 1 1 1 0 0 1 0]
 [0 0 0 1 0 0 0 0 0]
 [0 1 0 0 4 1 0 3 0]
 [2 0 0 0 0 2 0 0 0]
 [1 0 0 0 1 0 1 0 0]
 [0 0 0 0 2 0 0 0 0]
 [2 0 0 3 1 1 0 2 1]]
The weighted average of accur

## 6. k-Nearest Neighbor

In [13]:
knn_model = KNeighborsClassifier(n_neighbors=5)
evaluate_model("KNN", knn_model, X_train, y_train, X_valid, y_valid, X_test, y_test)


Training KNN model...

Validation Performance of KNN:
[[ 1  0  0  0  0  0  0  0  1  0]
 [ 1  1  0  0  0  1  0  0  6  0]
 [ 1  0  3  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  1  0  0  1  0]
 [ 1  0  0  0  3  0  0  0  0  0]
 [ 0  1  0  0  0  2  0  0  0  0]
 [ 3  1  0  0  2  0  2  0  0  0]
 [ 0  0  1  0  1  0  0  1  0  0]
 [ 1  1  0  0  0  0  0  2 57  0]
 [ 0  0  3  0  0  1  0  0  1  2]]
The weighted average of accuracy for validation dataset is: 0.6923
The weighted average of precision for validation dataset is: 0.7334
The weighted average of recall for validation dataset is: 0.6923
The weighted average of F1-score for validation dataset is: 0.6741

Test Performance of KNN:
[[0 0 0 0 0 0 1 0 0]
 [0 1 0 0 2 0 0 0 0]
 [0 0 4 0 0 0 0 0 1]
 [0 0 0 1 0 0 0 0 0]
 [0 0 0 0 2 2 0 1 4]
 [1 0 1 0 0 1 0 0 1]
 [0 1 1 0 0 0 0 1 0]
 [0 0 1 0 1 0 0 0 0]
 [0 0 0 0 0 1 0 0 9]]
The weighted average of accuracy for test dataset is: 0.4737
The weighted average of precision for test dataset is: 0.4199
The weig

## 7. Decision Tree Classifier

In [14]:
decision_tree = DecisionTreeClassifier(max_depth=5, random_state=42)
evaluate_model("Decision Tree", decision_tree, X_train, y_train, X_valid, y_valid, X_test, y_test)


Training Decision Tree model...

Validation Performance of Decision Tree:
[[ 0  0  0  0  1  0  0  0  0  1]
 [ 2  1  0  0  0  0  0  0  6  0]
 [ 0  0  1  2  0  0  0  0  0  2]
 [ 1  0  0  0  0  1  0  0  0  0]
 [ 0  1  0  0  3  0  0  0  0  0]
 [ 0  0  0  0  1  1  0  0  0  1]
 [ 2  0  2  0  3  0  1  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  2]
 [ 0  0  0  1  0  0  0  3 56  1]
 [ 0  1  1  0  0  1  0  0  0  4]]
The weighted average of accuracy for validation dataset is: 0.6442
The weighted average of precision for validation dataset is: 0.6961
The weighted average of recall for validation dataset is: 0.6442
The weighted average of F1-score for validation dataset is: 0.6350

Test Performance of Decision Tree:
[[1 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 1]
 [0 0 3 0 2 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0]
 [4 0 2 0 2 0 0 0 1 0]
 [2 0 0 1 0 0 0 0 0 1]
 [0 1 0 0 2 0 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0]
 [2 0 0 0 2 0 0 0 5 1]
 [0 0 0 0 0 0 0 0 0 0]]
The weighted average of accuracy for test dataset is: 0.3421

## 8. Random Forest 

In [15]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
evaluate_model("Random Forest", rf_model, X_train, y_train, X_valid, y_valid, X_test, y_test)


Training Random Forest model...

Validation Performance of Random Forest:
[[ 0  0  0  0  1  0  0  0  0  1]
 [ 1  3  0  0  0  0  0  0  5  0]
 [ 0  0  4  0  0  0  0  1  0  0]
 [ 0  0  0  1  0  0  0  1  0  0]
 [ 0  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  0  1  2  0  0  0  0]
 [ 1  2  0  0  2  0  3  0  0  0]
 [ 0  0  0  0  0  1  0  2  0  0]
 [ 0  0  0  0  0  0  0  1 60  0]
 [ 0  0  2  0  0  1  0  0  0  4]]
The weighted average of accuracy for validation dataset is: 0.7981
The weighted average of precision for validation dataset is: 0.8206
The weighted average of recall for validation dataset is: 0.7981
The weighted average of F1-score for validation dataset is: 0.7869

Test Performance of Random Forest:
[[1 0 0 0 0 0 0 0 0]
 [0 2 0 0 1 0 0 0 0]
 [0 0 3 1 1 0 0 0 0]
 [0 0 0 1 0 0 0 0 0]
 [0 0 0 0 2 2 0 1 4]
 [0 0 0 0 0 3 0 1 0]
 [0 1 1 0 1 0 0 0 0]
 [0 0 0 0 2 0 0 0 0]
 [0 0 0 0 1 0 0 1 8]]
The weighted average of accuracy for test dataset is: 0.5263
The weighted average of precision for te

## Printing the Best Model:

In [16]:
print(f"The best model so far is: {best_model_name} with the f1-score of {best_model_score}. ")

The best model so far is: Logistic Regression with the f1-score of 0.5766917293233081. 


# Clustering

In [17]:
X = pd.concat([train_df, test_df, valid_df], axis=0)
X.head(5)

Unnamed: 0,Brightness,Contrast,Mean,Variance,Skewness,Kurtosis,Entropy,Energy,Absolute Moment k=1,Absolute Moment k=2,...,Information Measure I,Information Measure II,Maximal Correlation Coefficient,Short-run Emphasis,Long-run Emphasis,Gray-level Nonuniformity,Difference of Entropy,Second Largest Eigenvalue,Label,Image
0,146.146736,81.409801,146.146736,6627.555637,-0.186254,-1.79875,6.974621,0.011098,146.146736,117.858576,...,1.0,0.0,0.010016,1.0,0.0,6e-06,4.469887,0.015243,Hibiscus,IMG_20241029_131633_jpg.rf.a3f2d7fa633ae0536e8...
1,147.130032,67.900449,147.130032,4610.471012,-0.399389,-1.412432,7.17911,0.01007,147.130032,117.151824,...,1.0,0.0,0.014951,1.0,0.0,7e-06,4.732864,0.020229,Hibiscus,IMG_20241029_131640_jpg.rf.30a12f9b36a51bb91c4...
2,148.789408,73.502056,148.789408,5402.552291,-0.262159,-1.708594,6.882111,0.011922,148.789408,114.736928,...,1.0,0.0,0.012129,1.0,0.0,7e-06,3.966379,0.018935,Hibiscus,IMG_20241029_131646_jpg.rf.c743c72ae1c3b403d16...
3,131.630976,76.952406,131.630976,5921.672749,-0.136606,-1.796559,6.932319,0.010645,131.630976,116.349472,...,1.0,0.0,0.008145,1.0,0.0,5e-06,4.478143,0.012706,Hibiscus,IMG_20241029_131651_jpg.rf.203f579e95e3f696a72...
4,150.084368,60.223102,150.084368,3626.821986,-0.411602,-1.39163,6.861676,0.013588,150.084368,116.505328,...,1.0,0.0,0.017838,1.0,0.0,9e-06,4.438945,0.019402,Hibiscus,IMG_20241029_131711_jpg.rf.c3aee7f34bec54a8830...


In [18]:
X.count()

Brightness                         401
Contrast                           401
Mean                               401
Variance                           401
Skewness                           401
Kurtosis                           401
Entropy                            401
Energy                             401
Absolute Moment k=1                401
Absolute Moment k=2                401
ASM                                401
Contrast (GLCM)                    401
IDF                                401
Entropy (GLCM)                     401
Correlation (GLCM)                 401
Variance (GLCM)                    401
Sum Average                        401
Sum Variance                       401
Sum Entropy                        401
Difference Average                 401
Difference Variance                401
Difference Entropy                 401
Information Measure I              401
Information Measure II             401
Maximal Correlation Coefficient    401
Short-run Emphasis       

In [19]:
X = X.drop(columns=['Image', 'Label'])

In [28]:
global_best = {"method_name": None, "silhouette_score": -1}

def evaluate_clustering(X, labels, method_name):
    global global_best
    
    if -1 in labels:
        X_filtered = X[labels != -1]  # Exclude noise points from data
        labels_filtered = labels[labels != -1]  # Exclude noise points from labels
    else:
        X_filtered = X
        labels_filtered = labels

    # Check if clustering has more than one cluster
    if len(set(labels_filtered)) > 1:
        silhouette_avg = silhouette_score(X_filtered, labels_filtered)
        print(f"{method_name} Silhouette Score: {silhouette_avg}")

        # Update global best if this clustering is better
        if silhouette_avg > global_best["silhouette_score"]:
            global_best["method_name"] = method_name
            global_best["silhouette_score"] = silhouette_avg
    else:
        print(f"{method_name} could not compute silhouette score because all points belong to a single cluster.")

## 1. k-Means

In [29]:
labels_kmeans = KMeans(n_clusters=10, random_state=42).fit_predict(X)
evaluate_clustering(X, labels_kmeans, 'k-Means')

k-Means Silhouette Score: 0.500571932164741




## 2. k-Mediods

In [30]:
labels_kmedoids = KMedoids(n_clusters=10, random_state=42).fit_predict(X)
evaluate_clustering(X, labels_kmedoids, 'k-Means')

k-Means Silhouette Score: 0.4403262382757256


## 3. Isodata

In [31]:
labels_isodata = KMeans(n_clusters=10, init='k-means++', n_init=10, random_state=42).fit_predict(X)
evaluate_clustering(X, labels_isodata, 'ISODATA')

ISODATA Silhouette Score: 0.500571932164741




## 4. DBScan

In [32]:
labels_dbscan = DBSCAN(eps=0.5, min_samples=5).fit_predict(X)
evaluate_clustering(X, labels_dbscan, 'DBScan')

DBScan could not compute silhouette score because all points belong to a single cluster.


## 5. MST

In [33]:
neighbors = NearestNeighbors(n_neighbors=5)
neighbors.fit(X)
distances, indices = neighbors.kneighbors(X)

# Build the minimum spanning tree using the distances
graph = nx.Graph()

for i in range(len(X)):
    for j in indices[i]:
        if i != j:  # Avoid self-loop
            graph.add_edge(i, j, weight=distances[i][np.where(indices[i] == j)[0][0]])

# Find the connected components (clusters) of the MST
mst = nx.minimum_spanning_tree(graph)
clusters_mst = list(nx.connected_components(mst))

mst_labels = np.zeros(len(X))
for cluster_id, cluster in enumerate(clusters_mst):
    for idx in cluster:
        mst_labels[idx] = cluster_id

evaluate_clustering(X, mst_labels, 'MST')

MST could not compute silhouette score because all points belong to a single cluster.


## 6. Directed Tree

In [34]:
labels_tree = AgglomerativeClustering(n_clusters=10, linkage='ward').fit_predict(X)
evaluate_clustering(X, labels_dbscan, 'Directed Tree')

Directed Tree could not compute silhouette score because all points belong to a single cluster.


## Printing the Best

In [27]:
print(global_best)

{'method_name': 'k-Means', 'silhouette_score': 0.500571932164741, 'labels': array([6, 7, 0, 8, 3, 0, 0, 3, 3, 3, 7, 0, 0, 0, 6, 6, 4, 5, 2, 0, 2, 2,
       6, 6, 9, 0, 6, 8, 8, 8, 7, 2, 2, 0, 2, 2, 2, 6, 8, 8, 2, 2, 0, 2,
       8, 6, 8, 8, 8, 7, 0, 3, 2, 2, 0, 6, 6, 8, 8, 6, 2, 8, 8, 8, 6, 2,
       6, 8, 2, 2, 2, 2, 6, 6, 2, 8, 6, 2, 2, 2, 6, 8, 6, 8, 6, 2, 8, 8,
       6, 8, 8, 8, 0, 0, 7, 7, 6, 7, 3, 6, 7, 7, 7, 7, 0, 0, 0, 7, 8, 0,
       8, 0, 0, 3, 3, 5, 3, 3, 7, 7, 3, 7, 3, 5, 7, 5, 4, 3, 7, 7, 9, 3,
       1, 3, 7, 0, 5, 3, 1, 5, 9, 9, 4, 1, 3, 4, 4, 4, 9, 2, 0, 0, 3, 3,
       0, 3, 0, 6, 6, 3, 3, 6, 6, 6, 2, 6, 2, 2, 8, 8, 3, 2, 6, 6, 7, 7,
       0, 7, 5, 1, 6, 7, 5, 9, 1, 4, 5, 9, 1, 1, 4, 9, 6, 0, 5, 5, 6, 2,
       6, 2, 2, 2, 0, 8, 5, 8, 3, 7, 0, 0, 0, 6, 9, 8, 8, 8, 6, 3, 0, 5,
       1, 6, 0, 7, 0, 0, 7, 3, 5, 7, 3, 0, 0, 7, 3, 3, 3, 3, 9, 3, 3, 3,
       6, 6, 6, 8, 0, 5, 3, 0, 3, 6, 6, 0, 7, 6, 5, 0, 8, 3, 3, 0, 6, 8,
       2, 8, 2, 2, 0, 0, 8, 7, 8, 6, 0, 0, 8, 6,