<a href="https://colab.research.google.com/github/RuixMao/S-GCD/blob/main/TrialsForGCD_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Salinas

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['salinasA_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['salinasA_gt']

# https://www.ehu.eus/ccwintco/index.php/Hyperspectral_Remote_Sensing_Scenes
data = load_data('/content/SalinasA_corrected.mat')
gt_data = load_gt('/content/SalinasA_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_classes = [10, 11, 12]
unknown_classes = [1, 13, 14]
 # we need all classes without 0 class (because 0 is not class label)
all_classes = np.concatenate((known_classes,unknown_classes), axis=0)

all_data = flattened_data[np.isin(flattened_gt, all_classes)]
all_labels = flattened_gt[np.isin(flattened_gt, all_classes)]

known_data = all_data[np.isin(all_labels, known_classes)]
known_labels = all_labels[np.isin(all_labels, known_classes)]
unknown_data = all_data[np.isin(all_labels, unknown_classes)]
unknown_labels = all_labels[np.isin(all_labels, unknown_classes)]

# PCA: before OR after the splitting?
# In this case: after the splitting

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled,unknown_data), axis=0)
y_test = np.concatenate((y_test_labeled,unknown_labels), axis=0)

trainN=len(y_train)
testN1=len(y_test_labeled)
testN2=len(unknown_labels)

print(min_value)
print(max_value)
unique, counts = np.unique(flattened_gt, return_counts=True)
print(np.asarray((unique, counts)).T)
print("   ")
print(data.shape)
print(data.shape[0]," x ", data.shape[1], " = ", data.shape[0]*data.shape[1])
print(known_data.shape)
print(known_labels.shape)
print("  ")
print(X_train_origin.shape)
print(X_test_origin.shape)
print(trainN)
print(testN1)
print(testN2)

-9
8373
[[   0 1790]
 [   1  391]
 [  10 1343]
 [  11  616]
 [  12 1525]
 [  13  674]
 [  14  799]]
   
(83, 86, 204)
83  x  86  =  7138
(3484, 204)
(3484,)
  
(2787, 204)
(2561, 204)
2787
697
1864


In [None]:
pca = PCA(n_components=30)  # are the results worse without PCA?
pca.fit(X_train_origin)
X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)  # Transform all data using PCA

print(X_train.shape)
print(X_test.shape)

(2787, 30)
(2561, 30)


In [None]:
# Spectral Clustering with 6 clusters (adjust number of clusters if needed)
clustering = SpectralClustering(n_clusters=6, assign_labels='kmeans', random_state=42)
# clustering = KMeans(n_clusters=6, random_state=42)
all_data_pca = np.vstack([X_train, X_test])
all_cluster_labels = clustering.fit_predict(all_data_pca)

unique, counts = np.unique(all_cluster_labels, return_counts=True)
print(unique, counts)

[0 1 2 3 4 5] [ 601 1360  390 1579  883  535]


In [None]:
# Assign all_cluster_labels with class_labels in X_train
import statistics
from statistics import mode

all_cluster_labels_train = all_cluster_labels[:trainN]
all_cluster_labels_test = all_cluster_labels[trainN:]

all_cluster_labels_a = all_cluster_labels_train[np.isin(y_train, known_classes[0])]
all_cluster_labels_b = all_cluster_labels_train[np.isin(y_train, known_classes[1])]
all_cluster_labels_c = all_cluster_labels_train[np.isin(y_train, known_classes[2])]
print(len(all_cluster_labels_a))
print(len(all_cluster_labels_b))
print(len(all_cluster_labels_c))

# print(statistics.mode(all_cluster_labels_a))
# print(statistics.mode(all_cluster_labels_b))
# print(statistics.mode(all_cluster_labels_c))

# Statistics of Class1, Class10, Class11 with different clusters:
# print(np.unique(all_cluster_labels_a, return_counts=True))
# print(np.unique(all_cluster_labels_b, return_counts=True))
# print(np.unique(all_cluster_labels_c, return_counts=True))

AssignMatrix=np.zeros((3, 6))
for i in range(6):
  AssignMatrix[0][i]=list(all_cluster_labels_a).count(i)
for i in range(6):
  AssignMatrix[1][i]=list(all_cluster_labels_b).count(i)
for i in range(6):
  AssignMatrix[2][i]=list(all_cluster_labels_c).count(i)
print(AssignMatrix)

# Instead of counts the relative ratio is better:
for i in range(6):
  AssignMatrix[0][i]=list(all_cluster_labels_a).count(i)/len(all_cluster_labels_a)
for i in range(6):
  AssignMatrix[1][i]=list(all_cluster_labels_b).count(i)/len(all_cluster_labels_b)
for i in range(6):
  AssignMatrix[2][i]=list(all_cluster_labels_c).count(i)/len(all_cluster_labels_c)
print(AssignMatrix)

1085
484
1218
[[   2.  633.    0.   13.    8.  429.]
 [   0.  459.    0.   25.    0.    0.]
 [   0.    0.    0. 1218.    0.    0.]]
[[0.00184332 0.58341014 0.         0.01198157 0.00737327 0.39539171]
 [0.         0.94834711 0.         0.05165289 0.         0.        ]
 [0.         0.         0.         1.         0.         0.        ]]


In [None]:
# Hungarian method for best (maximized) assignment
from scipy.optimize import linear_sum_assignment
row_ind, col_ind = linear_sum_assignment(AssignMatrix, maximize='true')
print(col_ind)

# col_ind[0] is for class1, col_ind[1] is for class10, col_ind[2] is for class11

[5 1 3]


In [None]:
# 1. method
# Using only clustering and assigment between the known classes and clusters
# inference: use the cluster ID for classification and clustering
# instance in X_test is predicted based on the instance's cluster ID
# E.g.:
# if 0 then this belongs to Cluster0
# if 1 then this belongs to Class1
# if 2 then this belongs to Class11
# if 3 then this belongs to Cluster3
# if 4 then this belongs to Cluster4
# if 5 then this belongs to Class10

i = 0
while i < len(all_cluster_labels):
  if all_cluster_labels[i] == col_ind[0]:
    all_cluster_labels[i] = known_classes[0]
  if all_cluster_labels[i] == col_ind[1]:
    all_cluster_labels[i] = known_classes[1]
  if all_cluster_labels[i] == col_ind[2]:
    all_cluster_labels[i] = known_classes[2]
  i += 1

y_test_labeled_Predicted=all_cluster_labels[trainN:trainN+testN1]
unknown_labels_Predicted=all_cluster_labels[trainN+testN1:trainN+testN1+testN2]
print(len(y_test_labeled_Predicted))
print(len(unknown_labels_Predicted))

697
1864


In [None]:
# 2. method
# Use an extended classifier, where the rest of clusters added as dummy classes
# a. Training a classifier with:
#    Train set with ground true labels + Subset of Test set (belonging to rest clusters)
# b. Test phase of the classifier, where inference is the same as in 1. method

rest_clusters=list(range(len(all_classes)))
for i in range (len(col_ind)):
  rest_clusters.remove(col_ind[i])
print(rest_clusters)

X_test_subset=X_test[np.isin(all_cluster_labels_test, rest_clusters)]
y_test_subset=y_test[np.isin(all_cluster_labels_test, rest_clusters)]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = svm_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:testN1]
unknown_labels_Predicted2 = y_test_pred2[testN1:]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

[0, 2, 4]
697
1864


In [None]:
# ----------------------------------------------------------------
# Evaluation of the methods
# metrics from https://www.geeksforgeeks.org/clustering-performance-evaluation-in-scikit-learn/

from sklearn.metrics import adjusted_rand_score, rand_score, fowlkes_mallows_score, silhouette_score, davies_bouldin_score, mutual_info_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import accuracy_score

print("Results of the 1. method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted))
print(" ")
print("Results of the 2. method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted2))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted2))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted2))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted2))


Results of the 1. method
accuracy:  0.7618364418938307
rand index:  0.8533481232678845
adjusted rand index:  0.6824048749840209
fowlkes mallows :  0.7972981609048793
 
Results of the 2. method
accuracy:  0.975609756097561
rand index:  0.9878720232952988
adjusted rand index:  0.9735827585422115
fowlkes mallows :  0.9830182889766578


In [None]:
# 3. method
# Let us calculate the centroids of the classes in X_train

# 3.1 method: Let us calculate the centroids of the rest clusters
# 3.2 method: Let us calculate the centroids of the rest clusters only with X_test
# (so at 3.2. we should remove the X_train instances in the rest clusters)

# Centroids for known classes (you've already done this)
X_train_a_mean = np.mean(X_train[np.isin(y_train, known_classes[0])], axis=0)
X_train_b_mean = np.mean(X_train[np.isin(y_train, known_classes[1])], axis=0)
X_train_c_mean = np.mean(X_train[np.isin(y_train, known_classes[2])], axis=0)

# 3.1 method: Calculate the centre of mass of rest clusters using combined data
rest_clusters_centroids = []

# Create a boolean mask for trainN+testN1 based on the shape of all_cluster_labels
# First, filter out the required data based on trainN+testN1
relevant_data_combined = all_data_combined[:trainN+testN1]

for cluster in rest_clusters:
    mask_cluster = (all_cluster_labels[:trainN+testN1] == cluster)
    rest_cluster_data = relevant_data_combined[mask_cluster]
    if rest_cluster_data.size > 0:
        rest_cluster_mean = np.mean(rest_cluster_data, axis=0)
        rest_clusters_centroids.append(rest_cluster_mean)



# 3.2 method: Only use X_test data to calculate center of mass
# Assume all_cluster_labels_test is the label set for X_test
rest_clusters_centroids_test_only = []
for cluster in rest_clusters:
    rest_cluster_data = X_test[all_cluster_labels_test == cluster]
    if rest_cluster_data.size > 0:
        rest_cluster_mean = np.mean(rest_cluster_data, axis=0)
        rest_clusters_centroids_test_only.append(rest_cluster_mean)

# Distance calculation for each instance in X_test to centroids and decide the nearest one
all_centroids_3_1 = [X_train_a_mean, X_train_b_mean, X_train_c_mean] + rest_clusters_centroids
all_centroids_3_2 = [X_train_a_mean, X_train_b_mean, X_train_c_mean] + rest_clusters_centroids_test_only

def nearest_centroid_indices(X, centroids):
    nearest = []
    for instance in X:
        distances = [np.linalg.norm(instance - centroid) for centroid in centroids]
        nearest.append(np.argmin(distances))
    return nearest

nearest_to_3_1 = nearest_centroid_indices(X_test, all_centroids_3_1)
nearest_to_3_2 = nearest_centroid_indices(X_test, all_centroids_3_2)

print(nearest_to_3_1)  # Print the closest centre-of-mass index for each test instance using method 3.1
print(nearest_to_3_2)  # Print the closest centre-of-mass index for each test instance using method 3.2

# Alternative solution: X_train_a_mean = [float(sum(i))/len(i) for i in zip(*X_train_a)]

print(X_train_a.shape)
print(X_train_b.shape)
print(X_train_c.shape)

[0, 1, 2, 2, 2, 1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 4, 2, 1, 1, 1, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 0, 2, 2, 2, 2, 1, 2, 0, 0, 2, 2, 0, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 2, 1, 0, 1, 2, 2, 4, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 0, 1, 1, 0, 2, 0, 1, 0, 1, 2, 1, 1, 2, 1, 0, 0, 2, 4, 2, 1, 2, 1, 2, 2, 0, 2, 1, 2, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 2, 1, 2, 2, 1, 0, 2, 1, 2, 2, 2, 2, 0, 2, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 0, 1, 2, 0, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 0, 2, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 2, 2, 2, 1, 0, 2, 2, 1, 0, 2, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 0, 4, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 2, 2, 0, 1, 2, 2, 2, 1, 2, 0, 1, 2, 2, 1, 0, 1, 0, 2, 2, 1, 1, 2, 4, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 0, 2, 0, 2, 2, 1, 

In [None]:
from sklearn.metrics import adjusted_rand_score, rand_score, fowlkes_mallows_score, silhouette_score, davies_bouldin_score, mutual_info_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import accuracy_score

print("Results of the 3. method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted))


Results of the 3. method
accuracy:  0.7618364418938307
rand index:  0.8533481232678845
adjusted rand index:  0.6824048749840209
fowlkes mallows :  0.7972981609048793


### 4.method

In [None]:
!git clone https://github.com/sgvaze/generalized-category-discovery.git

Cloning into 'generalized-category-discovery'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 76 (delta 13), reused 53 (delta 7), pack-reused 0[K
Receiving objects: 100% (76/76), 3.70 MiB | 8.56 MiB/s, done.
Resolving deltas: 100% (13/13), done.


In [None]:
!mv generalized-category-discovery gcd

In [None]:
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# 4. method
# semi-supervised k-means clustering
import numpy as np
import scipy.io
import torch
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import normalized_mutual_info_score as nmi_score, accuracy_score
from sklearn.model_selection import train_test_split
from statistics import mode



# Assuming you've already imported other necessary modules like clustering, etc.

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['salinasA_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['salinasA_gt']

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(labeled_data)
labeled_data_pca = pca.transform(labeled_data)
unlabeled_data_pca = pca.transform(unlabeled_data)
min_length = min(len(unlabeled_data_pca), len(labeled_data_pca))
unlabeled_data_pca = unlabeled_data_pca[:min_length]
labeled_data_pca = labeled_data_pca[:min_length]

# PCA transformation
unlabeled_data_pca = pca.transform(X_test_origin[len(y_test_labeled):])
labeled_data_pca = X_train
y_train_true = y_train

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
unlabeled_data_pca = torch.from_numpy(unlabeled_data_pca).to(device)
labeled_data_pca = torch.from_numpy(labeled_data_pca).to(device)
y_train_true = torch.tensor(y_train_true).to(device)

# Assuming you have an implementation for SemiSupKMeans:
semi_sup_kmeans = SemiSupKMeans(k=8, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit_mix(unlabeled_data_pca, labeled_data_pca, y_train_true)

# Get cluster labels for training data
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()

# Map cluster labels back to original labels
y_train_pred = np.empty_like(y_train_true.cpu().numpy())
y_test_pred = np.empty_like(y_test_true)

unique_clusters = np.unique(cluster_labels)
for cluster in unique_clusters:
    mask_train = cluster_labels[:len(y_train_true)] == cluster
    mask_test = cluster_labels[len(y_train_true):] == cluster
    common_label = mode(y_train_true.cpu().numpy()[mask_train]) if mask_train.sum() > 0 else -1
    y_train_pred[mask_train] = common_label
    y_test_pred[mask_test] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true.cpu().numpy(), y_train_pred)
test_accuracy = accuracy_score(y_test_true, y_test_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')
print(f'Testing accuracy: {test_accuracy * 100:.2f}%')

# Combine true labels of training and testing sets
y_true_combined = np.concatenate((y_train_true.cpu().numpy(), y_test_true))
y_pred_combined = np.concatenate((y_train_pred, y_test_pred))

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)
print(f'Overall accuracy: {overall_accuracy * 100:.2f}%')

# Known label comparison visualization
plt.figure(figsize=(10, 7))
plt.scatter(labeled_data_pca.cpu().numpy()[:, 0], labeled_data_pca.cpu().numpy()[:, 1], c=y_train_true.cpu().numpy(), cmap='rainbow', marker='o', label='True Labels')
plt.scatter(labeled_data_pca.cpu().numpy()[:, 0], labeled_data_pca.cpu().numpy()[:, 1], c=y_train_pred, cmap='rainbow', marker='x', label='Predicted Labels')
plt.title('Known Label Comparison (True label vs Predicted label)')
plt.legend()
plt.colorbar()
plt.show()


NameError: ignored

# Indian Pines

## Kmeans / SVM+Kmeans

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    print(data.keys())  # Add this line
    return data['indian_pines_corrected']


def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

# https://www.ehu.eus/ccwintco/index.php/Hyperspectral_Remote_Sensing_Scenes
data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_classes = [2, 4, 5, 8, 9, 10, 13, 14]
unknown_classes = [1, 3, 6, 7, 11, 12, 15, 16]
 # we need all classes without 0 class (because 0 is not class label)
all_classes = np.concatenate((known_classes,unknown_classes), axis=0)

all_data = flattened_data[np.isin(flattened_gt, all_classes)]
all_labels = flattened_gt[np.isin(flattened_gt, all_classes)]

known_data = all_data[np.isin(all_labels, known_classes)]
known_labels = all_labels[np.isin(all_labels, known_classes)]
unknown_data = all_data[np.isin(all_labels, unknown_classes)]
unknown_labels = all_labels[np.isin(all_labels, unknown_classes)]

# PCA: before OR after the splitting?
# In this case: after the splitting

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled,unknown_data), axis=0)
y_test = np.concatenate((y_test_labeled,unknown_labels), axis=0)

trainN=len(y_train)
testN1=len(y_test_labeled)
testN2=len(unknown_labels)

print(min_value)
print(max_value)
unique, counts = np.unique(flattened_gt, return_counts=True)
print(np.asarray((unique, counts)).T)
print("   ")
print(data.shape)
print(data.shape[0]," x ", data.shape[1], " = ", data.shape[0]*data.shape[1])
print(known_data.shape)
print(known_labels.shape)
print("  ")
print(X_train_origin.shape)
print(X_test_origin.shape)
print(trainN)
print(testN1)
print(testN2)

dict_keys(['__header__', '__version__', '__globals__', 'indian_pines_corrected'])
955
9604
[[    0 10776]
 [    1    46]
 [    2  1428]
 [    3   830]
 [    4   237]
 [    5   483]
 [    6   730]
 [    7    28]
 [    8   478]
 [    9    20]
 [   10   972]
 [   11  2455]
 [   12   593]
 [   13   205]
 [   14  1265]
 [   15   386]
 [   16    93]]
   
(145, 145, 200)
145  x  145  =  21025
(5088, 200)
(5088,)
  
(4070, 200)
(6179, 200)
4070
1018
5161


In [None]:
pca = PCA(n_components=30)  # are the results worse without PCA?
pca.fit(X_train_origin)
X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)  # Transform all data using PCA

print(X_train.shape)
print(X_test.shape)

(4070, 30)
(6179, 30)


In [None]:
# Spectral Clustering with 16 clusters (adjust number of clusters if needed)
#clustering = SpectralClustering(n_clusters=6, assign_labels='kmeans', random_state=42)
clustering = KMeans(n_clusters=16, random_state=42)
all_data_pca = np.vstack([X_train, X_test])
all_cluster_labels = clustering.fit_predict(all_data_pca)

unique, counts = np.unique(all_cluster_labels, return_counts=True)
print(unique, counts)



[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] [ 847  467  799  496 1478  177  551  451  453   78 1573  750  624  594
  511  400]


In [None]:
# Assign all_cluster_labels with class_labels in X_train
import statistics
from statistics import mode

all_cluster_labels_train = all_cluster_labels[:trainN]
all_cluster_labels_test = all_cluster_labels[trainN:]

all_cluster_labels_a = all_cluster_labels_train[np.isin(y_train, known_classes[0])]
all_cluster_labels_b = all_cluster_labels_train[np.isin(y_train, known_classes[1])]
all_cluster_labels_c = all_cluster_labels_train[np.isin(y_train, known_classes[2])]
print(len(all_cluster_labels_a))
print(len(all_cluster_labels_b))
print(len(all_cluster_labels_c))

# print(statistics.mode(all_cluster_labels_a))
# print(statistics.mode(all_cluster_labels_b))
# print(statistics.mode(all_cluster_labels_c))

# Statistics of Class1, Class10, Class11 with different clusters:
# print(np.unique(all_cluster_labels_a, return_counts=True))
# print(np.unique(all_cluster_labels_b, return_counts=True))
# print(np.unique(all_cluster_labels_c, return_counts=True))

AssignMatrix=np.zeros((3, 6))
for i in range(6):
  AssignMatrix[0][i]=list(all_cluster_labels_a).count(i)
for i in range(6):
  AssignMatrix[1][i]=list(all_cluster_labels_b).count(i)
for i in range(6):
  AssignMatrix[2][i]=list(all_cluster_labels_c).count(i)
print(AssignMatrix)

# Instead of counts the relative ratio is better:
for i in range(6):
  AssignMatrix[0][i]=list(all_cluster_labels_a).count(i)/len(all_cluster_labels_a)
for i in range(6):
  AssignMatrix[1][i]=list(all_cluster_labels_b).count(i)/len(all_cluster_labels_b)
for i in range(6):
  AssignMatrix[2][i]=list(all_cluster_labels_c).count(i)/len(all_cluster_labels_c)
print(AssignMatrix)

1152
185
385
[[105.   0. 240.   1. 194.   8.]
 [  1.   0.  25.   0.   4.   0.]
 [  0.   6.   0.  33.   1.   0.]]
[[0.09114583 0.         0.20833333 0.00086806 0.16840278 0.00694444]
 [0.00540541 0.         0.13513514 0.         0.02162162 0.        ]
 [0.         0.01558442 0.         0.08571429 0.0025974  0.        ]]


In [None]:
# Hungarian method for best (maximized) assignment
from scipy.optimize import linear_sum_assignment
row_ind, col_ind = linear_sum_assignment(AssignMatrix, maximize='true')
print(col_ind)

# col_ind[0] is for class1, col_ind[1] is for class10, col_ind[2] is for class11

[4 2 3]


In [None]:
# 1. method
# Using only clustering and assigment between the known classes and clusters
# inference: use the cluster ID for classification and clustering
# instance in X_test is predicted based on the instance's cluster ID
# E.g.:
# if 0 then this belongs to Cluster0
# if 1 then this belongs to Class1
# if 2 then this belongs to Class11
# if 3 then this belongs to Cluster3
# if 4 then this belongs to Cluster4
# if 5 then this belongs to Class10

# Use KMeans clustering on the training data
n_clusters = len(known_classes)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train_origin)

# Predict cluster IDs for all data
all_cluster_labels = kmeans.predict(all_data)

# A mapping between cluster IDs and known classes
# This mapping needs to be determined based on domain knowledge or some logic
# Here's a dummy mapping as an example:
col_ind = [0, 1, 2]  # map cluster 0 to known_classes[0], cluster 1 to known_classes[1], etc.

i = 0
while i < len(all_cluster_labels):
    for j in range(len(col_ind)):
        if all_cluster_labels[i] == col_ind[j]:
            all_cluster_labels[i] = known_classes[j]
    i += 1

y_test_labeled_Predicted = all_cluster_labels[trainN:trainN+testN1]
unknown_labels_Predicted = all_cluster_labels[trainN+testN1:trainN+testN1+testN2]

print(len(y_test_labeled_Predicted))
print(len(unknown_labels_Predicted))




1358
3463


In [None]:
# 2. method
# Use an extended classifier, where the rest of clusters added as dummy classes
# a. Training a classifier with:
#    Train set with ground true labels + Subset of Test set (belonging to rest clusters)
# b. Test phase of the classifier, where inference is the same as in 1. method

rest_clusters=list(range(len(all_classes)))
for i in range (len(col_ind)):
  rest_clusters.remove(col_ind[i])
print(rest_clusters)

X_test_subset=X_test[np.isin(all_cluster_labels_test, rest_clusters)]
y_test_subset=y_test[np.isin(all_cluster_labels_test, rest_clusters)]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = svm_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:testN1]
unknown_labels_Predicted2 = y_test_pred2[testN1:]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))# Use KMeans clustering on the training data
n_clusters = len(all_classes)  # clustering into total number of classes
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train_origin)

# Predict cluster IDs for all data
all_cluster_labels_train = kmeans.predict(X_train_origin)
all_cluster_labels_test = kmeans.predict(X_test_origin)


[0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


KeyboardInterrupt: ignored

In [None]:
# Use KMeans clustering on the training data
n_clusters = len(all_classes)  # clustering into total number of classes
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train_origin)

# Predict cluster IDs for all data
all_cluster_labels_train = kmeans.predict(X_train_origin)
all_cluster_labels_test = kmeans.predict(X_test_origin)

# A mapping between cluster IDs and known classes
# This mapping needs to be determined based on domain knowledge or some logic
# Here's a dummy mapping as an example:
col_ind = [0, 1, 2]  # map cluster 0 to known_classes[0], cluster 1 to known_classes[1], etc.

rest_clusters = list(range(len(all_classes)))
for i in range(len(col_ind)):
    rest_clusters.remove(col_ind[i])
print(rest_clusters)

X_test_subset = X_test_origin[np.isin(all_cluster_labels_test, rest_clusters)]
y_test_subset = y_test[np.isin(all_cluster_labels_test, rest_clusters)]

# Combine training data with subset of test data from the 'rest' clusters
all_data_combined = np.vstack([X_train_origin, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

# Predict using the trained SVM
y_test_pred2 = svm_classifier.predict(X_test_origin)
y_test_labeled_Predicted2 = y_test_pred2[:testN1]
unknown_labels_Predicted2 = y_test_pred2[testN1:]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))



[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
1358
3463


In [None]:
# ----------------------------------------------------------------
# Evaluation of the methods
# metrics from https://www.geeksforgeeks.org/clustering-performance-evaluation-in-scikit-learn/

from sklearn.metrics import adjusted_rand_score, rand_score, fowlkes_mallows_score, silhouette_score, davies_bouldin_score, mutual_info_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import accuracy_score

print("Results of the 1. method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted))
print(" ")
print("Results of the 2. method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted2))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted2))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted2))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted2))

Results of the 1. method
accuracy:  0.0625920471281296
rand index:  0.4671171831691732
adjusted rand index:  0.00022614368110274738
fowlkes mallows :  0.3588729668439119
 
Results of the 2. method
accuracy:  0.46244477172312226
rand index:  0.8470791246507396
adjusted rand index:  0.6256625341388995
fowlkes mallows :  0.7393483181624583


## NN + Kmeans

In [None]:
# 2. method
# Use an extended classifier, where the rest of clusters added as dummy classes
# a. Training a classifier with:
#    Train set with ground true labels + Subset of Test set (belonging to rest clusters)
# b. Test phase of the classifier, where inference is the same as in 1. method
from sklearn.metrics import adjusted_rand_score, rand_score, fowlkes_mallows_score, silhouette_score, davies_bouldin_score, mutual_info_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

rest_clusters = list(range(len(all_classes)))
for i in range(len(col_ind)):
    rest_clusters.remove(col_ind[i])
print(rest_clusters)

X_test_subset = X_test[np.isin(all_cluster_labels_test, rest_clusters)]
y_test_subset = y_test[np.isin(all_cluster_labels_test, rest_clusters)]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the Neural Network (MLP) using combined data and labels
nn_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=1)
nn_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = nn_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:testN1]
unknown_labels_Predicted2 = y_test_pred2[testN1:]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

# (Remaining unchanged part)
# Use KMeans clustering on the training data
n_clusters = len(all_classes)  # clustering into total number of classes
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train_origin)

# Predict cluster IDs for all data
all_cluster_labels_train = kmeans.predict(X_train_origin)
all_cluster_labels_test = kmeans.predict(X_test_origin)


print("Results of the method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted2))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted2))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted2))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted2))



[0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


1018
5161
Results of the method
accuracy:  0.8261296660117878
rand index:  0.7657880586209331
adjusted rand index:  0.343847847548293
fowlkes mallows :  0.5007345005796782


## Spectral Clustering / Spectral Clustering + SVM

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    print(data.keys())  # Add this line
    return data['indian_pines_corrected']


def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

# https://www.ehu.eus/ccwintco/index.php/Hyperspectral_Remote_Sensing_Scenes
data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_classes = [2, 4, 5, 8, 9, 10, 13, 14]
unknown_classes = [1, 3, 6, 7, 11, 12, 15, 16]
 # we need all classes without 0 class (because 0 is not class label)
all_classes = np.concatenate((known_classes,unknown_classes), axis=0)

all_data = flattened_data[np.isin(flattened_gt, all_classes)]
all_labels = flattened_gt[np.isin(flattened_gt, all_classes)]

known_data = all_data[np.isin(all_labels, known_classes)]
known_labels = all_labels[np.isin(all_labels, known_classes)]
unknown_data = all_data[np.isin(all_labels, unknown_classes)]
unknown_labels = all_labels[np.isin(all_labels, unknown_classes)]

# PCA: before OR after the splitting?
# In this case: after the splitting

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled,unknown_data), axis=0)
y_test = np.concatenate((y_test_labeled,unknown_labels), axis=0)

trainN=len(y_train)
testN1=len(y_test_labeled)
testN2=len(unknown_labels)

print(min_value)
print(max_value)
unique, counts = np.unique(flattened_gt, return_counts=True)
print(np.asarray((unique, counts)).T)
print("   ")
print(data.shape)
print(data.shape[0]," x ", data.shape[1], " = ", data.shape[0]*data.shape[1])
print(known_data.shape)
print(known_labels.shape)
print("  ")
print(X_train_origin.shape)
print(X_test_origin.shape)
print(trainN)
print(testN1)
print(testN2)

dict_keys(['__header__', '__version__', '__globals__', 'indian_pines_corrected'])
955
9604
[[    0 10776]
 [    1    46]
 [    2  1428]
 [    3   830]
 [    4   237]
 [    5   483]
 [    6   730]
 [    7    28]
 [    8   478]
 [    9    20]
 [   10   972]
 [   11  2455]
 [   12   593]
 [   13   205]
 [   14  1265]
 [   15   386]
 [   16    93]]
   
(145, 145, 200)
145  x  145  =  21025
(5088, 200)
(5088,)
  
(4070, 200)
(6179, 200)
4070
1018
5161


In [None]:
pca = PCA(n_components=30)  # are the results worse without PCA?
pca.fit(X_train_origin)
X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)  # Transform all data using PCA

print(X_train.shape)
print(X_test.shape)

(4070, 30)
(6179, 30)


In [None]:
# Spectral Clustering with 16 clusters (adjust number of clusters if needed)
clustering = SpectralClustering(n_clusters=16, assign_labels='kmeans', random_state=42)
# clustering = KMeans(n_clusters=6, random_state=42)
all_data_pca = np.vstack([X_train, X_test])
all_cluster_labels = clustering.fit_predict(all_data_pca)

unique, counts = np.unique(all_cluster_labels, return_counts=True)
print(unique, counts)

[0 1 2 3 4 5] [2103 5024  891  595 1145  491]


In [None]:
# Assign all_cluster_labels with class_labels in X_train
import statistics
from statistics import mode

all_cluster_labels_train = all_cluster_labels[:trainN]
all_cluster_labels_test = all_cluster_labels[trainN:]

all_cluster_labels_a = all_cluster_labels_train[np.isin(y_train, known_classes[0])]
all_cluster_labels_b = all_cluster_labels_train[np.isin(y_train, known_classes[1])]
all_cluster_labels_c = all_cluster_labels_train[np.isin(y_train, known_classes[2])]
print(len(all_cluster_labels_a))
print(len(all_cluster_labels_b))
print(len(all_cluster_labels_c))

# print(statistics.mode(all_cluster_labels_a))
# print(statistics.mode(all_cluster_labels_b))
# print(statistics.mode(all_cluster_labels_c))

# Statistics of Class1, Class10, Class11 with different clusters:
# print(np.unique(all_cluster_labels_a, return_counts=True))
# print(np.unique(all_cluster_labels_b, return_counts=True))
# print(np.unique(all_cluster_labels_c, return_counts=True))

AssignMatrix=np.zeros((3, 6))
for i in range(6):
  AssignMatrix[0][i]=list(all_cluster_labels_a).count(i)
for i in range(6):
  AssignMatrix[1][i]=list(all_cluster_labels_b).count(i)
for i in range(6):
  AssignMatrix[2][i]=list(all_cluster_labels_c).count(i)
print(AssignMatrix)

# Instead of counts the relative ratio is better:
for i in range(6):
  AssignMatrix[0][i]=list(all_cluster_labels_a).count(i)/len(all_cluster_labels_a)
for i in range(6):
  AssignMatrix[1][i]=list(all_cluster_labels_b).count(i)/len(all_cluster_labels_b)
for i in range(6):
  AssignMatrix[2][i]=list(all_cluster_labels_c).count(i)/len(all_cluster_labels_c)
print(AssignMatrix)

1152
185
385
[[ 17. 727. 345.   0.   0.  63.]
 [ 51.  45.  71.   0.   0.  18.]
 [122.   3.   0. 172.  85.   3.]]
[[0.01475694 0.63107639 0.29947917 0.         0.         0.0546875 ]
 [0.27567568 0.24324324 0.38378378 0.         0.         0.0972973 ]
 [0.31688312 0.00779221 0.         0.44675325 0.22077922 0.00779221]]


In [None]:
# Hungarian method for best (maximized) assignment
from scipy.optimize import linear_sum_assignment
row_ind, col_ind = linear_sum_assignment(AssignMatrix, maximize='true')
print(col_ind)

# col_ind[0] is for class1, col_ind[1] is for class10, col_ind[2] is for class11

[1 2 3]


In [None]:

# Method 1
from sklearn.cluster import SpectralClustering

# Use Spectral Clustering on all data
n_clusters = len(all_classes)
clustering = SpectralClustering(n_clusters=n_clusters, assign_labels='kmeans', random_state=42)

all_cluster_labels = clustering.fit_predict(all_data)  # directly fit and predict on all_data

col_ind = [0, 1, 2]
for i in range(len(all_cluster_labels)):
    for j in range(len(col_ind)):
        if all_cluster_labels[i] == col_ind[j]:
            all_cluster_labels[i] = known_classes[j]

y_test_labeled_Predicted = all_cluster_labels[trainN:trainN+testN1]
unknown_labels_Predicted = all_cluster_labels[trainN+testN1:trainN+testN1+testN2]

print(len(y_test_labeled_Predicted))
print(len(unknown_labels_Predicted))




1018
5161


In [None]:
# Method 2
from sklearn.cluster import SpectralClustering

# Use Spectral Clustering on all data
n_clusters = len(all_classes)
clustering = SpectralClustering(n_clusters=n_clusters, assign_labels='kmeans', random_state=42)

all_cluster_labels = clustering.fit_predict(all_data)  # directly fit and predict on all_data

# A mapping between cluster IDs and known classes
col_ind = [0, 1, 2]
rest_clusters = list(set(all_cluster_labels) - set(col_ind))

X_test_subset = X_test_origin[np.isin(all_cluster_labels[trainN:], rest_clusters)]
y_test_subset = y_test[np.isin(all_cluster_labels[trainN:], rest_clusters)]

# Combine training data with subset of test data from the 'rest' clusters
all_data_combined = np.vstack([X_train_origin, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

# Predict using the trained SVM
y_test_pred2 = svm_classifier.predict(X_test_origin)
y_test_labeled_Predicted2 = y_test_pred2[:testN1]
unknown_labels_Predicted2 = y_test_pred2[testN1:]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))




1018
5161


In [None]:
# ----------------------------------------------------------------
# Evaluation of the methods
# metrics from https://www.geeksforgeeks.org/clustering-performance-evaluation-in-scikit-learn/

from sklearn.metrics import adjusted_rand_score, rand_score, fowlkes_mallows_score, silhouette_score, davies_bouldin_score, mutual_info_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import accuracy_score

print("Results of the 1. method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted))
print(" ")
print("Results of the 2. method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted2))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted2))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted2))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted2))

Results of the 1. method
accuracy:  0.06974459724950884
rand index:  0.6432351911849306
adjusted rand index:  0.009069313995271779
fowlkes mallows :  0.2265083771343133
 
Results of the 2. method
accuracy:  0.5599214145383105
rand index:  0.7222341382671768
adjusted rand index:  0.41177491173299174
fowlkes mallows :  0.6289327029563248


## NN + Spectral Clustering

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    print(data.keys())  # Add this line
    return data['indian_pines_corrected']


def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

# https://www.ehu.eus/ccwintco/index.php/Hyperspectral_Remote_Sensing_Scenes
data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_classes = [1, 2, 3, 4, 9, 10, 11, 12, 13]
unknown_classes = [5, 6, 7, 8, 14, 15, 16]
 # we need all classes without 0 class (because 0 is not class label)
all_classes = np.concatenate((known_classes,unknown_classes), axis=0)

all_data = flattened_data[np.isin(flattened_gt, all_classes)]
all_labels = flattened_gt[np.isin(flattened_gt, all_classes)]

known_data = all_data[np.isin(all_labels, known_classes)]
known_labels = all_labels[np.isin(all_labels, known_classes)]
unknown_data = all_data[np.isin(all_labels, unknown_classes)]
unknown_labels = all_labels[np.isin(all_labels, unknown_classes)]

# PCA: before OR after the splitting?
# In this case: after the splitting

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled,unknown_data), axis=0)
y_test = np.concatenate((y_test_labeled,unknown_labels), axis=0)

trainN=len(y_train)
testN1=len(y_test_labeled)
testN2=len(unknown_labels)

print(min_value)
print(max_value)
unique, counts = np.unique(flattened_gt, return_counts=True)
print(np.asarray((unique, counts)).T)
print("   ")
print(data.shape)
print(data.shape[0]," x ", data.shape[1], " = ", data.shape[0]*data.shape[1])
print(known_data.shape)
print(known_labels.shape)
print("  ")
print(X_train_origin.shape)
print(X_test_origin.shape)
print(trainN)
print(testN1)
print(testN2)

dict_keys(['__header__', '__version__', '__globals__', 'indian_pines_corrected'])
955
9604
[[    0 10776]
 [    1    46]
 [    2  1428]
 [    3   830]
 [    4   237]
 [    5   483]
 [    6   730]
 [    7    28]
 [    8   478]
 [    9    20]
 [   10   972]
 [   11  2455]
 [   12   593]
 [   13   205]
 [   14  1265]
 [   15   386]
 [   16    93]]
   
(145, 145, 200)
145  x  145  =  21025
(6786, 200)
(6786,)
  
(5428, 200)
(4821, 200)
5428
1358
3463


In [None]:
pca = PCA(n_components=30)  # are the results worse without PCA?
pca.fit(X_train_origin)
X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)  # Transform all data using PCA

print(X_train.shape)
print(X_test.shape)

(5428, 30)
(4821, 30)


In [None]:
# Spectral Clustering with 16 clusters (adjust number of clusters if needed)
clustering = SpectralClustering(n_clusters=16, assign_labels='kmeans', random_state=42)
# clustering = KMeans(n_clusters=6, random_state=42)
all_data_pca = np.vstack([X_train, X_test])
all_cluster_labels = clustering.fit_predict(all_data_pca)

unique, counts = np.unique(all_cluster_labels, return_counts=True)
print(unique, counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] [ 490 2853    2  678   76  831  558  111    5 2351  373   67  640  479
  185  550]


In [None]:
# Assign all_cluster_labels with class_labels in X_train
import statistics
from statistics import mode

all_cluster_labels_train = all_cluster_labels[:trainN]
all_cluster_labels_test = all_cluster_labels[trainN:]

all_cluster_labels_a = all_cluster_labels_train[np.isin(y_train, known_classes[0])]
all_cluster_labels_b = all_cluster_labels_train[np.isin(y_train, known_classes[1])]
all_cluster_labels_c = all_cluster_labels_train[np.isin(y_train, known_classes[2])]
print(len(all_cluster_labels_a))
print(len(all_cluster_labels_b))
print(len(all_cluster_labels_c))

# print(statistics.mode(all_cluster_labels_a))
# print(statistics.mode(all_cluster_labels_b))
# print(statistics.mode(all_cluster_labels_c))

# Statistics of Class1, Class10, Class11 with different clusters:
# print(np.unique(all_cluster_labels_a, return_counts=True))
# print(np.unique(all_cluster_labels_b, return_counts=True))
# print(np.unique(all_cluster_labels_c, return_counts=True))

AssignMatrix=np.zeros((3, 6))
for i in range(6):
  AssignMatrix[0][i]=list(all_cluster_labels_a).count(i)
for i in range(6):
  AssignMatrix[1][i]=list(all_cluster_labels_b).count(i)
for i in range(6):
  AssignMatrix[2][i]=list(all_cluster_labels_c).count(i)
print(AssignMatrix)

# Instead of counts the relative ratio is better:
for i in range(6):
  AssignMatrix[0][i]=list(all_cluster_labels_a).count(i)/len(all_cluster_labels_a)
for i in range(6):
  AssignMatrix[1][i]=list(all_cluster_labels_b).count(i)/len(all_cluster_labels_b)
for i in range(6):
  AssignMatrix[2][i]=list(all_cluster_labels_c).count(i)/len(all_cluster_labels_c)
print(AssignMatrix)

39
1158
663
[[  0.   0.   0.   0.   0.   1.]
 [  3. 336.   0.   0.   0.   4.]
 [  0. 334.   0.   0.   0.   0.]]
[[0.         0.         0.         0.         0.         0.02564103]
 [0.00259067 0.29015544 0.         0.         0.         0.00345423]
 [0.         0.50377074 0.         0.         0.         0.        ]]


In [None]:
from sklearn.cluster import SpectralClustering
from sklearn.neural_network import MLPClassifier  # Importing MLPClassifier

# Use Spectral Clustering on all data
n_clusters = len(all_classes)
clustering = SpectralClustering(n_clusters=n_clusters, assign_labels='kmeans', random_state=42)

all_cluster_labels = clustering.fit_predict(all_data)  # directly fit and predict on all_data

# A mapping between cluster IDs and known classes
col_ind = [0, 1, 2]
rest_clusters = list(set(all_cluster_labels) - set(col_ind))

X_test_subset = X_test_origin[np.isin(all_cluster_labels[trainN:], rest_clusters)]
y_test_subset = y_test[np.isin(all_cluster_labels[trainN:], rest_clusters)]

# Combine training data with subset of test data from the 'rest' clusters
all_data_combined = np.vstack([X_train_origin, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the NN using combined data and labels
nn_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=1)  # Create MLP classifier
nn_classifier.fit(all_data_combined, all_data_labels_combined)  # Train the classifier

# Predict using the trained NN
y_test_pred2 = nn_classifier.predict(X_test_origin)
y_test_labeled_Predicted2 = y_test_pred2[:testN1]
unknown_labels_Predicted2 = y_test_pred2[testN1:]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))


1358
3463


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
print("Results of the NN Spectral method")
print("accuracy: ", accuracy_score(y_test_labeled, y_test_labeled_Predicted2))
print("rand index: ", rand_score(unknown_labels, unknown_labels_Predicted2))
print("adjusted rand index: ", adjusted_rand_score(unknown_labels, unknown_labels_Predicted2))
print("fowlkes mallows : ", fowlkes_mallows_score(unknown_labels, unknown_labels_Predicted2))


Results of the NN Spectral method
accuracy:  0.508100147275405
rand index:  0.8657433797545832
adjusted rand index:  0.6468001372349084
fowlkes mallows :  0.739119693631832


# Semi Kmeans

Split based on sample size

In [None]:
!git clone https://github.com/sgvaze/generalized-category-discovery.git

Cloning into 'generalized-category-discovery'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 76 (delta 13), reused 53 (delta 7), pack-reused 0[K
Receiving objects: 100% (76/76), 3.70 MiB | 8.97 MiB/s, done.
Resolving deltas: 100% (13/13), done.


In [None]:
!mv generalized-category-discovery gcd

# After executing this command you will get another error, to solve the new error do the following:
# 1- open "faster_mix_k_means_pytorch" file in the notebook
# 2- change the 4th line as the following:
# old: from project_utils.cluster_utils import cluster_acc
# new: from gcd.project_utils.cluster_utils import cluster_acc
# 3- save the file

In [None]:
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15]
unknown_classes = [1, 7, 9, 16]

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Set k to the number of known classes
k_clusters = len(known_classes)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=k_clusters, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")



(8049, 30) (2200, 30) (8049,) (2013,)
Training accuracy: 51.42%
Accuracy: 51.42%
Rand Index: 0.2102
Adjusted Rand Index: 0.3062
Fowlkes-Mallows Index: 0.4670


In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15]
unknown_classes = [1, 7, 9, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")

(8049, 30) (2200, 30) (8049,) (2013,)
Training accuracy: 52.07%
Accuracy: 52.07%
Rand Index: 0.1801
Adjusted Rand Index: 0.3215
Fowlkes-Mallows Index: 0.4531


Based on feature type

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [5, 6, 14]
unknown_classes = [1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 15, 16]

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Set k to the number of known classes
k_clusters = len(known_classes)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=k_clusters, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")

(1982, 30) (496, 30) (1982,) (496,)
(7771, 30) (2478, 30)
(1982, 30) (8267, 30) (1982,) (496,)
Training accuracy: 74.17%
Accuracy: 74.17%
Rand Index: 0.4826
Adjusted Rand Index: 0.4468
Fowlkes-Mallows Index: 0.7175


In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [5, 6, 14]
unknown_classes = [1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")

(1982, 30) (8267, 30) (1982,) (496,)
Training accuracy: 86.73%
Accuracy: 86.73%
Rand Index: 0.3908
Adjusted Rand Index: 0.6732
Fowlkes-Mallows Index: 0.8009


Random division 1

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [2, 4, 5, 8, 9, 10, 13, 14]
unknown_classes = [1, 3, 6, 7, 11, 12, 15, 16]

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Set k to the number of known classes
k_clusters = len(known_classes)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=k_clusters, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")

(1982, 30) (496, 30) (1982,) (496,)
(7771, 30) (2478, 30)
(4070, 30) (6179, 30) (4070,) (1018,)
Training accuracy: 68.16%
Accuracy: 68.16%
Rand Index: 0.2248
Adjusted Rand Index: 0.5117
Fowlkes-Mallows Index: 0.6240


In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [2, 4, 5, 8, 9, 10, 13, 14]
unknown_classes = [1, 3, 6, 7, 11, 12, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")

(4070, 30) (6179, 30) (4070,) (1018,)
Training accuracy: 70.96%
Accuracy: 70.96%
Rand Index: 0.2340
Adjusted Rand Index: 0.5463
Fowlkes-Mallows Index: 0.6573


Random division2

In [None]:

import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [1, 2, 5, 6, 8, 11, 13, 15]
unknown_classes = [3, 4, 7, 9, 10, 12, 14, 16]

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Set k to the number of known classes
k_clusters = len(known_classes)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=k_clusters, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")

(4968, 30) (5281, 30) (4968,) (1243,)
Training accuracy: 68.08%
Accuracy: 68.08%
Rand Index: 0.2989
Adjusted Rand Index: 0.5092
Fowlkes-Mallows Index: 0.6631


In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [1, 2, 5, 6, 8, 11, 13, 15]
unknown_classes = [3, 4, 7, 9, 10, 12, 14, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")


(4968, 30) (5281, 30) (4968,) (1243,)
Training accuracy: 70.09%
Accuracy: 70.09%
Rand Index: 0.2657
Adjusted Rand Index: 0.5154
Fowlkes-Mallows Index: 0.6459


Crop-based divisions:

In [None]:

import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [1, 2, 3, 4, 9, 10, 11, 12, 13]
unknown_classes = [5, 6, 7, 8, 14, 15, 16]

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Set k to the number of known classes
k_clusters = len(known_classes)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=k_clusters, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")

(5428, 30) (4821, 30) (5428,) (1358,)
Training accuracy: 44.93%
Accuracy: 44.93%
Rand Index: 0.3149
Adjusted Rand Index: 0.1244
Fowlkes-Mallows Index: 0.3931


In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from sklearn.metrics import confusion_matrix
from scipy.special import comb

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [1, 2, 3, 4, 9, 10, 11, 12, 13]
unknown_classes = [5, 6, 7, 8, 14, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

print(X_train.shape, X_test.shape, y_train.shape, y_test_labeled.shape)

y_train_true = y_train.copy()
y_test_true = y_test_labeled.copy()

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Map cluster labels back to original labels
cluster_labels = semi_sup_kmeans.labels_.cpu().numpy()
unique_clusters = np.unique(cluster_labels)

y_train_pred = np.empty_like(y_train)

for cluster in unique_clusters:
    mask = (cluster_labels == cluster)
    if mask.sum() > 0:
        common_label_mode = mode(y_train_true[mask]).mode
        common_label = common_label_mode[0] if np.ndim(common_label_mode) > 0 else common_label_mode
        y_train_pred[mask] = common_label

# Calculate and print accuracy
train_accuracy = accuracy_score(y_train_true, y_train_pred)
print(f'Training accuracy: {train_accuracy * 100:.2f}%')

# Combine the true labels and predicted labels
y_true_combined = y_train_true
y_pred_combined = y_train_pred

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

# Calculate metrics
ari = adjusted_rand_score(y_true_combined, y_pred_combined)
fmi = fowlkes_mallows_score(y_true_combined, y_pred_combined)

def rand_index_score(y_true, y_pred):
    # Create a confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Calculate values
    a = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=1)])
    b = 0.5 * np.sum([comb(k, 2) for k in cm.sum(axis=0)])
    c = 0.5 * np.sum([comb(k, 2) for k in np.ravel(cm)])
    n = comb(len(y_true), 2)

    # Compute the Rand Index
    ri = (a + b) / n
    return ri

# Calculate Rand Index
ri = rand_index_score(y_true_combined, y_pred_combined)

print(f"Accuracy: {overall_accuracy * 100:.2f}%")
print(f"Rand Index: {ri:.4f}")
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Fowlkes-Mallows Index: {fmi:.4f}")


(5428, 30) (4821, 30) (5428,) (1358,)
Training accuracy: 46.55%
Accuracy: 46.55%
Rand Index: 0.3107
Adjusted Rand Index: 0.1364
Fowlkes-Mallows Index: 0.3990


# `SEMI Kmeans`+ SVM

Crop-based divisions:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix


def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [1, 2, 3, 4, 9, 10, 11, 12, 13]
unknown_classes = [5, 6, 7, 8, 14, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
# Assuming the closest centroids can be our cluster prediction for test data.
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

# Create a boolean index based on the test labels
mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)

# Use the boolean index to subset the test data
X_test_subset = X_test[mask_test_subset]

# Now, since X_test_subset might contain data both from X_test_labeled and unknown_data,
# we should also subset the labels. To do this, we create a combined y_test which covers all data in X_test.
# Here, we assign a label of -1 for unknown_data, just to keep the shape consistent.
# NOTE: These -1 labels are not used for training the SVM, they are just placeholders.
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])

# Now, we can use mask_test_subset to subset this combined y_test.
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = svm_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

# Real tags include real tags for known categories and real tags for unknown categories
y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

# Calculate assessment indicators
accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    """Compute the Rand Index."""
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    """The number of combinations of n things taken 2 at a time."""
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)  # Fixed this line
print("Fowlkes Mallows Score:", fms)  # Fixed this line

1358
3463
Rand Index: 0.2062372154765192
Accuracy: 0.6539027982326951
Adjusted Rand Index: 0.2062372154765192
Fowlkes Mallows Score: 0.4727867332573832
Fowlkes Mallows Score: 0.4918697609785762


Random division 2:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix


def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [1, 2, 5, 6, 8, 11, 13, 15]
unknown_classes = [3, 4, 7, 9, 10, 12, 14, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
# Assuming the closest centroids can be our cluster prediction for test data.
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

# Create a boolean index based on the test labels
mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)

# Use the boolean index to subset the test data
X_test_subset = X_test[mask_test_subset]

# Now, since X_test_subset might contain data both from X_test_labeled and unknown_data,
# we should also subset the labels. To do this, we create a combined y_test which covers all data in X_test.
# Here, we assign a label of -1 for unknown_data, just to keep the shape consistent.
# NOTE: These -1 labels are not used for training the SVM, they are just placeholders.
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])

# Now, we can use mask_test_subset to subset this combined y_test.
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = svm_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

# Real tags include real tags for known categories and real tags for unknown categories
y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

# Calculate assessment indicators
accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    """Compute the Rand Index."""
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    """The number of combinations of n things taken 2 at a time."""
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)  # Fixed this line
print("Fowlkes Mallows Score:", fms)  # Fixed this line

1243
4038
Rand Index: 0.1343409205399869
Accuracy: 0.7691069991954947
Adjusted Rand Index: 0.13434092053998692
Fowlkes Mallows Score: 0.3468427092572668


Based on feature type:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix


def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [5, 6, 14]
unknown_classes = [1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
# Assuming the closest centroids can be our cluster prediction for test data.
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

# Create a boolean index based on the test labels
mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)

# Use the boolean index to subset the test data
X_test_subset = X_test[mask_test_subset]

# Now, since X_test_subset might contain data both from X_test_labeled and unknown_data,
# we should also subset the labels. To do this, we create a combined y_test which covers all data in X_test.
# Here, we assign a label of -1 for unknown_data, just to keep the shape consistent.
# NOTE: These -1 labels are not used for training the SVM, they are just placeholders.
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])

# Now, we can use mask_test_subset to subset this combined y_test.
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = svm_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

# Real tags include real tags for known categories and real tags for unknown categories
y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

# Calculate assessment indicators
accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    """Compute the Rand Index."""
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    """The number of combinations of n things taken 2 at a time."""
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)  # Fixed this line
print("Fowlkes Mallows Score:", fms)  # Fixed this line

496
7771
Rand Index: 0.06384410343570252
Accuracy: 0.9415322580645161
Adjusted Rand Index: 0.06384410343570252
Fowlkes Mallows Score: 0.4247424236481274


Based on sample size

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix


def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15]
unknown_classes = [1, 7, 9, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
# Assuming the closest centroids can be our cluster prediction for test data.
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

# Create a boolean index based on the test labels
mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)

# Use the boolean index to subset the test data
X_test_subset = X_test[mask_test_subset]

# Now, since X_test_subset might contain data both from X_test_labeled and unknown_data,
# we should also subset the labels. To do this, we create a combined y_test which covers all data in X_test.
# Here, we assign a label of -1 for unknown_data, just to keep the shape consistent.
# NOTE: These -1 labels are not used for training the SVM, they are just placeholders.
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])

# Now, we can use mask_test_subset to subset this combined y_test.
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = svm_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

# Real tags include real tags for known categories and real tags for unknown categories
y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

# Calculate assessment indicators
accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    """Compute the Rand Index."""
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    """The number of combinations of n things taken 2 at a time."""
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)  # Fixed this line
print("Fowlkes Mallows Score:", fms)  # Fixed this line

2013
187
Rand Index: 0.5076934125151138
Accuracy: 0.7401887729756582
Adjusted Rand Index: 0.5076934125151138
Fowlkes Mallows Score: 0.5736815122394838


Random division 1:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix


def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [2, 4, 5, 8, 9, 10, 13, 14]
unknown_classes = [1, 3, 6, 7, 11, 12, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
# Assuming the closest centroids can be our cluster prediction for test data.
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

# Create a boolean index based on the test labels
mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)

# Use the boolean index to subset the test data
X_test_subset = X_test[mask_test_subset]

# Now, since X_test_subset might contain data both from X_test_labeled and unknown_data,
# we should also subset the labels. To do this, we create a combined y_test which covers all data in X_test.
# Here, we assign a label of -1 for unknown_data, just to keep the shape consistent.
# NOTE: These -1 labels are not used for training the SVM, they are just placeholders.
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])

# Now, we can use mask_test_subset to subset this combined y_test.
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the SVM using combined data and labels
svm_classifier = SVC()
svm_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = svm_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

# Real tags include real tags for known categories and real tags for unknown categories
y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

# Calculate assessment indicators
accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    """Compute the Rand Index."""
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    """The number of combinations of n things taken 2 at a time."""
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)  # Fixed this line
print("Fowlkes Mallows Score:", fms)  # Fixed this line

1018
5161
Rand Index: 0.14035825564596224
Accuracy: 0.6679764243614931
Adjusted Rand Index: 0.14035825564596227
Fowlkes Mallows Score: 0.4494561290738025


# SemiSup Kmeans + Neaural Network

Random division 1:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from sklearn.neural_network import MLPClassifier  # Importing the MLPClassifier
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [2, 4, 5, 8, 9, 10, 13, 14]
unknown_classes = [1, 3, 6, 7, 11, 12, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)
X_test_subset = X_test[mask_test_subset]
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the Neural Network (MLP) using combined data and labels
nn_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=1)
nn_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = nn_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)
print("Fowlkes Mallows Score:", fms)


1018
5161
Rand Index: 0.1360530151535562
Accuracy: 0.7819253438113949
Adjusted Rand Index: 0.13605301515355625
Fowlkes Mallows Score: 0.44076660276571206


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Based on sample size:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from sklearn.neural_network import MLPClassifier  # Importing the MLPClassifier
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15]
unknown_classes = [1, 7, 9, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)
X_test_subset = X_test[mask_test_subset]
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the Neural Network (MLP) using combined data and labels
nn_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=1)
nn_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = nn_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)
print("Fowlkes Mallows Score:", fms)

2013
187
Rand Index: 0.5644583827585434
Accuracy: 0.7754595131644312
Adjusted Rand Index: 0.5644583827585434
Fowlkes Mallows Score: 0.6148338242155408


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Based on feature type:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from sklearn.neural_network import MLPClassifier  # Importing the MLPClassifier
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [5, 6, 14]
unknown_classes = [1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)
X_test_subset = X_test[mask_test_subset]
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the Neural Network (MLP) using combined data and labels
nn_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=1)
nn_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = nn_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)
print("Fowlkes Mallows Score:", fms)

496
7771
Rand Index: 0.05473057147905564
Accuracy: 0.9536290322580645
Adjusted Rand Index: 0.05473057147905563
Fowlkes Mallows Score: 0.4206174446049294


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Random division 2:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from sklearn.neural_network import MLPClassifier  # Importing the MLPClassifier
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [1, 2, 5, 6, 8, 11, 13, 15]
unknown_classes = [3, 4, 7, 9, 10, 12, 14, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)
X_test_subset = X_test[mask_test_subset]
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the Neural Network (MLP) using combined data and labels
nn_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=1)
nn_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = nn_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)
print("Fowlkes Mallows Score:", fms)

1243
4038
Rand Index: 0.14043219864718193
Accuracy: 0.7787610619469026
Adjusted Rand Index: 0.1404321986471819
Fowlkes Mallows Score: 0.34768231184960885


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Crop-based divisions:

In [None]:
import numpy as np
import scipy.io
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import torch
from sklearn.neural_network import MLPClassifier  # Importing the MLPClassifier
from gcd.methods.clustering.faster_mix_k_means_pytorch import K_Means as SemiSupKMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score, fowlkes_mallows_score
from itertools import combinations
from sklearn.metrics import confusion_matrix

def load_data(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_corrected']

def load_gt(file_path):
    data = scipy.io.loadmat(file_path)
    return data['indian_pines_gt']

data = load_data('/content/Indian_pines_corrected.mat')
gt_data = load_gt('/content/Indian_pines_gt.mat')

# Normalize data
min_value = np.min(data)
max_value = np.max(data)
normalized_data = (data - min_value) / (max_value - min_value)

# Data Processing
known_classes = [1, 2, 3, 4, 9, 10, 11, 12, 13]
unknown_classes = [5, 6, 7, 8, 14, 15, 16]
all_classes = known_classes + unknown_classes
large_N = len(all_classes)

flattened_data = normalized_data.reshape((-1, normalized_data.shape[-1]))
flattened_gt = gt_data.flatten()

known_data = flattened_data[np.isin(flattened_gt, known_classes)]
known_labels = flattened_gt[np.isin(flattened_gt, known_classes)]
unknown_data = flattened_data[np.isin(flattened_gt, unknown_classes)]

X_train_origin, X_test_labeled, y_train, y_test_labeled = train_test_split(known_data, known_labels, test_size=0.2, random_state=42)
X_test_origin = np.concatenate((X_test_labeled, unknown_data), axis=0)

# PCA for dimensionality reduction
pca = PCA(n_components=30)
pca.fit(known_data)

X_train = pca.transform(X_train_origin)
X_test = pca.transform(X_test_origin)

# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = torch.from_numpy(X_train).to(device)
y_train_true_tensor = torch.tensor(y_train).to(device)

# Create and fit the semi-supervised K-Means model
semi_sup_kmeans = SemiSupKMeans(k=large_N, init='k-means++', n_init=10, random_state=42, n_jobs=None, pairwise_batch_size=10)
semi_sup_kmeans.fit(X_train_tensor)

# Get cluster labels
all_cluster_labels_train = semi_sup_kmeans.labels_.cpu().numpy()

X_test_tensor = torch.from_numpy(X_test).to(device)
with torch.no_grad():
    dist = ((X_test_tensor.unsqueeze(1) - semi_sup_kmeans.cluster_centers_)**2).sum(-1)
    all_cluster_labels_test = dist.argmin(1).cpu().numpy()

# The rest of method2 using the cluster IDs
col_ind = [0, 1, 2]
rest_clusters = list(range(large_N))
for i in col_ind:
    rest_clusters.remove(i)

mask_test_subset = np.isin(all_cluster_labels_test, rest_clusters)
X_test_subset = X_test[mask_test_subset]
y_test_combined = np.concatenate([y_test_labeled, [-1 for _ in range(len(unknown_data))]])
y_test_subset = y_test_combined[mask_test_subset]

all_data_combined = np.vstack([X_train, X_test_subset])
all_data_labels_combined = np.concatenate([y_train, y_test_subset])

# Train the Neural Network (MLP) using combined data and labels
nn_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=1)
nn_classifier.fit(all_data_combined, all_data_labels_combined)

y_test_pred2 = nn_classifier.predict(X_test)
y_test_labeled_Predicted2 = y_test_pred2[:len(y_test_labeled)]
unknown_labels_Predicted2 = y_test_pred2[len(y_test_labeled):]

print(len(y_test_labeled_Predicted2))
print(len(unknown_labels_Predicted2))

y_test_all = np.concatenate([y_test_labeled, flattened_gt[np.isin(flattened_gt, unknown_classes)]])

accuracy = accuracy_score(y_test_labeled, y_test_labeled_Predicted2)
ari = adjusted_rand_score(y_test_all, y_test_pred2)
fms = fowlkes_mallows_score(y_test_all, y_test_pred2)

def rand_index(labels_true, labels_pred):
    contingency = confusion_matrix(labels_true, labels_pred)
    n = contingency.sum()
    sum_comb = sum(sum(comb2(i) for i in row) for row in contingency)
    sum_comb_rows = sum(comb2(r.sum()) for r in contingency)
    sum_comb_cols = sum(comb2(c.sum()) for c in contingency.T)
    expected_index = sum_comb_rows * sum_comb_cols / comb2(n)
    max_index = 0.5 * (sum_comb_rows + sum_comb_cols)
    return (sum_comb - expected_index) / (max_index - expected_index)

def comb2(n):
    return comb(n, 2)

ri = rand_index(y_test_all, y_test_pred2)
print("Rand Index:", ri)
print("Accuracy:", accuracy)
print("Adjusted Rand Index:", ari)
print("Fowlkes Mallows Score:", fms)

1358
3463
Rand Index: 0.2154149163796027
Accuracy: 0.7356406480117821
Adjusted Rand Index: 0.2154149163796027
Fowlkes Mallows Score: 0.4752907084015942


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
