In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import cv2
import os
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import SpectralEmbedding

In [2]:
def load_images_from_folder(folder):
    image_list = []
    labels_list = []
    for filename in os.listdir(folder):
        path = os.path.join(folder, filename)
        if os.path.isdir(path):
            for img_name in os.listdir(path):
                img_path = os.path.join(path, img_name)
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img is not None:
                    image_list.append(img)
                    labels_list.append(filename) 
    return image_list, labels_list

# Loading images
grouped_dir = "/Users/thienphuong/Desktop/DATA-4381-Capstone-Project/coil-100/processed/grouped"
images, labels = load_images_from_folder(grouped_dir)

In [3]:
classifiers = {
    'LDA': LinearDiscriminantAnalysis(),
    'KNN (K=1)': KNeighborsClassifier(n_neighbors=1),
    'KNN (K=3)': KNeighborsClassifier(n_neighbors=3),
    'KNN (K=5)': KNeighborsClassifier(n_neighbors=5),
    'SVM Linear': SVC(kernel='linear'),
    'SVM RBF': SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(),
    'Multinomial Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000) 
}

In [4]:
sample_size = 1000
indices = np.random.choice(len(images), sample_size, replace=False)
sampled_images = [images[i] for i in indices]
sampled_labels = [labels[i] for i in indices]
subset_train_reshaped = np.array(sampled_images).reshape(len(sampled_images), -1)
subset_train_labels = np.array(sampled_labels)

In [5]:
no_embedding = pd.read_csv('/Users/thienphuong/Desktop/DATA-4381-Capstone-Project/coil-100/no_emb_coil100.csv')
no_embedding

Unnamed: 0.1,Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
0,No Embedding,0.75 ± 0.00,0.81 ± 0.00,0.69 ± 0.00,0.58 ± 0.00,0.84 ± 0.00,0.58 ± 0.00,0.80 ± 0.00,0.77 ± 0.00


In [6]:
no_embedding = pd.read_csv('/Users/thienphuong/Desktop/DATA-4381-Capstone-Project/coil-100/no_emb_coil100.csv')
if 'Unnamed: 0' in no_embedding.columns:
    no_embedding.set_index('Unnamed: 0', inplace=True)
    no_embedding.index.name = None

no_embedding

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.75 ± 0.00,0.81 ± 0.00,0.69 ± 0.00,0.58 ± 0.00,0.84 ± 0.00,0.58 ± 0.00,0.80 ± 0.00,0.77 ± 0.00


# MDS 

In [7]:
eps_values = [1,1e-3, 1e-5]
embeddings = {}
for eps in eps_values:
    embedding = MDS(n_components=10, metric=True, eps=eps, random_state=0)
    train_transformed = embedding.fit_transform(subset_train_reshaped)
    embeddings[(True, eps)] = train_transformed



In [8]:
mds_accuracy_results = {name: [] for name in classifiers.keys()}
for (metric_value, eps), train_transformed in embeddings.items():
    X_train, X_test, y_train, y_test = train_test_split(train_transformed, subset_train_labels, test_size=0.3, random_state=42)
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mds_accuracy_results[name].append((metric_value, eps, accuracy))
        #print(f"{name} - MDS (metric={'Metric' if metric_value else 'Non-metric'}, eps={eps}): Accuracy = {accuracy*100:.2f}%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [9]:
mds_mean_std_accuracy_results = {}
for name, acc_list in mds_accuracy_results.items():
    accuracies = [acc[-1] for acc in acc_list]  # Extract only the accuracy values
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    mds_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

In [10]:
mds_df_10 = pd.DataFrame([mds_mean_std_accuracy_results], index=['MDS'])
mds_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
MDS,0.64 ± 0.00,0.69 ± 0.00,0.62 ± 0.00,0.58 ± 0.00,0.72 ± 0.00,0.64 ± 0.00,0.63 ± 0.00,0.57 ± 0.00


In [11]:
### 100 components 

In [12]:
eps_values = [1,1e-3, 1e-5]
embeddings = {}
for eps in eps_values:
    embedding = MDS(n_components=100, metric=True, eps=eps, random_state=0)
    train_transformed = embedding.fit_transform(subset_train_reshaped)
    embeddings[(True, eps)] = train_transformed



In [15]:
mds_accuracy_results = {name: [] for name in classifiers.keys()}
for (metric_value, eps), train_transformed in embeddings.items():
    X_train, X_test, y_train, y_test = train_test_split(train_transformed, subset_train_labels, test_size=0.3, random_state=42)
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mds_accuracy_results[name].append((metric_value, eps, accuracy))
        #print(f"{name} - MDS (metric={'Metric' if metric_value else 'Non-metric'}, eps={eps}): Accuracy = {accuracy*100:.2f}%")

mds_mean_std_accuracy_results_100 = {}
for name, acc_list in mds_accuracy_results.items():
    accuracies = [acc[-1] for acc in acc_list]  # Extract only the accuracy values
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    mds_mean_std_accuracy_results_100[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

In [16]:
mds_df_100 = pd.DataFrame([mds_mean_std_accuracy_results_100], index=['MDS'])
mds_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
MDS,0.66 ± 0.00,0.68 ± 0.00,0.58 ± 0.00,0.54 ± 0.00,0.76 ± 0.00,0.62 ± 0.00,0.62 ± 0.01,0.67 ± 0.00


# Iso Map 

In [17]:
n_neighbors_values = [10, 20, 30, 40]
n_components_values = [10]

transformed_isomap_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        isomap = Isomap(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_data = isomap.fit_transform(subset_train_reshaped)
        transformed_isomap_data_list.append((n_neighbors_val, n_components_val, transformed_data))

In [18]:
isomap_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_isomap_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        isomap_accuracy_results[name].append((n_neighbors_val, n_components_val, accuracy))
        #print(f"{name} with Isomap (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [19]:
isomap_mean_std_accuracy_results = {}
for name, results in isomap_accuracy_results.items():
    accuracies = [acc[2] for acc in results]  
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    isomap_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"
isomap_df_10 = pd.DataFrame([isomap_mean_std_accuracy_results], index=['Isomap'])
isomap_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Isomap,0.54 ± 0.03,0.68 ± 0.03,0.61 ± 0.01,0.56 ± 0.01,0.69 ± 0.02,0.49 ± 0.02,0.67 ± 0.01,0.57 ± 0.02


In [18]:
### 100 components 

In [20]:
n_neighbors_values = [10, 20, 30, 40]
n_components_values = [100]

transformed_isomap_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        isomap = Isomap(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_data = isomap.fit_transform(subset_train_reshaped)
        transformed_isomap_data_list.append((n_neighbors_val, n_components_val, transformed_data))

In [21]:
isomap_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_isomap_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        isomap_accuracy_results[name].append((n_neighbors_val, n_components_val, accuracy))
        #print(f"{name} with Isomap (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")

In [22]:
isomap_mean_std_accuracy_results_100 = {}
for name, results in isomap_accuracy_results.items():
    accuracies = [acc[2] for acc in results]  
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    isomap_mean_std_accuracy_results_100[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"
isomap_df_100 = pd.DataFrame([isomap_mean_std_accuracy_results_100], index=['Isomap'])
isomap_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Isomap,0.72 ± 0.02,0.71 ± 0.02,0.63 ± 0.02,0.55 ± 0.01,0.75 ± 0.02,0.65 ± 0.02,0.69 ± 0.02,0.74 ± 0.03


# T-SNE 

In [23]:
perplexity_values = [5, 30, 50, 100]
n_components_values = [2, 3] 

transformed_tsne_data_list = []

for perplexity_val in perplexity_values:
    for n_components_val in n_components_values:
        tsne = TSNE(n_components=n_components_val, perplexity=perplexity_val, random_state=42)
        transformed_data = tsne.fit_transform(subset_train_reshaped)
        transformed_tsne_data_list.append((perplexity_val, n_components_val, transformed_data))

In [24]:
tsne_accuracy_results = {name: [] for name in classifiers.keys()}
for perplexity_val, n_components_val, transformed_data in transformed_tsne_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # Store accuracy along with parameters
        tsne_accuracy_results[name].append((perplexity_val, n_components_val, accuracy))
        #print(f"{name} with t-SNE (perplexity={perplexity_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [25]:
tsne_mean_std_accuracy_results = {}

for name, results in tsne_accuracy_results.items():
    accuracies = [acc[2] for acc in results]  
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    tsne_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

tsne_df = pd.DataFrame([tsne_mean_std_accuracy_results], index=['t-SNE'])
tsne_df

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
t-SNE,0.41 ± 0.07,0.78 ± 0.01,0.70 ± 0.02,0.63 ± 0.02,0.58 ± 0.04,0.34 ± 0.08,0.73 ± 0.05,0.45 ± 0.05


# LLE 

In [27]:
n_neighbors_values = [2, 3, 5, 7, 10]
n_components_values = [10]

transformed_lle_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        lle = LocallyLinearEmbedding(
            n_neighbors=n_neighbors_val,
            n_components=n_components_val,
            #ewigen_solver='dense', 
            random_state=42
        )
        transformed_data = lle.fit_transform(subset_train_reshaped)
        transformed_lle_data_list.append((n_neighbors_val, n_components_val, transformed_data))

In [28]:
lle_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_lle_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    
    # Evaluate each classifier
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        lle_accuracy_results[name].append((n_neighbors_val, n_components_val, accuracy))
        #print(f"{name} with LLE (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")


In [29]:
lle_mean_std_accuracy_results = {}

for name, results in lle_accuracy_results.items():
    accuracies = [acc[2] for acc in results] 
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    lle_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"
lle_df_10 = pd.DataFrame([lle_mean_std_accuracy_results], index=['LLE'])
lle_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
LLE,0.43 ± 0.06,0.67 ± 0.01,0.61 ± 0.02,0.57 ± 0.02,0.01 ± 0.00,0.41 ± 0.07,0.57 ± 0.03,0.03 ± 0.01


In [30]:
n_neighbors_values = [2, 3, 5, 7, 10]
n_components_values = [10]

transformed_lle_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        lle = LocallyLinearEmbedding(
            n_neighbors=n_neighbors_val,
            n_components=n_components_val,
           #ewigen_solver='dense', 
            random_state=42
        )
        transformed_data = lle.fit_transform(subset_train_reshaped)
        transformed_lle_data_list.append((n_neighbors_val, n_components_val, transformed_data))

lle_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_lle_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    
    # Evaluate each classifier
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        lle_accuracy_results[name].append((n_neighbors_val, n_components_val, accuracy))
        #print(f"{name} with LLE (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")

lle_mean_std_accuracy_results_100 = {}

for name, results in lle_accuracy_results.items():
    accuracies = [acc[2] for acc in results] 
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    lle_mean_std_accuracy_results_100[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"
    
lle_df_100 = pd.DataFrame([lle_mean_std_accuracy_results_100], index=['LLE'])
lle_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
LLE,0.43 ± 0.06,0.67 ± 0.01,0.61 ± 0.02,0.57 ± 0.02,0.01 ± 0.00,0.41 ± 0.07,0.56 ± 0.04,0.03 ± 0.01


# Spectral Embedding

In [31]:
n_neighbors_values = [3, 5, 7, 10]
n_components_values = [10]

transformed_spectral_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        spectral = SpectralEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val, random_state=42)
        transformed_spectral_data = spectral.fit_transform(subset_train_reshaped)
        transformed_spectral_data_list.append((n_neighbors_val, n_components_val, transformed_spectral_data))




In [32]:
spectral_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_spectral_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels , test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # Store accuracy along with parameters
        spectral_accuracy_results[name].append((n_neighbors_val, n_components_val, accuracy))
        #print(f"{name} with Spectral Embedding (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")


In [33]:
spectral_mean_std_accuracy_results = {}
for name, results in spectral_accuracy_results.items():
    accuracies = [acc[2] for acc in results]  
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    spectral_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

spectral_df_10 = pd.DataFrame([spectral_mean_std_accuracy_results], index=['Spectral Embedding'])
spectral_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Spectral Embedding,0.22 ± 0.17,0.64 ± 0.03,0.60 ± 0.01,0.56 ± 0.03,0.01 ± 0.00,0.29 ± 0.02,0.64 ± 0.02,0.01 ± 0.00


In [34]:
n_neighbors_values = [3, 5, 7, 10]
n_components_values = [100]

transformed_spectral_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        spectral = SpectralEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val, random_state=42)
        transformed_spectral_data = spectral.fit_transform(subset_train_reshaped)
        transformed_spectral_data_list.append((n_neighbors_val, n_components_val, transformed_spectral_data))

spectral_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_spectral_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels , test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # Store accuracy along with parameters
        spectral_accuracy_results[name].append((n_neighbors_val, n_components_val, accuracy))
        #print(f"{name} with Spectral Embedding (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")

spectral_mean_std_accuracy_results = {}
for name, results in spectral_accuracy_results.items():
    accuracies = [acc[2] for acc in results]  
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    spectral_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

spectral_df_100 = pd.DataFrame([spectral_mean_std_accuracy_results], index=['Spectral Embedding'])
spectral_df_100





Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Spectral Embedding,0.36 ± 0.31,0.72 ± 0.01,0.67 ± 0.02,0.61 ± 0.02,0.01 ± 0.00,0.65 ± 0.03,0.71 ± 0.02,0.03 ± 0.01


In [35]:
combined_df_10 = pd.concat([no_embedding, mds_df_10, isomap_df_10, tsne_df, lle_df_10, spectral_df_10])
combined_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.75 ± 0.00,0.81 ± 0.00,0.69 ± 0.00,0.58 ± 0.00,0.84 ± 0.00,0.58 ± 0.00,0.80 ± 0.00,0.77 ± 0.00
MDS,0.64 ± 0.00,0.69 ± 0.00,0.62 ± 0.00,0.58 ± 0.00,0.72 ± 0.00,0.64 ± 0.00,0.63 ± 0.00,0.57 ± 0.00
Isomap,0.54 ± 0.03,0.68 ± 0.03,0.61 ± 0.01,0.56 ± 0.01,0.69 ± 0.02,0.49 ± 0.02,0.67 ± 0.01,0.57 ± 0.02
t-SNE,0.41 ± 0.07,0.78 ± 0.01,0.70 ± 0.02,0.63 ± 0.02,0.58 ± 0.04,0.34 ± 0.08,0.73 ± 0.05,0.45 ± 0.05
LLE,0.43 ± 0.06,0.67 ± 0.01,0.61 ± 0.02,0.57 ± 0.02,0.01 ± 0.00,0.41 ± 0.07,0.57 ± 0.03,0.03 ± 0.01
Spectral Embedding,0.22 ± 0.17,0.64 ± 0.03,0.60 ± 0.01,0.56 ± 0.03,0.01 ± 0.00,0.29 ± 0.02,0.64 ± 0.02,0.01 ± 0.00


In [36]:
combined_df_100 = pd.concat([no_embedding, mds_df_100, isomap_df_10, tsne_df, lle_df_100, spectral_df_100])
combined_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.75 ± 0.00,0.81 ± 0.00,0.69 ± 0.00,0.58 ± 0.00,0.84 ± 0.00,0.58 ± 0.00,0.80 ± 0.00,0.77 ± 0.00
MDS,0.66 ± 0.00,0.68 ± 0.00,0.58 ± 0.00,0.54 ± 0.00,0.76 ± 0.00,0.62 ± 0.00,0.62 ± 0.01,0.67 ± 0.00
Isomap,0.54 ± 0.03,0.68 ± 0.03,0.61 ± 0.01,0.56 ± 0.01,0.69 ± 0.02,0.49 ± 0.02,0.67 ± 0.01,0.57 ± 0.02
t-SNE,0.41 ± 0.07,0.78 ± 0.01,0.70 ± 0.02,0.63 ± 0.02,0.58 ± 0.04,0.34 ± 0.08,0.73 ± 0.05,0.45 ± 0.05
LLE,0.43 ± 0.06,0.67 ± 0.01,0.61 ± 0.02,0.57 ± 0.02,0.01 ± 0.00,0.41 ± 0.07,0.56 ± 0.04,0.03 ± 0.01
Spectral Embedding,0.36 ± 0.31,0.72 ± 0.01,0.67 ± 0.02,0.61 ± 0.02,0.01 ± 0.00,0.65 ± 0.03,0.71 ± 0.02,0.03 ± 0.01


In [38]:
from scipy.stats import ttest_ind
techniques_10 = combined_df_10.index.tolist()
techniques_100 = combined_df_100.index.tolist()

In [47]:
def perform_one_sided_ttest(no_embedding_means, technique_means):
    t_stat, p_value = ttest_ind(no_embedding_means, technique_means, equal_var=False,alternative ='less')
    return p_value

ttest_results_10 = {}
for technique in techniques_10:
    if technique == 'No Embedding':
        continue
    
    # Extract the mean accuracies as floats
    no_embedding_means = combined_df_10.loc['No Embedding'].str.split(' ± ').str[0].astype(float)
    technique_means = combined_df_10.loc[technique].str.split(' ± ').str[0].astype(float)
    
    # Perform the one-sided t-test
    p_value = perform_one_sided_ttest(no_embedding_means, technique_means)
    ttest_results_10[technique] = p_value

ttest_results_100 = {}
for technique in techniques_100:
    if technique == 'No Embedding':
        continue
    
    # Extract the mean accuracies as floats
    no_embedding_means = combined_df_100.loc['No Embedding'].str.split(' ± ').str[0].astype(float)
    technique_means = combined_df_100.loc[technique].str.split(' ± ').str[0].astype(float)
    
    # Perform the one-sided t-test
    p_value = perform_one_sided_ttest(no_embedding_means, technique_means)
    ttest_results_100[technique] = p_value

In [48]:
print("One-Sided T-Test Results for 10 Components:")
for technique, p_value in ttest_results_10.items():
    print(f"{technique}: p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("  Significant difference at p < 0.05, reject H0")
    else:
        print("  No significant difference, fail to reject H0")

print("\nOne-Sided T-Test Results for 100 Components:")
for technique, p_value in ttest_results_100.items():
    print(f"{technique}: p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("  Significant difference at p < 0.05, reject H0")
    else:
        print("  No significant difference, fail to reject H0")


One-Sided T-Test Results for 10 Components:
MDS: p-value = 0.9774
  No significant difference, fail to reject H0
Isomap: p-value = 0.9931
  No significant difference, fail to reject H0
t-SNE: p-value = 0.9768
  No significant difference, fail to reject H0
LLE: p-value = 0.9949
  No significant difference, fail to reject H0
Spectral Embedding: p-value = 0.9963
  No significant difference, fail to reject H0

One-Sided T-Test Results for 100 Components:
MDS: p-value = 0.9661
  No significant difference, fail to reject H0
Isomap: p-value = 0.9931
  No significant difference, fail to reject H0
t-SNE: p-value = 0.9768
  No significant difference, fail to reject H0
LLE: p-value = 0.9950
  No significant difference, fail to reject H0
Spectral Embedding: p-value = 0.9759
  No significant difference, fail to reject H0
