In [1]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import SpectralEmbedding

In [2]:
def load_fashion_mnist_from_csv(train_csv_path, test_csv_path):
    # Load training data
    train_df = pd.read_csv(train_csv_path)
    train_labels = train_df['label'].values
    train_pixels = train_df.drop('label', axis=1).values.reshape(-1, 784)

    # Load test data
    test_df = pd.read_csv(test_csv_path)
    test_labels = test_df['label'].values
    test_pixels = test_df.drop('label', axis=1).values.reshape(-1, 784)

    return (train_pixels, train_labels), (test_pixels, test_labels)

In [3]:
(train, train_label), (test, test_label) = load_fashion_mnist_from_csv('fashion-mnist_train.csv', 'fashion-mnist_test.csv')

In [4]:
classifiers = {
    'LDA': LinearDiscriminantAnalysis(),
    'KNN (K=1)': KNeighborsClassifier(n_neighbors=1),
    'KNN (K=3)': KNeighborsClassifier(n_neighbors=3),
    'KNN (K=5)': KNeighborsClassifier(n_neighbors=5),
    #'SVM Linear': SVC(kernel='linear'),
    #'SVM RBF': SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(),
    'Multinomial Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000) 
}

In [5]:
classifiers_100 = {
    'LDA': LinearDiscriminantAnalysis(),
    'KNN (K=1)': KNeighborsClassifier(n_neighbors=1),
    'KNN (K=3)': KNeighborsClassifier(n_neighbors=3),
    'KNN (K=5)': KNeighborsClassifier(n_neighbors=5),
    'SVM Linear': SVC(kernel='linear'),
    'SVM RBF': SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(),
    'Multinomial Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000) 
}

# No Embedding

In [6]:
num_samples = 1000
idx = np.random.choice(train.shape[0], num_samples, replace=False)
subset_train = train[idx]
subset_train_labels = train_label[idx]

In [7]:
subset_train_reshaped = subset_train.reshape(-1, 784)

In [10]:
#scaler = StandardScaler()
#train_scaled = scaler.fit_transform(subset_train)
X_train, X_test, y_train, y_test = train_test_split(subset_train, subset_train_labels, test_size=0.3, random_state=42)

In [13]:
subset_train.shape

(1000, 784)

In [11]:
X_train.shape

(700, 784)

In [9]:
no_embedding_accuracies = {}

for name, clf in classifiers_100.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    no_embedding_accuracies[name] = accuracy
    print(f"{name} - No Embedding: Accuracy = {accuracy:.2f}")

LDA - No Embedding: Accuracy = 0.40
KNN (K=1) - No Embedding: Accuracy = 0.76
KNN (K=3) - No Embedding: Accuracy = 0.72
KNN (K=5) - No Embedding: Accuracy = 0.68
SVM Linear - No Embedding: Accuracy = 0.82
SVM RBF - No Embedding: Accuracy = 0.74
Random Forest - No Embedding: Accuracy = 0.76
Multinomial Logistic Regression - No Embedding: Accuracy = 0.80


In [10]:
no_embedding_mean_std = {
    name: f"{np.mean(accs):.2f} ± {np.std(accs):.2f}"
    for name, accs in no_embedding_accuracies.items()}

In [11]:
no_embedding_df = pd.DataFrame([no_embedding_mean_std], index=['No Embedding'])
no_embedding_df

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.40 ± 0.00,0.76 ± 0.00,0.72 ± 0.00,0.68 ± 0.00,0.82 ± 0.00,0.74 ± 0.00,0.76 ± 0.00,0.80 ± 0.00


# MDS 

In [12]:
metrics = [True]
eps_values = [0.001, 0.01, 0.1]
embeddings = {}
for metric_value in metrics:
    for eps in eps_values:
        embedding = MDS(n_components=10, metric=metric_value, eps=eps)
        train_transformed = embedding.fit_transform(subset_train_reshaped)
        embeddings[(metric_value, eps)] = train_transformed



In [13]:
mds_accuracy_results = {name: [] for name in classifiers.keys()}

for (metric_value, eps), train_transformed in embeddings.items():
    X_train, X_test, y_train, y_test = train_test_split(train_transformed, subset_train_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mds_accuracy_results[name].append(accuracy)
        #print(f"{name} - MDS (metric={metric_value}, eps={eps}): Accuracy = {accuracy*100:.2f}%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [14]:
mds_mean_std_accuracy_results = {}
for name, acc_list in mds_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    mds_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

In [15]:
mds_df_10 = pd.DataFrame([mds_mean_std_accuracy_results], index=['MDS'])
mds_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
MDS,0.71 ± 0.01,0.69 ± 0.01,0.71 ± 0.00,0.71 ± 0.01,0.69 ± 0.01,0.72 ± 0.01


In [16]:
metrics = [True]
eps_values = [0.001, 0.01, 0.1]
embeddings = {}
for metric_value in metrics:
    for eps in eps_values:
        embedding = MDS(n_components=100, metric=metric_value, eps=eps)
        train_transformed = embedding.fit_transform(subset_train_reshaped)
        embeddings[(metric_value, eps)] = train_transformed

mds_accuracy_results = {name: [] for name in classifiers_100.keys()}

for (metric_value, eps), train_transformed in embeddings.items():
    X_train, X_test, y_train, y_test = train_test_split(train_transformed, subset_train_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers_100.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mds_accuracy_results[name].append(accuracy)

mds_mean_std_accuracy_results = {}
for name, acc_list in mds_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    mds_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

mds_df_100 = pd.DataFrame([mds_mean_std_accuracy_results], index=['MDS'])
mds_df_100

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
MDS,0.72 ± 0.01,0.69 ± 0.01,0.72 ± 0.02,0.70 ± 0.01,0.73 ± 0.01,0.73 ± 0.01,0.71 ± 0.01,0.68 ± 0.01


# Iso Map 

In [17]:
#subset_train = train[:500]
#subset_train_labels = train_label[:500]
n_neighbors_values = [20, 30, 40, 50]
n_components_values = [10]

transformed_isomap_data_list = []
for n_neighbors_val in n_neighbors_values:
    for i, n_components_val in enumerate(n_components_values):
        isomap = Isomap(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_isomap_data = isomap.fit_transform(subset_train)
        transformed_isomap_data_list.append((n_neighbors_val, n_components_val, transformed_isomap_data))

In [18]:
isomap_accuracy_results = {name: [] for name in classifiers.keys()}
for n_neighbors_val, n_components_val, transformed_data in transformed_isomap_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    current_accuracies = {}
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        current_accuracies[name] = accuracy
        #print(f"{name} with Isomap (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")

   
    for name in isomap_accuracy_results:
        isomap_accuracy_results[name].append(current_accuracies[name])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [19]:
isomap_mean_std_accuracy_results = {}
for name, acc_list in isomap_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    isomap_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

isomap_df_10 = pd.DataFrame([isomap_mean_std_accuracy_results], index=['Isomap'])
isomap_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
Isomap,0.68 ± 0.01,0.69 ± 0.01,0.70 ± 0.02,0.71 ± 0.02,0.71 ± 0.01,0.71 ± 0.02


In [20]:
n_neighbors_values = [20, 30, 40, 50]
n_components_values = [100]

transformed_isomap_data_list = []
for n_neighbors_val in n_neighbors_values:
    for i, n_components_val in enumerate(n_components_values):
        isomap = Isomap(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_isomap_data = isomap.fit_transform(subset_train)
        transformed_isomap_data_list.append((n_neighbors_val, n_components_val, transformed_isomap_data))

isomap_accuracy_results = {name: [] for name in classifiers_100.keys()}
for n_neighbors_val, n_components_val, transformed_data in transformed_isomap_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    current_accuracies = {}
    
    for name, clf in classifiers_100.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        current_accuracies[name] = accuracy
        #print(f"{name} with Isomap (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")

   
    for name in isomap_accuracy_results:
        isomap_accuracy_results[name].append(current_accuracies[name])

isomap_mean_std_accuracy_results = {}
for name, acc_list in isomap_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    isomap_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

isomap_df_100 = pd.DataFrame([isomap_mean_std_accuracy_results], index=['Isomap'])
isomap_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Isomap,0.73 ± 0.02,0.70 ± 0.01,0.69 ± 0.01,0.70 ± 0.02,0.74 ± 0.01,0.73 ± 0.00,0.72 ± 0.01,0.68 ± 0.02


# T-SNE

In [21]:
n_perplexity_values = [1, 3, 5, 7]
n_components_values = [2, 3]

transformed_tsne_data_list = []
for perplexity_val in n_perplexity_values:
    for i, n_components_val in enumerate(n_components_values):
        tsne = TSNE(n_components=n_components_val, perplexity=perplexity_val, random_state=42)
        transformed_tsne_data = tsne.fit_transform(subset_train)
        transformed_tsne_data_list.append((perplexity_val, n_components_val, transformed_tsne_data))
        

In [22]:
tsne_accuracy_results = {name: [] for name in classifiers_100.keys()}

for perplexity_val, n_components_val, transformed_data in transformed_tsne_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    current_accuracies = {}
    
    for name, clf in classifiers_100.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        current_accuracies[name] = accuracy

        #print(f"{name} with t-SNE (perplexity={perplexity_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")

    for name in tsne_accuracy_results:
        tsne_accuracy_results[name].append(current_accuracies[name])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
tsne_mean_std_accuracy_results = {}
for name, acc_list in tsne_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    tsne_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

tsne_df = pd.DataFrame([tsne_mean_std_accuracy_results], index=['t-SNE'])
tsne_df

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
t-SNE,0.58 ± 0.07,0.71 ± 0.03,0.72 ± 0.04,0.72 ± 0.03,0.60 ± 0.09,0.60 ± 0.08,0.73 ± 0.03,0.59 ± 0.08


# LLE 

In [24]:
n_neighbors_values = [ 2, 3, 5, 7]
n_components_values = [10]

transformed_lle_data_list = []
for n_neighbors_val in n_neighbors_values:
    for i, n_components_val in enumerate(n_components_values):
        lle = LocallyLinearEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_lle_data = lle.fit_transform(subset_train)
        transformed_lle_data_list.append((n_neighbors_val, n_components_val, transformed_lle_data))     

In [25]:
lle_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_lle_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    current_accuracies = {}

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        current_accuracies[name] = accuracy
        #print(f"{name} with LLE (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")

    for name in lle_accuracy_results:
        lle_accuracy_results[name].append(current_accuracies[name])

In [26]:
lle_mean_std_accuracy_results = {}
for name, acc_list in lle_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    lle_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

lle_df_10 = pd.DataFrame([lle_mean_std_accuracy_results], index=['LLE'])
lle_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
LLE,0.58 ± 0.08,0.67 ± 0.01,0.70 ± 0.02,0.69 ± 0.02,0.69 ± 0.03,0.51 ± 0.09


In [27]:
n_neighbors_values = [ 2, 3, 5, 7]
n_components_values = [100]

transformed_lle_data_list = []
for n_neighbors_val in n_neighbors_values:
    for i, n_components_val in enumerate(n_components_values):
        lle = LocallyLinearEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_lle_data = lle.fit_transform(subset_train)
        transformed_lle_data_list.append((n_neighbors_val, n_components_val, transformed_lle_data))     

lle_accuracy_results = {name: [] for name in classifiers_100.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_lle_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    current_accuracies = {}

    for name, clf in classifiers_100.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        current_accuracies[name] = accuracy
        #print(f"{name} with LLE (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")

    for name in lle_accuracy_results:
        lle_accuracy_results[name].append(current_accuracies[name])

lle_mean_std_accuracy_results = {}
for name, acc_list in lle_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    lle_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

lle_df_100 = pd.DataFrame([lle_mean_std_accuracy_results], index=['LLE'])
lle_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
LLE,0.75 ± 0.03,0.70 ± 0.03,0.72 ± 0.02,0.73 ± 0.02,0.51 ± 0.03,0.74 ± 0.02,0.74 ± 0.01,0.66 ± 0.02


# Spectral Embedding

In [28]:
n_neighbors_values = [ 3, 5, 7,10]
n_components_values = [10]

transformed_spectral_data_list = []
for n_neighbors_val in n_neighbors_values:
    for i, n_components_val in enumerate(n_components_values):
        spectral = SpectralEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_spectral_data = spectral.fit_transform(subset_train)
        transformed_spectral_data_list.append((n_neighbors_val, n_components_val, transformed_spectral_data))



In [29]:
spectral_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_spectral_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    current_accuracies = {}

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        current_accuracies[name] = accuracy
        #print(f"{name} with Spectral Embedding (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")

    for name in spectral_accuracy_results:
        spectral_accuracy_results[name].append(current_accuracies[name])

In [30]:
spectral_mean_std_accuracy_results = {}
for name, acc_list in spectral_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    spectral_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

In [31]:
spectral_df_10 = pd.DataFrame([spectral_mean_std_accuracy_results], index=['Spectral Embedding'])
spectral_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
Spectral Embedding,0.64 ± 0.01,0.65 ± 0.03,0.68 ± 0.03,0.69 ± 0.02,0.69 ± 0.02,0.31 ± 0.12


In [32]:
n_neighbors_values = [ 3, 5, 7,10]
n_components_values = [100]

transformed_spectral_data_list = []
for n_neighbors_val in n_neighbors_values:
    for i, n_components_val in enumerate(n_components_values):
        spectral = SpectralEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_spectral_data = spectral.fit_transform(subset_train)
        transformed_spectral_data_list.append((n_neighbors_val, n_components_val, transformed_spectral_data))

spectral_accuracy_results = {name: [] for name in classifiers_100.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_spectral_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, subset_train_labels, test_size=0.3, random_state=42)
    current_accuracies = {}

    for name, clf in classifiers_100.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        current_accuracies[name] = accuracy
        #print(f"{name} with Spectral Embedding (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")

    for name in spectral_accuracy_results:
        spectral_accuracy_results[name].append(current_accuracies[name])

spectral_mean_std_accuracy_results = {}
for name, acc_list in spectral_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    spectral_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

spectral_df_100 = pd.DataFrame([spectral_mean_std_accuracy_results], index=['Spectral Embedding'])
spectral_df_100



Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Spectral Embedding,0.73 ± 0.02,0.69 ± 0.01,0.72 ± 0.02,0.72 ± 0.01,0.12 ± 0.00,0.72 ± 0.01,0.73 ± 0.01,0.34 ± 0.14


In [33]:
combined_df_10 = pd.concat([no_embedding_df, mds_df_10, isomap_df_10, tsne_df, lle_df_10, spectral_df_10])
combined_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.40 ± 0.00,0.76 ± 0.00,0.72 ± 0.00,0.68 ± 0.00,0.82 ± 0.00,0.74 ± 0.00,0.76 ± 0.00,0.80 ± 0.00
MDS,0.71 ± 0.01,0.69 ± 0.01,0.71 ± 0.00,0.71 ± 0.01,,,0.69 ± 0.01,0.72 ± 0.01
Isomap,0.68 ± 0.01,0.69 ± 0.01,0.70 ± 0.02,0.71 ± 0.02,,,0.71 ± 0.01,0.71 ± 0.02
t-SNE,0.58 ± 0.07,0.71 ± 0.03,0.72 ± 0.04,0.72 ± 0.03,0.60 ± 0.09,0.60 ± 0.08,0.73 ± 0.03,0.59 ± 0.08
LLE,0.58 ± 0.08,0.67 ± 0.01,0.70 ± 0.02,0.69 ± 0.02,,,0.69 ± 0.03,0.51 ± 0.09
Spectral Embedding,0.64 ± 0.01,0.65 ± 0.03,0.68 ± 0.03,0.69 ± 0.02,,,0.69 ± 0.02,0.31 ± 0.12


In [35]:
combined_df_100 = pd.concat([no_embedding_df, mds_df_100, isomap_df_100, tsne_df, lle_df_100, spectral_df_100])
combined_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.40 ± 0.00,0.76 ± 0.00,0.72 ± 0.00,0.68 ± 0.00,0.82 ± 0.00,0.74 ± 0.00,0.76 ± 0.00,0.80 ± 0.00
MDS,0.72 ± 0.01,0.69 ± 0.01,0.72 ± 0.02,0.70 ± 0.01,0.73 ± 0.01,0.73 ± 0.01,0.71 ± 0.01,0.68 ± 0.01
Isomap,0.73 ± 0.02,0.70 ± 0.01,0.69 ± 0.01,0.70 ± 0.02,0.74 ± 0.01,0.73 ± 0.00,0.72 ± 0.01,0.68 ± 0.02
t-SNE,0.58 ± 0.07,0.71 ± 0.03,0.72 ± 0.04,0.72 ± 0.03,0.60 ± 0.09,0.60 ± 0.08,0.73 ± 0.03,0.59 ± 0.08
LLE,0.75 ± 0.03,0.70 ± 0.03,0.72 ± 0.02,0.73 ± 0.02,0.51 ± 0.03,0.74 ± 0.02,0.74 ± 0.01,0.66 ± 0.02
Spectral Embedding,0.73 ± 0.02,0.69 ± 0.01,0.72 ± 0.02,0.72 ± 0.01,0.12 ± 0.00,0.72 ± 0.01,0.73 ± 0.01,0.34 ± 0.14


In [34]:
combined_df_10 = combined_df_10.drop(['SVM Linear', 'SVM RBF'], axis=1)
combined_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
No Embedding,0.40 ± 0.00,0.76 ± 0.00,0.72 ± 0.00,0.68 ± 0.00,0.76 ± 0.00,0.80 ± 0.00
MDS,0.71 ± 0.01,0.69 ± 0.01,0.71 ± 0.00,0.71 ± 0.01,0.69 ± 0.01,0.72 ± 0.01
Isomap,0.68 ± 0.01,0.69 ± 0.01,0.70 ± 0.02,0.71 ± 0.02,0.71 ± 0.01,0.71 ± 0.02
t-SNE,0.58 ± 0.07,0.71 ± 0.03,0.72 ± 0.04,0.72 ± 0.03,0.73 ± 0.03,0.59 ± 0.08
LLE,0.58 ± 0.08,0.67 ± 0.01,0.70 ± 0.02,0.69 ± 0.02,0.69 ± 0.03,0.51 ± 0.09
Spectral Embedding,0.64 ± 0.01,0.65 ± 0.03,0.68 ± 0.03,0.69 ± 0.02,0.69 ± 0.02,0.31 ± 0.12


In [36]:
from scipy.stats import ttest_ind
techniques_10 = combined_df_10.index.tolist()
techniques_100 = combined_df_100.index.tolist()

In [44]:
def perform_one_sided_ttest(no_embedding_means, technique_means):
    t_stat, p_value = ttest_ind( technique_means,no_embedding_means, equal_var=False, alternative = 'less')
    #if t_stat > 0:
        #p_value /= 2
    #else:
        #p_value = 1 - (p_value / 2)
    return p_value

ttest_results_10 = {}
for technique in techniques_10:
    if technique == 'No Embedding':
        continue
    
    # Extract the mean accuracies as floats
    no_embedding_means = combined_df_10.loc['No Embedding'].str.split(' ± ').str[0].astype(float)
    technique_means = combined_df_10.loc[technique].str.split(' ± ').str[0].astype(float)
    
    # Perform the one-sided t-test
    p_value = perform_one_sided_ttest(no_embedding_means, technique_means)
    ttest_results_10[technique] = p_value

ttest_results_100 = {}
for technique in techniques_100:
    if technique == 'No Embedding':
        continue
    
    # Extract the mean accuracies as floats
    no_embedding_means = combined_df_100.loc['No Embedding'].str.split(' ± ').str[0].astype(float)
    technique_means = combined_df_100.loc[technique].str.split(' ± ').str[0].astype(float)
    
    # Perform the one-sided t-test
    p_value = perform_one_sided_ttest(no_embedding_means, technique_means)
    ttest_results_100[technique] = p_value

In [45]:
print("One-Sided T-Test Results for 10 Components:")
for technique, p_value in ttest_results_10.items():
    print(f"{technique}: p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("  Significant difference at p < 0.05, reject H0")
    else:
        print("  No significant difference, fail to reject H0")

print("\nOne-Sided T-Test Results for 100 Components:")
for technique, p_value in ttest_results_100.items():
    print(f"{technique}: p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("  Significant difference at p < 0.05, reject H0")
    else:
        print("  No significant difference, fail to reject H0")


One-Sided T-Test Results for 10 Components:
MDS: p-value = 0.6141
  No significant difference, fail to reject H0
Isomap: p-value = 0.5837
  No significant difference, fail to reject H0
t-SNE: p-value = 0.4325
  No significant difference, fail to reject H0
LLE: p-value = 0.2551
  No significant difference, fail to reject H0
Spectral Embedding: p-value = 0.1943
  No significant difference, fail to reject H0

One-Sided T-Test Results for 100 Components:
MDS: p-value = 0.5000
  No significant difference, fail to reject H0
Isomap: p-value = 0.5101
  No significant difference, fail to reject H0
t-SNE: p-value = 0.1658
  No significant difference, fail to reject H0
LLE: p-value = 0.3859
  No significant difference, fail to reject H0
Spectral Embedding: p-value = 0.1282
  No significant difference, fail to reject H0
