In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import cv2
import os
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import SpectralEmbedding

In [3]:
def convert_to_grayscale(img):
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
def resize_image(img, dim=(224, 224)):
    return cv2.resize(img, dim, interpolation=cv2.INTER_AREA)

In [4]:
input_folder = "/Users/thienphuong/Desktop/DATA-4381-Capstone-Project/dog-breeds/"
output_folder = "/Users/thienphuong/Desktop/DATA-4381-Capstone-Project/dog-breeds/processed_images"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

In [5]:
image_files = []
images_for_mds = []
image_labels = []

breed_folders = [f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f))]

# Loop through each breed input folder
for breed_folder in breed_folders:
    breed_path = os.path.join(input_folder, breed_folder)
    image_extensions = ['.jpg', '.png', '.jpeg']
    
    breed_image_files = [f for f in os.listdir(breed_path) if any(f.lower().endswith(ext) for ext in image_extensions)]
    
    for image_file in tqdm(breed_image_files):
        img_path = os.path.join(breed_path, image_file)
        img = cv2.imread(img_path)
        
        if img is None:
            print(f"Failed to load {img_path}")
            continue

        # Populate the image_files list with tuples
        image_files.append((breed_folder, image_file))
        
        # Convert to grayscale and resize
        gray_img = convert_to_grayscale(img)
        resized_gray_img = resize_image(gray_img)
        
        #print(resized_gray_img.shape)
        
        # Save the processed image
        breed_output_folder = os.path.join(output_folder, breed_folder)
        if not os.path.exists(breed_output_folder):
            os.makedirs(breed_output_folder)
        
        output_path = os.path.join(breed_output_folder, image_file)
        cv2.imwrite(output_path, resized_gray_img)
        
        # Preparing data for MDS on color images (resize color images)
        #color_resized_img = resize_image(img)
        images_for_mds.append(resized_gray_img.reshape(-1))
        image_labels.append(breed_folder)  # adding the breed label for each image
        

100%|██████████████████████████████████████████| 78/78 [00:00<00:00, 114.41it/s]
100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 102.27it/s]
100%|███████████████████████████████████████████| 76/76 [00:00<00:00, 89.83it/s]
100%|███████████████████████████████████████████| 83/83 [00:00<00:00, 99.81it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|███████████████████████████████████████████| 71/71 [00:00<00:00, 98.74it/s]
100%|███████████████████████████████████████████| 56/56 [00:00<00:00, 83.43it/s]
100%|██████████████████████████████████████████| 76/76 [00:00<00:00, 116.33it/s]
0it [00:00, ?it/s]
100%|███████████████████████████████████████████| 50/50 [00:00<00:00, 99.87it/s]


In [6]:
label_to_color = {
    'rottweiler': 'r',
    'dalmatian': 'g',
    'german-shepherd': 'b',
    'beagle': 'c',
    'husky': 'm',
    'poodle': 'y',
    'bulldog': 'k',
    'labrador-retriever': 'orange',
}

In [7]:
classifiers = {
    'LDA': LinearDiscriminantAnalysis(),
    'KNN (K=1)': KNeighborsClassifier(n_neighbors=1),
    'KNN (K=3)': KNeighborsClassifier(n_neighbors=3),
    'KNN (K=5)': KNeighborsClassifier(n_neighbors=5),
    'SVM Linear': SVC(kernel='linear'),
    'SVM RBF': SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(),
    'Multinomial Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000) 
}

In [8]:
classifiers_10 = {
    'LDA': LinearDiscriminantAnalysis(),
    'KNN (K=1)': KNeighborsClassifier(n_neighbors=1),
    'KNN (K=3)': KNeighborsClassifier(n_neighbors=3),
    'KNN (K=5)': KNeighborsClassifier(n_neighbors=5),
    #'SVM Linear': SVC(kernel='linear'),
    #'SVM RBF': SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(),
    'Multinomial Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000) 
}

In [9]:
X = np.array(images_for_mds)
y = np.array(image_labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

no_embedding_accuracies = {}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    no_embedding_accuracies[name] = [accuracy]  # Store accuracies in a list for potential multiple runs


no_embedding_accuracies_mean_std = {
    name: f"{np.mean(values):.2f} ± {np.std(values):.2f}" for name, values in no_embedding_accuracies.items()
}


for classifier, acc_stat in no_embedding_accuracies_mean_std.items():
    print(f"{classifier} - No Embedding: Accuracy = {acc_stat}")

LDA - No Embedding: Accuracy = 0.61 ± 0.00
KNN (K=1) - No Embedding: Accuracy = 0.70 ± 0.00
KNN (K=3) - No Embedding: Accuracy = 0.47 ± 0.00
KNN (K=5) - No Embedding: Accuracy = 0.33 ± 0.00
SVM Linear - No Embedding: Accuracy = 0.76 ± 0.00
SVM RBF - No Embedding: Accuracy = 0.72 ± 0.00
Random Forest - No Embedding: Accuracy = 0.73 ± 0.00
Multinomial Logistic Regression - No Embedding: Accuracy = 0.76 ± 0.00


In [10]:
no_embedding_df = pd.DataFrame([no_embedding_accuracies_mean_std], index=['No Embedding'])
no_embedding_df

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.61 ± 0.00,0.70 ± 0.00,0.47 ± 0.00,0.33 ± 0.00,0.76 ± 0.00,0.72 ± 0.00,0.73 ± 0.00,0.76 ± 0.00


# 10 components

## MDS 

In [11]:
transformed_data_list = []
eps_values = [1, 1E-1, 1E-2]
n_components = 10  

for eps_val in eps_values:
    mds = MDS(n_components=n_components, metric=True, eps=eps_val)
    transformed_data = mds.fit_transform(images_for_mds)
    for metric_val in [True, False]:
        transformed_data_list.append((metric_val, eps_val, transformed_data))



In [12]:
mds_accuracy_results = {name: [] for name in classifiers_10.keys()}
for (metric_value, eps, train_transformed) in transformed_data_list:
    X_train, X_test, y_train, y_test = train_test_split(train_transformed, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers_10.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mds_accuracy_results[name].append((metric_value, eps, accuracy))
       # print(f"{name} - MDS (metric={metric_value}, eps={eps}): Accuracy = {accuracy*100:.2f}%")

In [13]:
mds_mean_std_accuracy_results = {}
for name, acc_list in mds_accuracy_results.items():
    accuracies = [acc[-1] for acc in acc_list]  # Extract only the accuracy values
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    mds_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

In [14]:
mds_df_10 = pd.DataFrame([mds_mean_std_accuracy_results], index=['MDS'])
mds_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
MDS,0.28 ± 0.02,0.68 ± 0.01,0.47 ± 0.01,0.32 ± 0.00,0.70 ± 0.02,0.29 ± 0.01


In [15]:
transformed_data_list = []
eps_values = [1, 1E-1, 1E-2]
n_components = 100  

for eps_val in eps_values:
    mds = MDS(n_components=n_components, metric=True, eps=eps_val)
    transformed_data = mds.fit_transform(images_for_mds)
    for metric_val in [True, False]:
        transformed_data_list.append((metric_val, eps_val, transformed_data))

mds_accuracy_results = {name: [] for name in classifiers.keys()}
for (metric_value, eps, train_transformed) in transformed_data_list:
    X_train, X_test, y_train, y_test = train_test_split(train_transformed, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mds_accuracy_results[name].append((metric_value, eps, accuracy))
       # print(f"{name} - MDS (metric={metric_value}, eps={eps}): Accuracy = {accuracy*100:.2f}%")

mds_mean_std_accuracy_results = {}
for name, acc_list in mds_accuracy_results.items():
    accuracies = [acc[-1] for acc in acc_list]  # Extract only the accuracy values
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    mds_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

mds_df_100 = pd.DataFrame([mds_mean_std_accuracy_results], index=['MDS'])
mds_df_100



Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
MDS,0.51 ± 0.01,0.69 ± 0.01,0.47 ± 0.02,0.32 ± 0.01,0.69 ± 0.02,0.67 ± 0.01,0.70 ± 0.02,0.57 ± 0.02


# Iso-map 

In [16]:
transformed_isomap_data_list = []

n_neighbors_values = [10, 20, 30, 40]
n_components_values = [10]

for n_neighbors_val in n_neighbors_values:
    for i, n_components_val in enumerate(n_components_values):
        
        # Isomap
        isomap = Isomap(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_isomap_data = isomap.fit_transform(images_for_mds)
        transformed_isomap_data_list.append((n_neighbors_val, n_components_val, transformed_isomap_data))
 

In [17]:
isomap_accuracy_results = {name: [] for name in classifiers_10.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_isomap_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers_10.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        isomap_accuracy_results[name].append(accuracy)
        #print(f"{name} with Isomap (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")

In [18]:
isomap_mean_std_accuracy_results = {}
for name, acc_list in isomap_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    isomap_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

isomap_df_10 = pd.DataFrame([isomap_mean_std_accuracy_results], index=['Isomap'])
isomap_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
Isomap,0.26 ± 0.02,0.69 ± 0.03,0.45 ± 0.02,0.33 ± 0.05,0.71 ± 0.02,0.26 ± 0.05


In [19]:
transformed_isomap_data_list = []

n_neighbors_values = [10, 20, 30, 40]
n_components_values = [100]

for n_neighbors_val in n_neighbors_values:
    for i, n_components_val in enumerate(n_components_values):
        
        # Isomap
        isomap = Isomap(n_neighbors=n_neighbors_val, n_components=n_components_val)
        transformed_isomap_data = isomap.fit_transform(images_for_mds)
        transformed_isomap_data_list.append((n_neighbors_val, n_components_val, transformed_isomap_data))

isomap_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_isomap_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        isomap_accuracy_results[name].append(accuracy)
 
isomap_mean_std_accuracy_results = {}
for name, acc_list in isomap_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    isomap_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

isomap_df_100 = pd.DataFrame([isomap_mean_std_accuracy_results], index=['Isomap'])
isomap_df_100
 

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Isomap,0.54 ± 0.03,0.70 ± 0.03,0.46 ± 0.02,0.30 ± 0.02,0.69 ± 0.01,0.60 ± 0.07,0.70 ± 0.02,0.67 ± 0.01


In [20]:
X = np.array(images_for_mds)
n_perplexity_values = [10, 30, 50, 100]
n_components_values = [3]

transformed_tsne_data_list = []

for perplexity_val in n_perplexity_values:
    for n_components_val in n_components_values:
        tsne = TSNE(n_components=n_components_val, perplexity=perplexity_val, random_state=42)
        X_tsne = tsne.fit_transform(X)
        transformed_tsne_data_list.append((perplexity_val, n_components_val, X_tsne))

In [21]:
tsne_accuracy_results = {name: [] for name in classifiers.keys()}

for perplexity_val, n_components_val, transformed_data in transformed_tsne_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        tsne_accuracy_results[name].append(accuracy)
        #print(f"{name} with t-SNE (perplexity={perplexity_val}, n_components={n_components_val}) - Accuracy: {accuracy:.2f}")


In [22]:
tsne_mean_std_accuracy_results = {}
for name, acc_list in tsne_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    tsne_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

tsne_df_10 = pd.DataFrame([tsne_mean_std_accuracy_results], index=['t-SNE'])
tsne_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
t-SNE,0.20 ± 0.03,0.72 ± 0.02,0.50 ± 0.01,0.38 ± 0.01,0.19 ± 0.00,0.25 ± 0.01,0.60 ± 0.03,0.20 ± 0.03


In [23]:
n_neighbors_values = [1, 2, 4, 5, 7]
n_components_values = [10]

transformed_lle_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        lle = LocallyLinearEmbedding(n_components=n_components_val, n_neighbors=n_neighbors_val, eigen_solver='dense', random_state=None)
        X_lle = lle.fit_transform(X)
        transformed_lle_data_list.append((n_neighbors_val, n_components_val, X_lle))

In [24]:
lle_accuracy_results = {name: [] for name in classifiers_10.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_lle_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers_10.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        lle_accuracy_results[name].append(accuracy)
        #print(f"{name} with LLE (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}")


In [25]:
lle_mean_std_accuracy_results = {}
for name, acc_list in lle_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    lle_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

In [26]:
lle_df_10 = pd.DataFrame([lle_mean_std_accuracy_results], index=['LLE'])
lle_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
LLE,0.22 ± 0.03,0.65 ± 0.05,0.46 ± 0.07,0.34 ± 0.09,0.62 ± 0.07,0.16 ± 0.01


In [27]:
n_neighbors_values = [1, 2, 4, 5, 7]
n_components_values = [100]

transformed_lle_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        lle = LocallyLinearEmbedding(n_components=n_components_val, n_neighbors=n_neighbors_val, eigen_solver='dense', random_state=None)
        X_lle = lle.fit_transform(X)
        transformed_lle_data_list.append((n_neighbors_val, n_components_val, X_lle))

lle_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_lle_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        lle_accuracy_results[name].append(accuracy)
        #print(f"{name} with LLE (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}")

lle_mean_std_accuracy_results = {}
for name, acc_list in lle_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    lle_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

lle_df_100 = pd.DataFrame([lle_mean_std_accuracy_results], index=['LLE'])
lle_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
LLE,0.63 ± 0.01,0.70 ± 0.02,0.47 ± 0.03,0.33 ± 0.07,0.26 ± 0.06,0.63 ± 0.02,0.68 ± 0.02,0.45 ± 0.03


In [28]:
n_neighbors_values = [5, 10, 15, 20]  
n_components_values = [10] 
transformed_spectral_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        spectral = SpectralEmbedding(n_components=n_components_val, n_neighbors=n_neighbors_val, random_state=42)
        X_spectral = spectral.fit_transform(X)
        transformed_spectral_data_list.append((n_neighbors_val, n_components_val, X_spectral))




In [29]:
spectral_accuracy_results = {name: [] for name in classifiers_10.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_spectral_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers_10.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        spectral_accuracy_results[name].append(accuracy)
        #print(f"{name} with Spectral Embedding (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}")

In [30]:
spectral_mean_std_accuracy_results = {}
for name, acc_list in spectral_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    spectral_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

In [31]:
spectral_df_10 = pd.DataFrame([spectral_mean_std_accuracy_results], index=['Spectral Embedding'])
spectral_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),Random Forest,Multinomial Logistic Regression
Spectral Embedding,0.23 ± 0.02,0.67 ± 0.04,0.45 ± 0.01,0.33 ± 0.01,0.65 ± 0.07,0.14 ± 0.00


In [32]:
n_neighbors_values = [5, 10, 15, 20]  
n_components_values = [100] 
transformed_spectral_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        spectral = SpectralEmbedding(n_components=n_components_val, n_neighbors=n_neighbors_val, random_state=42)
        X_spectral = spectral.fit_transform(X)
        transformed_spectral_data_list.append((n_neighbors_val, n_components_val, X_spectral))

spectral_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_spectral_data_list:
    X_train, X_test, y_train, y_test = train_test_split(transformed_data, image_labels, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        spectral_accuracy_results[name].append(accuracy)
        #print(f"{name} with Spectral Embedding (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}")

spectral_mean_std_accuracy_results = {}
for name, acc_list in spectral_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    spectral_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

spectral_df_100 = pd.DataFrame([spectral_mean_std_accuracy_results], index=['Spectral Embedding'])
spectral_df_100




Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Spectral Embedding,0.61 ± 0.03,0.71 ± 0.02,0.49 ± 0.02,0.37 ± 0.03,0.14 ± 0.00,0.64 ± 0.03,0.71 ± 0.02,0.19 ± 0.08


In [33]:
combined_df_10 = pd.concat([no_embedding_df, mds_df_10, isomap_df_10, tsne_df_10, lle_df_10, spectral_df_10])
combined_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.61 ± 0.00,0.70 ± 0.00,0.47 ± 0.00,0.33 ± 0.00,0.76 ± 0.00,0.72 ± 0.00,0.73 ± 0.00,0.76 ± 0.00
MDS,0.28 ± 0.02,0.68 ± 0.01,0.47 ± 0.01,0.32 ± 0.00,,,0.70 ± 0.02,0.29 ± 0.01
Isomap,0.26 ± 0.02,0.69 ± 0.03,0.45 ± 0.02,0.33 ± 0.05,,,0.71 ± 0.02,0.26 ± 0.05
t-SNE,0.20 ± 0.03,0.72 ± 0.02,0.50 ± 0.01,0.38 ± 0.01,0.19 ± 0.00,0.25 ± 0.01,0.60 ± 0.03,0.20 ± 0.03
LLE,0.22 ± 0.03,0.65 ± 0.05,0.46 ± 0.07,0.34 ± 0.09,,,0.62 ± 0.07,0.16 ± 0.01
Spectral Embedding,0.23 ± 0.02,0.67 ± 0.04,0.45 ± 0.01,0.33 ± 0.01,,,0.65 ± 0.07,0.14 ± 0.00


In [34]:
combined_df_100 = pd.concat([no_embedding_df, mds_df_100, isomap_df_100, tsne_df_10, lle_df_100, spectral_df_100])
combined_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,0.61 ± 0.00,0.70 ± 0.00,0.47 ± 0.00,0.33 ± 0.00,0.76 ± 0.00,0.72 ± 0.00,0.73 ± 0.00,0.76 ± 0.00
MDS,0.51 ± 0.01,0.69 ± 0.01,0.47 ± 0.02,0.32 ± 0.01,0.69 ± 0.02,0.67 ± 0.01,0.70 ± 0.02,0.57 ± 0.02
Isomap,0.54 ± 0.03,0.70 ± 0.03,0.46 ± 0.02,0.30 ± 0.02,0.69 ± 0.01,0.60 ± 0.07,0.70 ± 0.02,0.67 ± 0.01
t-SNE,0.20 ± 0.03,0.72 ± 0.02,0.50 ± 0.01,0.38 ± 0.01,0.19 ± 0.00,0.25 ± 0.01,0.60 ± 0.03,0.20 ± 0.03
LLE,0.63 ± 0.01,0.70 ± 0.02,0.47 ± 0.03,0.33 ± 0.07,0.26 ± 0.06,0.63 ± 0.02,0.68 ± 0.02,0.45 ± 0.03
Spectral Embedding,0.61 ± 0.03,0.71 ± 0.02,0.49 ± 0.02,0.37 ± 0.03,0.14 ± 0.00,0.64 ± 0.03,0.71 ± 0.02,0.19 ± 0.08


In [35]:
combined_df_10 = combined_df_10.drop(['SVM Linear', 'SVM RBF'], axis=1)# Remove NaN to conduct t-test 

In [36]:
from scipy.stats import ttest_ind
techniques_10 = combined_df_10.index.tolist()
techniques_100 = combined_df_100.index.tolist()

In [37]:
def perform_one_sided_ttest(no_embedding_means, technique_means):
    t_stat, p_value = ttest_ind(no_embedding_means, technique_means, equal_var=False,alternative ='less')
   # if t_stat > 0:
        #p_value /= 2
    #else:
        #p_value = 1 - (p_value / 2)
    return p_value

ttest_results_10 = {}
for technique in techniques_10:
    if technique == 'No Embedding':
        continue
    
    # Extract the mean accuracies as floats
    no_embedding_means = combined_df_10.loc['No Embedding'].str.split(' ± ').str[0].astype(float)
    technique_means = combined_df_10.loc[technique].str.split(' ± ').str[0].astype(float)
    
    # Perform the one-sided t-test
    p_value = perform_one_sided_ttest(no_embedding_means, technique_means)
    ttest_results_10[technique] = p_value

ttest_results_100 = {}
for technique in techniques_100:
    if technique == 'No Embedding':
        continue
    
    # Extract the mean accuracies as floats
    no_embedding_means = combined_df_100.loc['No Embedding'].str.split(' ± ').str[0].astype(float)
    technique_means = combined_df_100.loc[technique].str.split(' ± ').str[0].astype(float)
    
    # Perform the one-sided t-test
    p_value = perform_one_sided_ttest(no_embedding_means, technique_means)
    ttest_results_100[technique] = p_value

In [38]:
print("One-Sided T-Test Results for 10 Components:")
for technique, p_value in ttest_results_10.items():
    print(f"{technique}: p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("  Significant difference at p < 0.05, reject H0")
    else:
        print("  No significant difference, fail to reject H0")

print("\nOne-Sided T-Test Results for 100 Components:")
for technique, p_value in ttest_results_100.items():
    print(f"{technique}: p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("  Significant difference at p < 0.05, reject H0")
    else:
        print("  No significant difference, fail to reject H0")

One-Sided T-Test Results for 10 Components:
MDS: p-value = 0.8991
  No significant difference, fail to reject H0
Isomap: p-value = 0.9007
  No significant difference, fail to reject H0
t-SNE: p-value = 0.9174
  No significant difference, fail to reject H0
LLE: p-value = 0.9463
  No significant difference, fail to reject H0
Spectral Embedding: p-value = 0.9362
  No significant difference, fail to reject H0

One-Sided T-Test Results for 100 Components:
MDS: p-value = 0.7758
  No significant difference, fail to reject H0
Isomap: p-value = 0.7518
  No significant difference, fail to reject H0
t-SNE: p-value = 0.9923
  No significant difference, fail to reject H0
LLE: p-value = 0.9138
  No significant difference, fail to reject H0
Spectral Embedding: p-value = 0.9285
  No significant difference, fail to reject H0
