In [1]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import SpectralEmbedding

In [2]:
#Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
classifiers = {
    'LDA': LinearDiscriminantAnalysis(),
    'KNN (K=1)': KNeighborsClassifier(n_neighbors=1),
    'KNN (K=3)': KNeighborsClassifier(n_neighbors=3),
    'KNN (K=5)': KNeighborsClassifier(n_neighbors=5),
    'SVM Linear': SVC(kernel='linear'),
    'SVM RBF': SVC(kernel='rbf'),
    'Random Forest': RandomForestClassifier(),
    'Multinomial Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000) 
}

# No Embedding

In [4]:
X_train = train_data.iloc[:, 1:].values
y_train = train_data.iloc[:, 0].values
X_test = test_data.values

n_samples = 500
X_subset = X_train[:n_samples]
y_subset = y_train[:n_samples]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_subset)
X_test_scaled = scaler.transform(X_test)

In [5]:
X_train_split, X_test, y_train_split, y_test = train_test_split(X_subset, y_subset, test_size=0.3, random_state=42)

In [6]:
no_embedding_accuracies = {}

for name, clf in classifiers.items():
    clf.fit(X_subset, y_subset)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    no_embedding_accuracies[name] = accuracy
    print(f"{name} - No Embedding: Accuracy = {accuracy*100:.2f}%")

LDA - No Embedding: Accuracy = 100.00%
KNN (K=1) - No Embedding: Accuracy = 100.00%
KNN (K=3) - No Embedding: Accuracy = 93.33%
KNN (K=5) - No Embedding: Accuracy = 92.67%
SVM Linear - No Embedding: Accuracy = 100.00%
SVM RBF - No Embedding: Accuracy = 97.33%
Random Forest - No Embedding: Accuracy = 100.00%
Multinomial Logistic Regression - No Embedding: Accuracy = 100.00%


In [7]:
no_embedding_mean_std = {
    name: f"{np.mean(accs):.2f} ± {np.std(accs):.2f}"
    for name, accs in no_embedding_accuracies.items()}

In [8]:
no_embedding_df = pd.DataFrame([no_embedding_mean_std], index=['No Embedding'])
no_embedding_df

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,1.00 ± 0.00,1.00 ± 0.00,0.93 ± 0.00,0.93 ± 0.00,1.00 ± 0.00,0.97 ± 0.00,1.00 ± 0.00,1.00 ± 0.00


# MDS

In [9]:
metrics = [True]
eps_values = [0.001, 0.01, 0.1]
embeddings = {}

for metric_value in metrics:
    for eps in eps_values:
        embedding = MDS(n_components=10, metric=metric_value, eps=eps)
        train_transformed = embedding.fit_transform(X_train_scaled)
        embeddings[(metric_value, eps)] = train_transformed



In [10]:
mds_accuracy_results = {name: [] for name in classifiers.keys()}
for (metric_value, eps), train_transformed in embeddings.items():
    X_train_transformed, X_test_transformed, y_train_subset, y_test_subset = train_test_split(
        train_transformed, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_transformed, y_train_subset)
        y_pred = clf.predict(X_test_transformed)
        accuracy = accuracy_score(y_test_subset, y_pred)
        mds_accuracy_results[name].append(accuracy)
       # print(f"{name} - MDS (metric={metric_value}, eps={eps}): Accuracy = {accuracy*100:.2f}%")

In [11]:
mds_mean_std_accuracy_results = {}
for name, acc_list in mds_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    mds_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

mds_df_10 = pd.DataFrame([mds_mean_std_accuracy_results], index=['MDS'])
mds_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
MDS,0.73 ± 0.01,0.77 ± 0.00,0.75 ± 0.03,0.76 ± 0.00,0.73 ± 0.02,0.80 ± 0.01,0.75 ± 0.01,0.74 ± 0.04


In [12]:
metrics = [True]
eps_values = [0.001, 0.01, 0.1]
embeddings = {}

for metric_value in metrics:
    for eps in eps_values:
        embedding = MDS(n_components=100, metric=metric_value, eps=eps)
        train_transformed = embedding.fit_transform(X_train_scaled)
        embeddings[(metric_value, eps)] = train_transformed

mds_accuracy_results = {name: [] for name in classifiers.keys()}
for (metric_value, eps), train_transformed in embeddings.items():
    X_train_transformed, X_test_transformed, y_train_subset, y_test_subset = train_test_split(
        train_transformed, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_transformed, y_train_subset)
        y_pred = clf.predict(X_test_transformed)
        accuracy = accuracy_score(y_test_subset, y_pred)
        mds_accuracy_results[name].append(accuracy)
       # print(f"{name} - MDS (metric={metric_value}, eps={eps}): Accuracy = {accuracy*100:.2f}%")

mds_mean_std_accuracy_results = {}
for name, acc_list in mds_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    mds_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

mds_df_100 = pd.DataFrame([mds_mean_std_accuracy_results], index=['MDS'])
mds_df_100



Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
MDS,0.73 ± 0.07,0.73 ± 0.03,0.72 ± 0.06,0.72 ± 0.04,0.81 ± 0.05,0.84 ± 0.02,0.72 ± 0.06,0.78 ± 0.08


# Iso Map

In [13]:
n_neighbors_values = [10, 20, 30, 40]
n_components_values = [10]
embeddings = {}

transformed_isomap_data_list = []
for n_neighbors in n_neighbors_values:
    for n_components in n_components_values:
        iso = Isomap(n_neighbors=n_neighbors, n_components=n_components)
        subset_train_transformed = iso.fit_transform(X_train_scaled)  
        embeddings[(n_neighbors, n_components)] = subset_train_transformed
        transformed_isomap_data_list.append((n_neighbors, n_components, subset_train_transformed))

In [14]:
isomap_accuracy_results = {name: [] for name in classifiers.keys()}
for n_neighbors_val, n_components_val, transformed_data in transformed_isomap_data_list:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(transformed_data, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_split, y_train_split)
        y_pred = clf.predict(X_test_split)
        accuracy = accuracy_score(y_test_split, y_pred)
        isomap_accuracy_results[name].append(accuracy)
        #print(f"{name} with Isomap (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")

In [15]:
isomap_mean_std_accuracy_results = {}
for name, acc_list in isomap_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    isomap_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

isomap_df_10 = pd.DataFrame([isomap_mean_std_accuracy_results], index=['Isomap'])
isomap_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Isomap,0.77 ± 0.02,0.75 ± 0.01,0.75 ± 0.01,0.76 ± 0.01,0.76 ± 0.01,0.81 ± 0.02,0.80 ± 0.03,0.75 ± 0.01


In [16]:
n_neighbors_values = [10, 20, 30, 40]
n_components_values = [100]
embeddings = {}

transformed_isomap_data_list = []
for n_neighbors in n_neighbors_values:
    for n_components in n_components_values:
        iso = Isomap(n_neighbors=n_neighbors, n_components=n_components)
        subset_train_transformed = iso.fit_transform(X_train_scaled)  
        embeddings[(n_neighbors, n_components)] = subset_train_transformed
        transformed_isomap_data_list.append((n_neighbors, n_components, subset_train_transformed))

isomap_accuracy_results = {name: [] for name in classifiers.keys()}
for n_neighbors_val, n_components_val, transformed_data in transformed_isomap_data_list:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(transformed_data, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_split, y_train_split)
        y_pred = clf.predict(X_test_split)
        accuracy = accuracy_score(y_test_split, y_pred)
        isomap_accuracy_results[name].append(accuracy)

isomap_mean_std_accuracy_results = {}
for name, acc_list in isomap_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    isomap_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

isomap_df_100 = pd.DataFrame([isomap_mean_std_accuracy_results], index=['Isomap'])
isomap_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Isomap,0.74 ± 0.01,0.71 ± 0.02,0.67 ± 0.03,0.67 ± 0.04,0.79 ± 0.03,0.84 ± 0.01,0.78 ± 0.01,0.78 ± 0.02


# T-SNE

In [17]:
n_perplexity_values = [1, 3, 5, 7]
n_components_values = [2, 3]

transformed_tsne_data_list = []

for perplexity_val in n_perplexity_values:
    for n_components_val in n_components_values:
        tsne = TSNE(n_components=n_components_val, perplexity=perplexity_val, random_state=42)
        transformed_tsne_data = tsne.fit_transform(X_train_scaled)
        transformed_tsne_data_list.append((perplexity_val, n_components_val, transformed_tsne_data))

In [18]:
tsne_accuracy_results = {name: [] for name in classifiers.keys()}

for perplexity_val, n_components_val, transformed_data in transformed_tsne_data_list:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(transformed_data, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_split, y_train_split) 
        y_pred = clf.predict(X_test_split)
        accuracy = accuracy_score(y_test_split, y_pred)  
        tsne_accuracy_results[name].append(accuracy)  

        #print(f"{name} with t-SNE (perplexity={perplexity_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")

In [19]:
tsne_mean_std_accuracy_results = {}
for name, acc_list in tsne_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    tsne_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

tsne_df = pd.DataFrame([tsne_mean_std_accuracy_results], index=['t-SNE'])
tsne_df

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
t-SNE,0.43 ± 0.15,0.79 ± 0.02,0.72 ± 0.04,0.71 ± 0.08,0.51 ± 0.16,0.64 ± 0.18,0.79 ± 0.03,0.45 ± 0.16


# LLE

In [20]:
n_neighbors_values = [ 3, 5, 7, 10]
n_components_values = [10]

transformed_lle_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values: 
        lle = LocallyLinearEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val, eigen_solver='dense', random_state=42)
        transformed_lle_data = lle.fit_transform(X_train_scaled)
        transformed_lle_data_list.append((n_neighbors_val, n_components_val, transformed_lle_data))

In [21]:
lle_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_lle_data_list:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(transformed_data, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_split, y_train_split) 
        y_pred = clf.predict(X_test_split)
        accuracy = accuracy_score(y_test_split, y_pred)  
        lle_accuracy_results[name].append(accuracy)  # Correct variable

       # print(f"{name} with LLE (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")

In [22]:
lle_mean_std_accuracy_results = {}
for name, acc_list in lle_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    lle_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

lle_df_10 = pd.DataFrame([lle_mean_std_accuracy_results], index=['LLE'])
lle_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
LLE,0.59 ± 0.06,0.70 ± 0.03,0.67 ± 0.03,0.65 ± 0.05,0.08 ± 0.01,0.64 ± 0.08,0.72 ± 0.02,0.39 ± 0.03


In [23]:
n_neighbors_values = [ 3, 5, 7, 10]
n_components_values = [100]

transformed_lle_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values: 
        lle = LocallyLinearEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val, eigen_solver='dense', random_state=42)
        transformed_lle_data = lle.fit_transform(X_train_scaled)
        transformed_lle_data_list.append((n_neighbors_val, n_components_val, transformed_lle_data))
        
lle_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_lle_data_list:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(transformed_data, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_split, y_train_split) 
        y_pred = clf.predict(X_test_split)
        accuracy = accuracy_score(y_test_split, y_pred)  
        lle_accuracy_results[name].append(accuracy) 

lle_mean_std_accuracy_results = {}
for name, acc_list in lle_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    lle_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

lle_df_100 = pd.DataFrame([lle_mean_std_accuracy_results], index=['LLE'])
lle_df_100


Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
LLE,0.84 ± 0.02,0.75 ± 0.00,0.74 ± 0.01,0.75 ± 0.02,0.41 ± 0.00,0.82 ± 0.02,0.81 ± 0.01,0.65 ± 0.02


# Spectral 

In [24]:
n_neighbors_values = [ 3, 5, 7, 10]
n_components_values = [10]

transformed_spectral_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        
        spectral = SpectralEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val, random_state=42)
        transformed_spectral_data = spectral.fit_transform(X_subset)
        transformed_spectral_data_list.append((n_neighbors_val, n_components_val, transformed_spectral_data))



In [25]:
spectral_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_spectral_data_list:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(transformed_data, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_split, y_train_split)
        y_pred = clf.predict(X_test_split)
        accuracy = accuracy_score(y_test_split, y_pred)
        spectral_accuracy_results[name].append(accuracy)  # Use the correct variable for storing results

        #print(f"{name} with Spectral Embedding (n_neighbors={n_neighbors_val}, n_components={n_components_val}) - Accuracy: {accuracy*100:.2f}%")


In [26]:
spectral_mean_std_accuracy_results = {}
for name, acc_list in spectral_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    spectral_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"

In [27]:
spectral_df_10 = pd.DataFrame([spectral_mean_std_accuracy_results], index=['Spectral Embedding'])
spectral_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Spectral Embedding,0.70 ± 0.06,0.79 ± 0.02,0.81 ± 0.03,0.79 ± 0.04,0.07 ± 0.00,0.79 ± 0.06,0.81 ± 0.03,0.16 ± 0.07


In [28]:
n_neighbors_values = [ 3, 5, 7, 10]
n_components_values = [10]

transformed_spectral_data_list = []

for n_neighbors_val in n_neighbors_values:
    for n_components_val in n_components_values:
        
        spectral = SpectralEmbedding(n_neighbors=n_neighbors_val, n_components=n_components_val, random_state=42)
        transformed_spectral_data = spectral.fit_transform(X_subset)
        transformed_spectral_data_list.append((n_neighbors_val, n_components_val, transformed_spectral_data))

spectral_accuracy_results = {name: [] for name in classifiers.keys()}

for n_neighbors_val, n_components_val, transformed_data in transformed_spectral_data_list:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(transformed_data, y_subset, test_size=0.3, random_state=42)
    
    for name, clf in classifiers.items():
        clf.fit(X_train_split, y_train_split)
        y_pred = clf.predict(X_test_split)
        accuracy = accuracy_score(y_test_split, y_pred)
        spectral_accuracy_results[name].append(accuracy)

spectral_mean_std_accuracy_results = {}
for name, acc_list in spectral_accuracy_results.items():
    mean_accuracy = np.mean(acc_list)
    std_accuracy = np.std(acc_list)
    spectral_mean_std_accuracy_results[name] = f"{mean_accuracy:.2f} ± {std_accuracy:.2f}"
    
spectral_df_100 = pd.DataFrame([spectral_mean_std_accuracy_results], index=['Spectral Embedding'])
spectral_df_100



Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
Spectral Embedding,0.70 ± 0.06,0.79 ± 0.02,0.81 ± 0.03,0.79 ± 0.04,0.07 ± 0.00,0.79 ± 0.06,0.82 ± 0.02,0.16 ± 0.07


In [29]:
combined_df_10 = pd.concat([no_embedding_df, mds_df_10, isomap_df_10, tsne_df, lle_df_10, spectral_df_10])
combined_df_10

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,1.00 ± 0.00,1.00 ± 0.00,0.93 ± 0.00,0.93 ± 0.00,1.00 ± 0.00,0.97 ± 0.00,1.00 ± 0.00,1.00 ± 0.00
MDS,0.73 ± 0.01,0.77 ± 0.00,0.75 ± 0.03,0.76 ± 0.00,0.73 ± 0.02,0.80 ± 0.01,0.75 ± 0.01,0.74 ± 0.04
Isomap,0.77 ± 0.02,0.75 ± 0.01,0.75 ± 0.01,0.76 ± 0.01,0.76 ± 0.01,0.81 ± 0.02,0.80 ± 0.03,0.75 ± 0.01
t-SNE,0.43 ± 0.15,0.79 ± 0.02,0.72 ± 0.04,0.71 ± 0.08,0.51 ± 0.16,0.64 ± 0.18,0.79 ± 0.03,0.45 ± 0.16
LLE,0.59 ± 0.06,0.70 ± 0.03,0.67 ± 0.03,0.65 ± 0.05,0.08 ± 0.01,0.64 ± 0.08,0.72 ± 0.02,0.39 ± 0.03
Spectral Embedding,0.70 ± 0.06,0.79 ± 0.02,0.81 ± 0.03,0.79 ± 0.04,0.07 ± 0.00,0.79 ± 0.06,0.81 ± 0.03,0.16 ± 0.07


In [30]:
combined_df_100 = pd.concat([no_embedding_df, mds_df_100, isomap_df_100, tsne_df, lle_df_100, spectral_df_100])
combined_df_100

Unnamed: 0,LDA,KNN (K=1),KNN (K=3),KNN (K=5),SVM Linear,SVM RBF,Random Forest,Multinomial Logistic Regression
No Embedding,1.00 ± 0.00,1.00 ± 0.00,0.93 ± 0.00,0.93 ± 0.00,1.00 ± 0.00,0.97 ± 0.00,1.00 ± 0.00,1.00 ± 0.00
MDS,0.73 ± 0.07,0.73 ± 0.03,0.72 ± 0.06,0.72 ± 0.04,0.81 ± 0.05,0.84 ± 0.02,0.72 ± 0.06,0.78 ± 0.08
Isomap,0.74 ± 0.01,0.71 ± 0.02,0.67 ± 0.03,0.67 ± 0.04,0.79 ± 0.03,0.84 ± 0.01,0.78 ± 0.01,0.78 ± 0.02
t-SNE,0.43 ± 0.15,0.79 ± 0.02,0.72 ± 0.04,0.71 ± 0.08,0.51 ± 0.16,0.64 ± 0.18,0.79 ± 0.03,0.45 ± 0.16
LLE,0.84 ± 0.02,0.75 ± 0.00,0.74 ± 0.01,0.75 ± 0.02,0.41 ± 0.00,0.82 ± 0.02,0.81 ± 0.01,0.65 ± 0.02
Spectral Embedding,0.70 ± 0.06,0.79 ± 0.02,0.81 ± 0.03,0.79 ± 0.04,0.07 ± 0.00,0.79 ± 0.06,0.82 ± 0.02,0.16 ± 0.07


In [31]:
### H_0: Embedded performance is less than or equal to no-embedding performance
### H_a: Embedded performance is better than no embedding performance 

In [32]:
from scipy.stats import ttest_ind
techniques_10 = combined_df_10.index.tolist()
techniques_100 = combined_df_100.index.tolist()

In [45]:
def perform_one_sided_ttest(no_embedding_means, technique_means):
    t_stat, p_value = ttest_ind( technique_means,no_embedding_means, equal_var=False, alternative = 'less')
    #t_stat, p_value = ttest_ind(no_embedding_means, technique_means, equal_var=False, alternative = 'less')
    #if t_stat > 0:
        #p_value /= 2
    #else:
        #p_value = 1 - (p_value / 2)
    return p_value

ttest_results_10 = {}
for technique in techniques_10:
    if technique == 'No Embedding':
        continue
    
    # Extract the mean accuracies as floats
    no_embedding_means = combined_df_10.loc['No Embedding'].str.split(' ± ').str[0].astype(float)
    technique_means = combined_df_10.loc[technique].str.split(' ± ').str[0].astype(float)
    
    # Perform the one-sided t-test
    p_value = perform_one_sided_ttest(technique_means, no_embedding_means)
    ttest_results_10[technique] = p_value

ttest_results_100 = {}
for technique in techniques_100:
    if technique == 'No Embedding':
        continue
    
    # Extract the mean accuracies as floats
    no_embedding_means = combined_df_100.loc['No Embedding'].str.split(' ± ').str[0].astype(float)
    technique_means = combined_df_100.loc[technique].str.split(' ± ').str[0].astype(float)
    
    # Perform the one-sided t-test
    p_value = perform_one_sided_ttest(technique_means, no_embedding_means)
    ttest_results_100[technique] = p_value

In [46]:
print("One-Sided T-Test Results for 10 Components:")
for technique, p_value in ttest_results_10.items():
    print(f"{technique}: p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("  Significant difference at p < 0.05, reject H0")
    else:
        print("  No significant difference, fail to reject H0")

print("\nOne-Sided T-Test Results for 100 Components:")
for technique, p_value in ttest_results_100.items():
    print(f"{technique}: p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("  Significant difference at p < 0.05, reject H0")
    else:
        print("  No significant difference, fail to reject H0")


One-Sided T-Test Results for 10 Components:
MDS: p-value = 1.0000
  No significant difference, fail to reject H0
Isomap: p-value = 1.0000
  No significant difference, fail to reject H0
t-SNE: p-value = 0.9999
  No significant difference, fail to reject H0
LLE: p-value = 0.9996
  No significant difference, fail to reject H0
Spectral Embedding: p-value = 0.9935
  No significant difference, fail to reject H0

One-Sided T-Test Results for 100 Components:
MDS: p-value = 1.0000
  No significant difference, fail to reject H0
Isomap: p-value = 1.0000
  No significant difference, fail to reject H0
t-SNE: p-value = 0.9999
  No significant difference, fail to reject H0
LLE: p-value = 0.9995
  No significant difference, fail to reject H0
Spectral Embedding: p-value = 0.9933
  No significant difference, fail to reject H0
