In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.preprocessing import StandardScaler

In [None]:
# Fetch the Iris bunch once and unpack the pieces the rest of the notebook
# uses: feature matrix, class labels, class names and feature names.
iris = load_iris()
data, target = iris.data, iris.target
target_names, feature_names = iris.target_names, iris.feature_names

In [None]:
# Show the feature names loaded from the Iris bunch (rendered as the cell output).
feature_names

In [None]:
# # Standardize the dataset
# scaler = StandardScaler()
# data_standardized = scaler.fit_transform(data)

# # Compute the covariance matrix of the standardized data
# covariance_matrix = np.cov(data_standardized.T)

# # Print the covariance matrix
# print("Covariance Matrix of Standardized Data:")
# print(covariance_matrix)

In [None]:
# Remove the per-feature mean so that every column of the data has zero mean.
data_centered = data - data.mean(axis=0)

# Features live in columns, so tell np.cov that rows are observations
# (equivalent to passing the transposed matrix).
covariance_matrix = np.cov(data_centered, rowvar=False)

# Report the feature-by-feature covariance matrix.
print("Covariance Matrix of Centered Data:")
print(covariance_matrix)

In [None]:
# Project the centered data onto its first two principal components
# (two dimensions are enough for a scatter plot).
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data_centered)

# Gather both components and the class label of every sample in one table.
df = pd.DataFrame({
    'PC1': pca_result[:, 0],
    'PC2': pca_result[:, 1],
    'Target': target,
})



In [None]:
# Plot PCA results
%matplotlib qt

plt.figure(figsize=(10, 8))
colors = ['red', 'green', 'blue']
for i, target_name in enumerate(target_names):
    plt.scatter(
        df[df['Target'] == i]['PC1'],
        df[df['Target'] == i]['PC2'],
        color=colors[i],
        label=target_name,
        alpha=0.7,
        edgecolors='k'
    )

# Add labels and title
plt.title("PCA of Iris Dataset", fontsize=16)
plt.legend()
plt.grid(alpha=0.3)
plt.show()


In [None]:
# Compute eigenvalues and eigenvectors.
# The covariance matrix is symmetric, so use eigh: it is the solver intended
# for symmetric/Hermitian matrices and always returns real eigenvalues,
# whereas the general-purpose eig gives no such guarantee.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

# Sort eigenvalues and eigenvectors in descending order
# (eigh reports them in ascending order).
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Compute the principal components: project the centered data onto the two
# eigenvectors with the largest eigenvalues.
# NOTE: the sign of an eigenvector is arbitrary, so this plot may be mirrored
# along an axis compared with the result of another PCA implementation.
principal_components = np.dot(data_centered, eigenvectors[:, :2])

# Create a DataFrame for visualization
df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
df['Target'] = target

# Plot PCA results
plt.figure(figsize=(10, 8))
colors = ['red', 'green', 'blue']
for i, target_name in enumerate(target_names):
    # Select the rows of this class once and reuse them for both axes.
    class_points = df[df['Target'] == i]
    plt.scatter(
        class_points['PC1'],
        class_points['PC2'],
        color=colors[i],
        label=target_name,
        alpha=0.7,
        edgecolors='k'
    )

# Add labels and title
plt.title("PCA of Iris Dataset (Using Eigenvalues and Eigenvectors)", fontsize=16)
plt.xlabel("Principal Component 1", fontsize=12)
plt.ylabel("Principal Component 2", fontsize=12)
plt.legend()
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Show the covariance-matrix eigenvalues (sorted in descending order where they are computed).
eigenvalues

# TSNE

In [None]:
# Embed the centered data in two dimensions with t-SNE
# (perplexity 15, fixed seed).
tsne = TSNE(n_components=2, perplexity=15, random_state=42)
tsne_results = tsne.fit_transform(data_centered)

# Keep the embedding and the class labels side by side for plotting.
df_tsne = pd.DataFrame(tsne_results, columns=['Dim1', 'Dim2']).assign(Target=target)

# Scatter plot of the embedding, one colour per class.
plt.figure(figsize=(10, 8))
colors = ['red', 'green', 'blue']
for class_idx, (class_name, class_color) in enumerate(zip(target_names, colors)):
    members = df_tsne.loc[df_tsne['Target'] == class_idx]
    plt.scatter(
        members['Dim1'],
        members['Dim2'],
        color=class_color,
        label=class_name,
        alpha=0.7,
        edgecolors='k',
    )

# Title only: axis labels are intentionally left off for this plot.
plt.title("t-SNE of Iris Dataset (Perplexity = 15)", fontsize=16)
plt.legend()
plt.grid(alpha=0.3)
plt.show()


# UMAP

In [None]:
# Embed the centered data in two dimensions with UMAP (fixed seed).
umap_settings = dict(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_reducer = umap.UMAP(**umap_settings)
umap_results = umap_reducer.fit_transform(data_centered)

# Keep the embedding and the class labels side by side for plotting.
df_umap = pd.DataFrame(umap_results, columns=['UMAP1', 'UMAP2']).assign(Target=target)

# Scatter plot of the embedding, one colour per class.
plt.figure(figsize=(10, 8))
colors = ['red', 'green', 'blue']
for class_idx, (class_name, class_color) in enumerate(zip(target_names, colors)):
    members = df_umap.loc[df_umap['Target'] == class_idx]
    plt.scatter(
        members['UMAP1'],
        members['UMAP2'],
        color=class_color,
        label=class_name,
        alpha=0.7,
        edgecolors='k',
    )

# Title, axis labels, legend and a light grid.
plt.title("UMAP of Iris Dataset", fontsize=16)
plt.xlabel("UMAP Dimension 1", fontsize=12)
plt.ylabel("UMAP Dimension 2", fontsize=12)
plt.legend()
plt.grid(alpha=0.3)
plt.show()


# Star glyphs

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import load_iris
import pandas as pd

# Load the Iris dataset into a DataFrame with a readable species column.
# Everything is taken from the freshly loaded bunch so that this cell does not
# depend on variables created by earlier cells.
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target
iris_df['species'] = iris_df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

# Species shown in this figure. The row filter and the plot title are both
# derived from this single value so they cannot drift apart.
glyph_species = 'virginica'

# Filter data for the selected class
virginica_data = iris_df[iris_df['species'] == glyph_species]


# Create the star glyph visualization
def plot_star_glyphs(data, features, ax):
    """Draw every row of `data` as a closed star glyph on a polar axis.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations to draw; must contain the columns named in `features`.
    features : list of str
        Columns to use. Their order fixes the order of the rays.
    ax : matplotlib polar axes
        Axes that receive one outline (`plot`) and one filled area (`fill`)
        per observation.

    Notes
    -----
    Each observation is divided by its own largest feature value, so the
    longest ray of every glyph has length 1.
    """
    # Numeric feature values only, one row per observation.
    feature_values = data[features].to_numpy(dtype=float)

    # One ray per feature, evenly spaced; repeat the first angle to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False)
    closed_angles = np.append(angles, angles[0])

    # Plot each data point as a star glyph
    for i, values in zip(data.index, feature_values):
        values_norm = values / np.max(values)  # Normalize by the row maximum
        closed_values = np.append(values_norm, values_norm[0])

        ax.plot(closed_angles, closed_values, label=f'Point {i}', marker='o')
        ax.fill(closed_angles, closed_values, alpha=0.2)


# Create a polar plot
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': 'polar'})

# Plot the selected class as star glyphs
plot_star_glyphs(virginica_data, iris.feature_names, ax)

ax.set_xticks(np.linspace(0, 2 * np.pi, len(iris.feature_names), endpoint=False))
ax.set_xticklabels(iris.feature_names)
ax.set_title(f'Star Glyphs Representation for {glyph_species.capitalize()} Class (Iris Dataset)', fontsize=14)
plt.show()


In [None]:
# NOTE(review): `max_sepal_length_data` is first assigned in a later cell of this
# notebook, so on a fresh kernel (Restart & Run All) this cell raises NameError.
max_sepal_length_data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Reload Iris and keep the raw pieces under their usual names.
iris = load_iris()
feature_names, data, target = iris.feature_names, iris.data, iris.target

# Table of measurements plus a 'species' column holding the class name.
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
iris_df = pd.DataFrame(data=data, columns=feature_names)
iris_df['species'] = pd.Series(target).map(species_by_code)

# Keep only the virginica rows.
is_virginica = iris_df['species'] == 'virginica'
virginica_data = iris_df[is_virginica]

# Row (as a Series) of the virginica flower with the largest sepal length.
longest_sepal_idx = virginica_data['sepal length (cm)'].idxmax()
max_sepal_length_data = virginica_data.loc[longest_sepal_idx]

# Order in which the features are laid out around the glyph.
ordered_features = [
    'sepal length (cm)',
    'petal width (cm)',
    'petal length (cm)',
    'sepal width (cm)',
]

# Create the star glyph visualization for one observation
def plot_single_star_glyph(data_row, features, ax, label_offset=-0.1):
    """Draw one observation as a closed star glyph and label every corner.

    Parameters
    ----------
    data_row : pandas.Series
        One observation, indexable by the list `features`. Extra entries
        (for example a text column) are ignored.
    features : list of str
        Features to draw; their order fixes the order of the rays.
    ax : matplotlib polar axes
        Receives one `plot`, one `fill` and one `text` call per feature.
    label_offset : float, optional
        Radial shift of the value labels relative to their corner. Negative
        values move the text toward the centre, positive values away from it.
        The default of -0.1 places the labels just inside the polygon.
    """
    # Pull the requested features in the requested order. Cast explicitly:
    # a row taken from a frame that also has text columns is object-typed.
    values = np.asarray(data_row[features], dtype=float)

    # Scale by the largest feature so that the longest ray has length 1.
    values_norm = values / np.max(values)

    # One evenly spaced angle per feature.
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False)

    # Repeat the first vertex at the end so the outline is closed.
    closed_angles = np.append(angles, angles[0])
    closed_values = np.append(values_norm, values_norm[0])

    # Plot the star glyph
    ax.plot(closed_angles, closed_values, label='Max Sepal Length', marker='o')
    ax.fill(closed_angles, closed_values, alpha=0.2)

    # Annotate each corner with the original (un-normalized) value.
    for angle, value, norm_value in zip(angles, values, values_norm):
        ax.text(
            angle, norm_value + label_offset,
            f"{value:.2f}",
            ha='center', va='center', fontsize=10, color='blue'
        )

# A single polar axis hosts the glyph.
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))

# Draw the virginica observation selected for its maximal sepal length.
plot_single_star_glyph(max_sepal_length_data, ordered_features, ax)

# One tick per feature, labelled with the feature name.
tick_angles = np.linspace(0, 2 * np.pi, len(ordered_features), endpoint=False)
ax.set_xticks(tick_angles)
ax.set_xticklabels(ordered_features)
ax.set_title('Star Glyph Representation for Virginica Observation with Max Sepal Length', fontsize=14)
plt.legend(loc='upper right')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
feature_names = iris.feature_names
data = iris.data
target = iris.target

# Create a DataFrame for the dataset
iris_df = pd.DataFrame(data=data, columns=feature_names)
iris_df['species'] = target
iris_df['species'] = iris_df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

# Filter data for the Setosa class (this cell plots a setosa observation,
# so the subset is named after the class it actually holds).
setosa_data = iris_df[iris_df['species'] == 'setosa']

# Select the setosa observation with the maximum sepal length
max_sepal_length_data = setosa_data.loc[setosa_data['sepal length (cm)'].idxmax()]

# Define the desired feature order (the order of the rays around the glyph)
ordered_features = ['sepal length (cm)', 'petal width (cm)', 'petal length (cm)', 'sepal width (cm)']

# Create the star glyph visualization for one observation
def plot_single_star_glyph(data_row, features, ax, label_offset=0.05):
    """Draw one observation as a closed star glyph and label every corner.

    Parameters
    ----------
    data_row : pandas.Series
        One observation, indexable by the list `features`. Extra entries
        (for example a text column) are ignored.
    features : list of str
        Features to draw; their order fixes the order of the rays.
    ax : matplotlib polar axes
        Receives one `plot`, one `fill` and one `text` call per feature.
    label_offset : float, optional
        Radial shift of the value labels relative to their corner. Positive
        values move the text away from the centre, negative values toward it.
        The default of 0.05 places the labels just outside the polygon.
    """
    # Pull the requested features in the requested order. Cast explicitly:
    # a row taken from a frame that also has text columns is object-typed.
    values = np.asarray(data_row[features], dtype=float)

    # Scale by the largest feature so that the longest ray has length 1.
    values_norm = values / np.max(values)

    # One evenly spaced angle per feature.
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False)

    # Repeat the first vertex at the end so the outline is closed.
    closed_angles = np.append(angles, angles[0])
    closed_values = np.append(values_norm, values_norm[0])

    # Plot the star glyph
    ax.plot(closed_angles, closed_values, label='Max Sepal Length', marker='o')
    ax.fill(closed_angles, closed_values, alpha=0.2)

    # Annotate each corner with the original (un-normalized) value.
    for angle, value, norm_value in zip(angles, values, values_norm):
        ax.text(
            angle, norm_value + label_offset,
            f"{value:.2f}",
            ha='center', va='center', fontsize=10, color='blue'
        )

# A single polar axis hosts the glyph.
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))

# Draw the observation selected for its maximal sepal length.
plot_single_star_glyph(max_sepal_length_data, ordered_features, ax)

# One tick per feature, labelled with the feature name.
tick_angles = np.linspace(0, 2 * np.pi, len(ordered_features), endpoint=False)
ax.set_xticks(tick_angles)
ax.set_xticklabels(ordered_features)
ax.set_title('Star Glyph Representation for Setosa Observation with Max Sepal Length', fontsize=14)
plt.legend(loc='upper right')
plt.show()

# UMAP & Star Glyphs

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import load_iris
import pandas as pd
import umap

# Iris measurements with a readable species column.
iris = load_iris()
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Series(iris.target).map(species_by_code)

# 2-D UMAP embedding of the centered data (fixed seed).
umap_model = umap.UMAP(n_components=2, min_dist=0.1, n_neighbors=15, random_state=42)
umap_result = umap_model.fit_transform(data_centered)

# Embedding coordinates plus the species name and an integer species code
# (the code is used to pick a palette colour).
species_map = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
umap_df = (
    pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])
    .assign(species=iris_df['species'])
    .assign(species_int=lambda frame: frame['species'].map(species_map))
)

# Function to plot polygon glyphs on the UMAP plot
def plot_umap_polygon_glyphs(data, features, ax, size_factor=0.5):
    """Draw a small feature polygon at the embedding position of every row.

    `data` supplies the 'UMAP1'/'UMAP2' coordinates and the 'species_int'
    colour code of each point; the feature values themselves are looked up by
    index label in the notebook-level `iris_df`. Each polygon has one vertex
    per feature, at a distance proportional to the feature value divided by
    the row maximum and multiplied by `size_factor`.
    """
    # Directions of the polygon vertices, with the first one repeated to close the shape.
    spoke_angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False)
    closed_angles = np.append(spoke_angles, spoke_angles[0])
    fill_colors = sns.color_palette("Set2")

    for point_label, point in data.iterrows():
        # Feature values of this sample, scaled so the largest equals 1.
        raw_values = iris_df.loc[point_label, features].values
        scaled_values = raw_values / np.max(raw_values)

        # Vertex radii (closed), shrunk by the size factor.
        radii = np.append(scaled_values, scaled_values[0])
        radii *= size_factor

        # Centre the polygon on the embedding coordinates of the sample.
        centre_x, centre_y = point['UMAP1'], point['UMAP2']
        outline_x = centre_x + np.cos(closed_angles) * radii
        outline_y = centre_y + np.sin(closed_angles) * radii

        # Filled shape in the species colour, with a black outline on top.
        ax.fill(outline_x, outline_y, alpha=0.7, color=fill_colors[point['species_int']])
        ax.plot(outline_x, outline_y, alpha=0.7, color='black')

    # Leave a margin of one unit around the embedding.
    ax.set_xlim(data['UMAP1'].min() - 1, data['UMAP1'].max() + 1)
    ax.set_ylim(data['UMAP2'].min() - 1, data['UMAP2'].max() + 1)

# Figure that hosts both the glyphs and the scatter overlay.
fig, ax = plt.subplots(figsize=(8, 8))

# Glyph layer: one polygon per sample, drawn at 0.3 of the unit size.
plot_umap_polygon_glyphs(umap_df, iris.feature_names, ax, size_factor=0.3)

# Scatter layer on the same axes; it also provides the species legend.
sns.scatterplot(
    data=umap_df,
    x='UMAP1',
    y='UMAP2',
    hue='species',
    palette='Set2',
    legend='full',
    ax=ax,
)

ax.set_title('UMAP Visualization of Iris Dataset with Smaller Polygon Glyphs', fontsize=14)
plt.show()


In [None]:
# Bounding box of the 2-D embedding: column-wise minimum and maximum
# (column 0 is the x axis, column 1 the y axis).
x_min, y_min = umap_result.min(axis=0)
x_max, y_max = umap_result.max(axis=0)
print(x_min, x_max)
print(y_min, y_max)

In [None]:
# Number of grid samples along each axis.
num_grid_points = 100

# Evenly spaced coordinates spanning the bounding box, then the full 2-D mesh.
x_vals, y_vals = (
    np.linspace(lower, upper, num_grid_points)
    for lower, upper in ((x_min, x_max), (y_min, y_max))
)
xx, yy = np.meshgrid(x_vals, y_vals)

# Sanity check of the mesh and axis shapes.
print(yy.shape)
print(y_vals.shape)

In [None]:
%matplotlib qt

plt.figure(figsize=(10, 8))
# Visualize the grid on top of the t-SNE data
plt.scatter(umap_result[:, 0], umap_result[:, 1], c='blue', s=10, label="UMAP Output")
plt.scatter(xx, yy, c='red', s=5, label="Grid Points")
plt.title("2D t-SNE Output with Grid Points")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
# plt.grid(True)
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split

# Define the MLP inverse_model
class NNinv(nn.Module):
    """Fully connected network from `input_size` inputs to `output_size` outputs.

    Four hidden layers of width 64, 128, 256 and 512, each followed by ReLU,
    then a linear output layer followed by a sigmoid, so every output lies
    strictly between 0 and 1.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # Build Linear+ReLU pairs for the hidden layers, chaining the widths.
        hidden_widths = (64, 128, 256, 512)
        modules = []
        fan_in = input_size
        for width in hidden_widths:
            modules.append(nn.Linear(fan_in, width))
            modules.append(nn.ReLU())
            fan_in = width

        # Output layer (from the fourth hidden layer) squashed by a sigmoid.
        modules.append(nn.Linear(fan_in, output_size))
        modules.append(nn.Sigmoid())

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run `x` through the stacked layers."""
        return self.layers(x)


In [None]:
# The inverse network ends in a sigmoid, so it can only emit values in (0, 1).
# Mean-centred features contain negative values that such a network can never
# reproduce, so rescale every feature to [0, 1] before using it as a target.
# float32 matches the dtype of the network parameters.
feature_min = data_centered.min(axis=0)
feature_range = data_centered.max(axis=0) - feature_min
feature_range = np.where(feature_range == 0, 1.0, feature_range)  # guard constant features
data_minmax = ((data_centered - feature_min) / feature_range).astype(np.float32)

# Inputs are the 2-D embedding, targets the rescaled features; the class labels
# are split alongside them and also used to stratify the split.
X_train, X_test, y_train, y_test, c_train, c_test = train_test_split(
    umap_result, data_minmax, iris.target,
    test_size=0.33, random_state=42, stratify=iris.target,
)


In [None]:
# Network dimensions, read from the training arrays:
# the embedding coordinates go in, the original features come out.
input_size = X_train.shape[1]
output_size = y_train.shape[1]

# Fix the PyTorch seed so that weight initialisation and batch shuffling
# are reproducible across runs.
torch.manual_seed(42)

# Create DataLoader for batch processing.
# Cast explicitly to float32: the network parameters are float32, and a
# float64 target would not match the dtype of the network output.
batch_size = 64
t_X_train = torch.tensor(X_train, dtype=torch.float32)
t_y_train = torch.tensor(y_train, dtype=torch.float32)
dataset = TensorDataset(t_X_train, t_y_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the inverse_model, loss function, and optimizer
inverse_model = NNinv(input_size, output_size)
loss_fn = nn.L1Loss()  # Mean Absolute Error (MAE)
optimizer = optim.Adam(inverse_model.parameters(), lr=0.001)



In [None]:
# Number of epochs to train
num_epochs = 500

# Training loop
inverse_model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, targets in dataloader:
        # Forward pass (targets are cast to the dtype of the network output
        # so that a float64 target array cannot break the loss computation)
        outputs = inverse_model(inputs)
        loss = loss_fn(outputs, targets.to(outputs.dtype))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Average loss over the batches of this epoch
    avg_loss = running_loss / len(dataloader)
    # print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

print("Training complete.")

# Evaluate on the held-out split. No gradients are needed here, so skip
# building the autograd graph.
t_X_test = torch.tensor(X_test, dtype=torch.float32)
t_y_test = torch.tensor(y_test, dtype=torch.float32)
inverse_model.eval()
with torch.no_grad():
    outputs_test = inverse_model(t_X_test)
    loss_test = loss_fn(outputs_test, t_y_test)

# L1Loss already averages over all elements (default reduction), so the value
# is reported as is; dividing by the number of test samples would shrink it
# a second time.
print(loss_test.item())

In [None]:
# Define grid resolution
num_grid_points = 100

# Generate grid covering the bounding box of the embedding
x_vals = np.linspace(x_min, x_max, num_grid_points)
y_vals = np.linspace(y_min, y_max, num_grid_points)
xx, yy = np.meshgrid(x_vals, y_vals)
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Spectral norm of the Jacobian of the inverse model at every grid point
jacobian_norms = np.zeros(len(grid_points))
for idx, point in enumerate(grid_points):
    # One sample with a leading batch axis: shape (1, n_inputs)
    point_tensor = torch.tensor(point, dtype=torch.float32).view(1, -1)

    # Compute the Jacobian for the current point.
    # Its shape is output shape + input shape, i.e. (1, n_outputs, 1, n_inputs).
    jacobian = torch.autograd.functional.jacobian(inverse_model, point_tensor)

    # Drop the two singleton batch axes to get the (n_outputs, n_inputs) matrix
    jacobian_2d = jacobian[0, :, 0, :]

    # Compute spectral norm (largest singular value)
    jacobian_norms[idx] = torch.linalg.norm(jacobian_2d, ord=2).item()

jacobian_norms = jacobian_norms.reshape(xx.shape)

# Plot the heatmap with the UMAP points overlaid
plt.figure(figsize=(10, 8))

# Overlay the embedded samples, one series per Iris class
c = iris.target
for i, class_name in enumerate(iris.target_names):
    plt.scatter(umap_result[c == i, 0], umap_result[c == i, 1], label=class_name, edgecolor=None)

# Plot heatmap (rows of jacobian_norms follow y, columns follow x)
plt.imshow(
    jacobian_norms,
    extent=(x_min, x_max, y_min, y_max),
    origin='lower',
    cmap='hot',
    alpha=1
)
plt.colorbar(label='Spectral Norm of Jacobian')

# Labels and title
plt.title("Overlaying UMAP points on Jacobian Heatmap")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.legend()
plt.show()



# Decision Boundary

In [None]:
# Importing necessary libraries
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from umap import UMAP

# Loading the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
target_names = iris.target_names

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scaling the features (important for SVC)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the SVC model with RBF kernel
svc_model = SVC(kernel='rbf', probability=True, random_state=42)
svc_model.fit(X_train_scaled, y_train)

# Reducing the dimensionality of the (standardized) training data with UMAP
umap_model = UMAP(n_components=2, random_state=42)
X_umap = umap_model.fit_transform(X_train_scaled)

# Generating a grid of points for visualization
x_min, x_max = X_umap[:, 0].min() - 1, X_umap[:, 0].max() + 1
y_min, y_max = X_umap[:, 1].min() - 1, X_umap[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Predicting the class probabilities for each grid point.
# UMAP was fitted on the standardized training data, so inverse_transform
# already returns points in that standardized space, which is what the SVC
# was trained on. Passing them through the scaler again would standardize
# them twice and feed distorted inputs to the classifier.
grid_points_scaled = umap_model.inverse_transform(grid_points)
probs = svc_model.predict_proba(grid_points_scaled)

# Reshaping the predicted probability of class 1 to the grid shape
shown_class = 1
probs_reshaped = probs[:, shown_class].reshape(xx.shape)

# Plotting the decision boundary
fig = go.Figure()

# Adding decision boundary trace
fig.add_trace(go.Contour(
    x=xx[0],
    y=yy[:, 0],
    z=probs_reshaped,
    colorscale='Viridis',  # You can change this to any other colorscale
    opacity=0.6,
    showscale=True,
    line_smoothing=0.8,
    colorbar=dict(title=f"P({target_names[shown_class]})")
))

# Adding training points trace
fig.add_trace(go.Scatter(
    x=X_umap[:, 0],
    y=X_umap[:, 1],
    mode='markers',
    marker=dict(
        color=y_train,
        colorscale='Viridis',
        size=10,
        # Shift this colour bar to the right so it does not sit on top of
        # the probability colour bar.
        colorbar=dict(title="Species", x=1.15)
    ),
    text=target_names[y_train],
    hoverinfo='text'
))

# Layout settings
fig.update_layout(
    title="Decision Boundary Map with SVC and UMAP",
    xaxis_title='UMAP Component 1',
    yaxis_title='UMAP Component 2',
    showlegend=False
)

fig.show()


In [None]:
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from scipy.linalg import svd

# Step 1: Load Iris dataset and preprocess it
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Standardize the data (important for UMAP)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply UMAP to the data.
# A fixed random_state makes the embedding, and therefore the heatmap built
# on top of it, reproducible from run to run.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
X_umap = umap_model.fit_transform(X_scaled)

# Step 3: Create a grid of points in the 2D UMAP space (one unit of margin)
x_min, x_max = X_umap[:, 0].min() - 1, X_umap[:, 0].max() + 1
y_min, y_max = X_umap[:, 1].min() - 1, X_umap[:, 1].max() + 1

# Define the resolution of the grid
grid_resolution = 50
xx, yy = np.meshgrid(np.linspace(x_min, x_max, grid_resolution), np.linspace(y_min, y_max, grid_resolution))

# Step 4: Define a function to estimate the Jacobian matrix over the grid using inverse UMAP
def jacobian_spectral_norm_grid_inverse(X_grid, model, epsilon=1e-5):
    """Spectral norm of the finite-difference Jacobian of `model.inverse_transform`.

    Parameters
    ----------
    X_grid : array-like of shape (n_points, n_low)
        Points in the low-dimensional (embedding) space.
    model : object with an `inverse_transform(points)` method
        Maps low-dimensional points back to the high-dimensional space and
        returns an array of shape (n_points, n_high).
    epsilon : float, optional
        Forward-difference step applied along each low-dimensional axis.
        NOTE(review): if the model works in single precision, a step this
        small may be dominated by rounding noise; consider a larger value.

    Returns
    -------
    numpy.ndarray of shape (n_points,)
        Largest singular value of the (n_high, n_low) Jacobian at every point.

    Notes
    -----
    The grid points already live in the embedding space, so they are passed
    straight to `inverse_transform`. They must not be sent through
    `model.transform`, which expects high-dimensional input.
    """
    X_grid = np.asarray(X_grid, dtype=float)
    n_points, n_low = X_grid.shape

    # Inverse image of the unperturbed points, computed once for all axes.
    base_inverse = np.asarray(model.inverse_transform(X_grid), dtype=float)

    # jacobians[p] is the (n_high, n_low) Jacobian estimate at point p.
    jacobians = np.empty((n_points, base_inverse.shape[1], n_low))
    for j in range(n_low):
        shifted = X_grid.copy()
        shifted[:, j] += epsilon  # Perturb along one embedding axis
        shifted_inverse = np.asarray(model.inverse_transform(shifted), dtype=float)

        # Forward difference gives column j of every Jacobian
        jacobians[:, :, j] = (shifted_inverse - base_inverse) / epsilon

    # Singular values come back in descending order; the first is the spectral norm.
    return np.linalg.svd(jacobians, compute_uv=False)[:, 0]

# Step 5: one (x, y) row per grid node, then the spectral norm at every node
grid_points = np.c_[xx.ravel(), yy.ravel()]
flat_norms = jacobian_spectral_norm_grid_inverse(grid_points, umap_model)

# Step 6: back to the 2-D layout of the mesh so it can be contoured
spectral_norms_grid = flat_norms.reshape(xx.shape)

# Step 7: filled contours of the norm with the embedded samples on top
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, spectral_norms_grid, 20, cmap='viridis')
plt.colorbar(label='Spectral Norm of Jacobian')
plt.title('Spectral Norm Heatmap of Jacobian (Inverse UMAP) for Iris Dataset')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
embedded_x, embedded_y = X_umap[:, 0], X_umap[:, 1]
plt.scatter(embedded_x, embedded_y, c='red', s=10, label='Data Points')
plt.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
import umap
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.manifold import trustworthiness




# Load the Iris dataset
iris = load_iris()
data = iris['data']
c = iris['target']

# Center the data (subtract the mean of each feature)
D = data - np.mean(data, axis=0)

# Prepare to calculate trustworthiness for different perplexities
perplexities = [2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 80, 90, 100]
trustworthiness_values = []

# Neighbourhood size used by the trustworthiness score.
# scikit-learn requires it to be smaller than n_samples / 2.
trust_n_neighbors = 70

method = "tsne"

# Display names of the supported methods; validating up front turns a typo in
# `method` into a clear error instead of a stale or undefined `reducer`.
method_names = {"tsne": "t-SNE", "umap": "UMAP", "pca": "PCA"}
if method not in method_names:
    raise ValueError(f"Unknown method {method!r}; expected one of {sorted(method_names)}")
method_name = method_names[method]

for perplexity in perplexities:
    if method == "tsne":
        reducer = manifold.TSNE(n_components=2, perplexity=perplexity, init="random", random_state=0)
    elif method == "umap":
        # For UMAP the swept value is used as the neighbourhood size.
        reducer = umap.UMAP(n_components=2, n_neighbors=perplexity, min_dist=0.1, init="random", random_state=0)
    else:  # "pca"
        # PCA is imported in the notebook's first cell. It takes no perplexity,
        # so every iteration produces the same projection.
        reducer = PCA(n_components=2, random_state=0)

    S = reducer.fit_transform(D)

    # Calculate trustworthiness between the original high-dimensional data and the reduced 2D data
    trust = manifold.trustworthiness(D, S, n_neighbors=trust_n_neighbors)
    trustworthiness_values.append(trust)

# Plot trustworthiness for different perplexities
plt.figure(figsize=(10, 6))
plt.plot(perplexities, trustworthiness_values, marker='o')
plt.title(f"Trustworthiness vs Perplexity for {method_name}")
plt.xlabel("Perplexity")
plt.ylabel("Trustworthiness")
plt.grid(True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from scipy.linalg import norm

# Step 1: load Iris and pull out the feature matrix and the class labels
data = load_iris()
features, labels = data['data'], data['target']
n_samples, n_features = features.shape

# Rescale every feature to the [0, 1] range
scaler = MinMaxScaler()
norm_features = scaler.fit_transform(features)

# Step 2: lay the samples out on the smallest square grid that can hold them.
# Samples are placed in row-major order; cells beyond the last sample stay zero.
grid_size = int(np.ceil(np.sqrt(n_samples)))
grid = np.zeros((grid_size, grid_size, n_features))
grid_labels = np.zeros((grid_size, grid_size))

# Write through flattened views of the freshly allocated arrays: sample k
# lands in cell divmod(k, grid_size).
grid.reshape(-1, n_features)[:n_samples] = norm_features
grid_labels.reshape(-1)[:n_samples] = labels

# Step 3: storage for the spectral norm computed at every grid cell
jacobian_norms = np.zeros((grid_size, grid_size))

def compute_jacobian(grid, i, j):
    """Stack the feature differences between grid cell (i, j) and its 4-neighbours.

    Parameters
    ----------
    grid : numpy.ndarray of shape (n_rows, n_cols, n_features)
        Feature vector stored at every grid cell.
    i, j : int
        Row and column of the cell of interest.

    Returns
    -------
    numpy.ndarray of shape (n_features, n_neighbours)
        One column per existing neighbour, in the order top, bottom, left,
        right. Each column is `neighbour - grid[i, j]`. Cells on an edge or
        corner have fewer neighbours and therefore fewer columns.
    """
    # Take the bounds from the array itself rather than from a notebook-level
    # variable, so the function works for any (also non-square) grid.
    n_rows, n_cols = grid.shape[:2]
    centre = grid[i, j]

    neighbor_offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))  # top, bottom, left, right
    differences = [
        grid[i + di, j + dj] - centre
        for di, dj in neighbor_offsets
        if 0 <= i + di < n_rows and 0 <= j + dj < n_cols
    ]

    return np.array(differences).T

# Spectral norm (ord=2) of the neighbour-difference matrix at every grid cell,
# visited in row-major order.
for i, j in np.ndindex(grid_size, grid_size):
    jacobian_norms[i, j] = norm(compute_jacobian(grid, i, j), ord=2)

# Step 4: show the per-cell norms as a heatmap
plt.figure(figsize=(8, 6))
plt.imshow(jacobian_norms, interpolation='nearest', cmap='viridis')
plt.colorbar(label='Spectral Norm of Jacobian')
plt.title('Heatmap of Spectral Norm of Jacobian (Iris Dataset)')
plt.xlabel('Grid X-axis')
plt.ylabel('Grid Y-axis')
plt.show()


# Trustworthiness & Continuity

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
import umap
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.manifold import trustworthiness




# Iris features and class labels
iris = load_iris()
data, c = iris['data'], iris['target']

# Subtract the per-feature mean
D = data - data.mean(axis=0)

# Embedding settings: t-SNE with perplexity 15, random initialisation, fixed seed
method = "tsne"
perplexity = 15
reducer = manifold.TSNE(
    n_components=2,
    perplexity=perplexity,
    init="random",
    random_state=0,
)

# Settings shared with the quality measures computed further down
n_neighbors = 7
metric = "euclidean"

# High-dimensional data and its 2-D embedding
X = D
X_emb = reducer.fit_transform(X)

# Trustworthiness of the embedding with respect to the original data
trust = manifold.trustworthiness(X, X_emb, n_neighbors=n_neighbors)

trust



In [None]:
from zadu_measure.trustworthiness_continuity import measure_tnc
from zadu_measure.neighborhood_hit import measure_nh
from zadu_measure.mean_relative_rank_error import measure_mrre
from zadu_measure.spearman_rho import measure_srcc
from zadu_measure.pearson_r import measure_pcc
from zadu_measure.distance_to_measure import measure_dtm
from zadu_measure.class_aware_trustworthiness_continuity import measure_catnc
from zadu_measure.clustering_and_external_validation_measure import measure_cevmv
from zadu_measure.distance_consistency import measure_dsc
from zadu_measure.internal_validation_measure import measure_ivmv
from zadu_measure.kl_divergence import measure_kl
from zadu_measure.label_trustworthiness_and_continuity import measure_ltnc
from zadu_measure.local_continuity_meta_criteria import measure_lcmc
from zadu_measure.neighbor_dissimilarity import measure_nd
from zadu_measure.non_metric_stress import measure_nms
from zadu_measure.procrustes import measure_procrustes
from zadu_measure.stress import measure_stress
from zadu_measure.scale_normalized_stress import measure_norm_stress
from zadu_measure.topographic_product import measure_topographic
from zadu_measure.projection_precision_score import projection_precision_score_v1
from zadu_measure.neighborhood_preservation_precision import projection_precision_score_common_neig
from zadu_measure.average_local_error import average_local_error


In [None]:
# Evaluate the embedding with every imported quality measure.
# All neighbourhood-based measures share the notebook-wide `n_neighbors`, so they
# stay comparable with the sklearn trustworthiness computed with the same value.
score_tnc = measure_tnc(X, X_emb, k=n_neighbors, knn_ranking_info=None, return_local=False)
score_nh = measure_nh(X_emb, c, k=n_neighbors, knn_emb_info=None, return_local=False)
score_mrre = measure_mrre(X, X_emb, k=n_neighbors, knn_ranking_info=None, return_local=False)
score_srcc = measure_srcc(X, X_emb, distance_matrices=None)
score_spcc = measure_pcc(X, X_emb, distance_matrices=None)  # Pearson r, despite the "spcc" name
score_dtm = measure_dtm(X, X_emb, sigma=0.1, distance_matrices=None)
score_catnc = measure_catnc(X, X_emb, c, k=n_neighbors, knn_ranking_info=None, return_local=False)
score_cevmv = measure_cevmv(X_emb, c, measure="arand",  clustering="kmeans", clustering_args=None)
score_dsc = measure_dsc(X_emb, c)
score_ivmv = measure_ivmv(X_emb, c, measure="silhouette")
score_kl = measure_kl(X, X_emb, sigma=0.1, distance_matrices=None)
score_ltnc = measure_ltnc(X, X_emb, c, cvm="dsc")
score_lcmc = measure_lcmc(X, X_emb, k=n_neighbors, knn_info=None, return_local=False)
score_nd = measure_nd(X, X_emb, k=n_neighbors, snn_info=None, knn_info=None)
score_nms = measure_nms(X, X_emb, distance_matrices=None)
score_procrustes = measure_procrustes(X, X_emb, k=n_neighbors, knn_info=None)
score_stress = measure_stress(X, X_emb, distance_matrices=None)
score_norm_stress = measure_norm_stress(X, X_emb, distance_matrices=None)
score_topographic = measure_topographic(X, X_emb, k=n_neighbors, distance_matrices=None, knn_info=None)
score_pps = projection_precision_score_v1(X, X_emb, n_neighbors=n_neighbors)    # close to 0 is good
score_pps_v2 = projection_precision_score_common_neig(X, X_emb, n_neighbors=n_neighbors)  # close to 1 is good
score_ale = average_local_error(X, X_emb)  # close to 0 is good


In [None]:
# Print every score on its own line, in the order the measures were computed.
# The trustworthiness/continuity and neighborhood-hit results are indexed by key;
# all other results are printed as returned.
print(
    score_tnc['trustworthiness'],
    score_tnc['continuity'],
    score_nh['neighborhood_hit'],
    score_mrre,
    score_srcc,
    score_spcc,
    score_dtm,
    score_catnc,
    score_cevmv,
    score_dsc,
    score_ivmv,
    score_kl,
    score_ltnc,
    score_lcmc,
    score_nd,
    score_nms,
    score_procrustes,
    score_stress,
    score_norm_stress,
    score_topographic,
    score_pps,
    score_pps_v2,
    score_ale,
    sep="\n",
)

In [None]:

import seaborn as sns

def plot_heatmap_padded(ale_scores, title="Average Local Error Heatmap"):
    """
    Plot a heatmap of ALE scores, padding to a square grid if necessary.

    The scores are flattened and laid out row by row on the smallest square
    grid that can hold them. Unused trailing cells are filled with the mean
    score so they sit in the middle of the colour scale.

    Parameters:
        ale_scores (array-like): ALE scores for each point. Any shape is
            accepted; values are flattened in row-major order.
        title (str): Title for the heatmap.

    Raises:
        ValueError: If `ale_scores` contains no values.
    """
    # Work on a flat float copy: float so the mean used for padding is not
    # truncated for integer input, flat so `np.pad` sees a single axis.
    scores = np.asarray(ale_scores, dtype=float).ravel()
    n = scores.size
    if n == 0:
        raise ValueError("ale_scores must contain at least one value")

    size = int(np.ceil(np.sqrt(n)))  # Side of the smallest square grid
    padded_scores = np.pad(scores, (0, size**2 - n), constant_values=scores.mean())
    ale_grid = padded_scores.reshape((size, size))

    # Create a heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(ale_grid, cmap="coolwarm", annot=False, cbar=True)
    plt.title(title)
    plt.xlabel("X-axis")
    plt.ylabel("Y-axis")
    plt.show()

plot_heatmap_padded(score_ale, title="Average Local Error Heatmap")

In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

In [None]:
N = X.shape[0]  # Number of data points
    
# # Compute pairwise distances in the high-dimensional and low-dimensional spaces
distances_O = pairwise_distances(X)  # High-dimensional distances
distances_P = pairwise_distances(X_emb)  # Low-dimensional distances

In [None]:
# Get the indices of the n nearest neighbors in the high-dimensional space (excluding the point itself)
# nearest_neighbors_indices = np.argsort(distances_O[i])[:n_neighbors+1][1:]
nearest_neighbors_indices = np.argsort(distances_O[0])[:n_neighbors+1][1:]

In [None]:
nearest_neighbors_indices

In [None]:
# Compute distance vectors for high-dimensional and low-dimensional spaces
d_O = distances_O[0, nearest_neighbors_indices]
d_P = distances_P[0, nearest_neighbors_indices]

In [None]:
d_O

In [None]:
# Normalize the distance vectors to unit length
d_O_normalized = d_O / np.linalg.norm(d_O)
d_P_normalized = d_P / np.linalg.norm(d_P)


In [None]:
np.linalg.norm(d_O_normalized - d_P_normalized)

In [None]:
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

# Compute pairwise distances in the original space (high-dimensional)
dist_X = pairwise_distances(X, metric=metric)

In [None]:
X_emb.shape

In [None]:
dist_X.shape

In [None]:
np.fill_diagonal(dist_X, np.inf)
dist_X

In [None]:
# Sorting to find nearest neighbors in the high-dimensional space.
# Why axis=1: row i of dist_X holds the distances from point i to every point,
# so sorting along axis 1 ranks the columns within each row. Row i of ind_X then
# lists point indices from nearest to farthest as seen from point i.
ind_X = np.argsort(dist_X, axis=1)  # one independent ordering per row
ind_X

In [None]:
# Nearest neighbors in the low-dimensional embedded space
neigh = NearestNeighbors(n_neighbors=n_neighbors)
neigh.fit(X_emb)
ind_X_embedded = neigh.kneighbors(return_distance=False)

In [None]:
ind_X_embedded.shape

In [None]:
# Take the sample count from the distance matrix rather than hard-coding 150,
# so the rank table always matches the data that produced ind_X.
n_samples = dist_X.shape[0]
# n_samples = _num_samples(X)
# Square integer table, one row and one column per sample, initially all zeros.
inverted_index = np.zeros((n_samples, n_samples), dtype=int)
inverted_index.shape


In [None]:
ordered_indices = np.arange(n_samples + 1)
ordered_indices

In [None]:
# For every row i, write the values 1..n_samples into the columns listed in
# ind_X[i], in that order. Afterwards inverted_index[i, j] is the 1-based position
# of point j in row i's distance ordering (its rank in the high-dimensional space).
# ordered_indices[:-1, np.newaxis] is the column vector of row numbers 0..n_samples-1.
inverted_index[ordered_indices[:-1, np.newaxis], ind_X] = ordered_indices[1:]
# Alternative kept for reference: build the same table from the embedded-space neighbours.
# inverted_index[ordered_indices[:-1, np.newaxis], ind_X_embedded] = ordered_indices[1:]

In [None]:
inverted_index

In [None]:
ind_X_embedded[1]

In [None]:
ind_X[1][:10]  # below positive 3 represents outside the nearest neighbors in the high-dimensional space, 

In [None]:
# For each point, look up the high-dimensional rank of each of its embedded-space
# neighbours and subtract n_neighbors. An entry is positive exactly when that
# embedded neighbour ranks beyond n_neighbors in the high-dimensional ordering.
ranks = (
        inverted_index[ordered_indices[:-1, np.newaxis], ind_X_embedded] - n_neighbors
    )

In [None]:
temp = inverted_index[ordered_indices[:-1, np.newaxis], ind_X_embedded]
temp[1]

In [None]:
def calculate_continuity(high_dim_dt, low_dim_dt, n_neighbors=7):
    """
    Calculate the Continuity metric for the given high-dimensional data
    and its low-dimensional projection.

    Continuity penalises points that belong to the `n_neighbors` nearest
    neighbours of a sample in the original space but fall outside that
    neighbourhood in the projection. Each such point j of sample i adds
    (rank of j around i in the projection) - n_neighbors to the penalty.

    Parameters:
        high_dim_dt (numpy.ndarray): Original data, one row per sample.
        low_dim_dt (numpy.ndarray): Projected data, one row per sample, in
            the same order as `high_dim_dt`.
        n_neighbors (int): Neighbourhood size k.

    Returns:
        float: 1.0 when no original neighbour is lost; lower values mean
        more (or more severe) losses.

    Raises:
        ValueError: If `n_neighbors` is not in (0, n_samples / 2), where the
            normalisation is not meaningful, or if the two inputs have a
            different number of rows.
    """
    n = high_dim_dt.shape[0]
    if not 0 < n_neighbors < n / 2:
        raise ValueError(
            f"n_neighbors ({n_neighbors}) must be positive and less than "
            f"n_samples / 2 ({n / 2})"
        )

    # Compute pairwise distances in both spaces
    dist_X = pairwise_distances(high_dim_dt)
    dist_Y = pairwise_distances(low_dim_dt)
    if dist_Y.shape[0] != n:
        raise ValueError("high_dim_dt and low_dim_dt must have the same number of rows")

    # A sample is never its own neighbour: push the diagonal to the end of
    # every ordering so duplicates of a point cannot displace it from slot 0.
    np.fill_diagonal(dist_X, np.inf)
    np.fill_diagonal(dist_Y, np.inf)

    # k nearest neighbours in the original space, full ordering in the projection
    neighbors_X = np.argsort(dist_X, axis=1)[:, :n_neighbors]
    order_Y = np.argsort(dist_Y, axis=1)

    # rank_Y[i, j] = 1-based rank of point j among the neighbours of i in the
    # projection (computed once instead of re-sorting inside a loop).
    rows = np.arange(n)[:, np.newaxis]
    rank_Y = np.empty((n, n), dtype=int)
    rank_Y[rows, order_Y] = np.arange(1, n + 1)

    # Only original neighbours ranked beyond k in the projection are penalised,
    # each by how far past k it was pushed.
    excess = rank_Y[rows, neighbors_X] - n_neighbors
    penalty = excess[excess > 0].sum()

    # Normalize the penalty
    normalization_factor = n * n_neighbors * (2 * n - 3 * n_neighbors - 1)
    C = 1 - (2 / normalization_factor) * penalty
    return float(C)

calculate_continuity(X,X_emb)

In [None]:
def calculate_trustworthiness(high_dim_dt, low_dim_dt, n_neighbors=7):
    """
    Calculate the Trustworthiness metric for the given high-dimensional data
    and its low-dimensional projection.

    Trustworthiness penalises "false" neighbours: points that are among the
    `n_neighbors` nearest neighbours of a sample in the projection but not in
    the original space. Each such point j of sample i adds
    (rank of j around i in the original space) - n_neighbors to the penalty.

    Parameters:
        high_dim_dt (numpy.ndarray): Original data, one row per sample.
        low_dim_dt (numpy.ndarray): Projected data, one row per sample, in
            the same order as `high_dim_dt`.
        n_neighbors (int): Neighbourhood size k.

    Returns:
        float: 1.0 when the projection introduces no false neighbours; lower
        values mean more (or more distant) intruders.

    Raises:
        ValueError: If `n_neighbors` is not in (0, n_samples / 2), where the
            normalisation is not meaningful, or if the two inputs have a
            different number of rows.
    """
    n = high_dim_dt.shape[0]
    if not 0 < n_neighbors < n / 2:
        raise ValueError(
            f"n_neighbors ({n_neighbors}) must be positive and less than "
            f"n_samples / 2 ({n / 2})"
        )

    # Compute pairwise distances in both spaces
    dist_X = pairwise_distances(high_dim_dt)
    dist_Y = pairwise_distances(low_dim_dt)
    if dist_Y.shape[0] != n:
        raise ValueError("high_dim_dt and low_dim_dt must have the same number of rows")

    # A sample is never its own neighbour: push the diagonal to the end of
    # every ordering so duplicates of a point cannot displace it from slot 0.
    np.fill_diagonal(dist_X, np.inf)
    np.fill_diagonal(dist_Y, np.inf)

    # Full ordering in the original space, k nearest neighbours in the projection
    order_X = np.argsort(dist_X, axis=1)
    neighbors_Y = np.argsort(dist_Y, axis=1)[:, :n_neighbors]

    # rank_X[i, j] = 1-based rank of point j among the neighbours of i in the
    # original space (computed once instead of re-sorting inside a loop).
    rows = np.arange(n)[:, np.newaxis]
    rank_X = np.empty((n, n), dtype=int)
    rank_X[rows, order_X] = np.arange(1, n + 1)

    # Only projected neighbours ranked beyond k in the original space are
    # penalised, each by how far past k its true rank lies.
    excess = rank_X[rows, neighbors_Y] - n_neighbors
    penalty = excess[excess > 0].sum()

    # Normalize the penalty
    normalization_factor = n * n_neighbors * (2 * n - 3 * n_neighbors - 1)
    T = 1 - (2 / normalization_factor) * penalty
    return float(T)

# Example usage with X and X_emb.
# The result gets its own name so it does not overwrite the `trustworthiness`
# function imported from sklearn.manifold earlier in the notebook.
trust_manual = calculate_trustworthiness(X, X_emb)
trust_manual


In [None]:
ranks

In [None]:
ranks.shape

In [None]:
rank_j = ranks[ranks > 0]
rank_j

In [None]:
t = np.sum(ranks[ranks > 0])
t

In [None]:
jj = 1.0 - t * (
        2.0 / (n_samples * n_neighbors * (2.0 * n_samples - 3.0 * n_neighbors - 1.0))
    )
jj

In [None]:
# dist_X's diagonal was set to inf before sorting, so each row of ind_X already
# starts with the nearest *other* point; there is no leading self-index to skip.
ind_X[0, :n_neighbors]

In [None]:
ind_X_embedded[0]

In [None]:
set(ind_X[0][:7]) - set(ind_X_embedded[0])

In [None]:
# Example distance matrix (dist_X)
dist_X = np.array([
    [np.inf, 0.2, 0.5, 0.8],  # Distances from point 0 to others
    [0.2, np.inf, 0.3, 0.7],  # Distances from point 1 to others
    [0.5, 0.3, np.inf, 0.4],  # Distances from point 2 to others
    [0.8, 0.7, 0.4, np.inf]   # Distances from point 3 to others
])
dist_X

In [None]:
# Sort indices by distance for each point
ind_X = np.argsort(dist_X, axis=1)
ind_X

In [None]:
from datasets import gaussian_dt, iris_dt, digits_dt, covariance_type


In [None]:
D, c = covariance_type()

type(D)

In [None]:
import numpy as np
len(np.unique(c))
D.shape

In [None]:
from sklearn.datasets import load_iris, load_wine, load_digits, fetch_openml
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Function to fetch UCI datasets via OpenML
def fetch_uci_dataset(name, target_column):
    """
    Fetch a dataset from OpenML by name and return its features and labels.

    Parameters:
        name (str): Dataset name passed to `fetch_openml`.
        target_column (str or None): Name of the label column. None keeps
            whatever target `fetch_openml` returns.

    Returns:
        tuple: (X, y) - the feature frame without the label column, and the labels.

    Raises:
        KeyError: If `target_column` is neither a feature column nor the name
            of the target returned by `fetch_openml`.
    """
    data = fetch_openml(name=name, as_frame=True, parser="pandas")
    X = data.data
    y = data.target
    if target_column is None:
        return X, y

    if target_column in X.columns:
        # The label is still among the features: split it off.
        y = X[target_column]
        X = X.drop(columns=[target_column])
    elif getattr(y, "name", None) != target_column:
        # Neither a feature nor the returned target: keep failing loudly,
        # as the unconditional X[target_column] lookup used to.
        raise KeyError(target_column)
    # Otherwise fetch_openml already removed the column from the features and
    # returned it as the target, so X and y are correct as they are.
    return X, y

# Dictionary to dynamically load datasets.
# Every loader takes no arguments and must return an (X, y) pair, because the
# loading loop unpacks the result into exactly two values.
from sklearn.datasets import load_breast_cancer  # needed by the "breast_cancer" entry

dataset_loaders = {
    "iris": lambda: load_iris(return_X_y=True),
    "wine": lambda: load_wine(return_X_y=True),
    "digits": lambda: load_digits(return_X_y=True),
    "human_activity": lambda: fetch_uci_dataset("HumanActivityRecognitionUsingSmartphones", None),
    "glass": lambda: fetch_uci_dataset("glass", "Type"),
    # Previously this entry called load_iris, so "breast_cancer" held iris data.
    "breast_cancer": lambda: load_breast_cancer(return_X_y=True),
    "zoo": lambda: fetch_uci_dataset("zoo", "type"),
    "covertype": lambda: fetch_uci_dataset("covertype", "Cover_Type"),
    # return_X_y=True makes fetch_openml return (data, target); the default Bunch
    # cannot be unpacked into two values.
    "newsgroups": lambda: fetch_openml(data_id=110, as_frame=True, parser="pandas", return_X_y=True),  # Replace with text processing logic
}

# Storage for datasets: maps dataset name -> {"data": ndarray, "labels": ndarray}
datasets = {}

# Load each dataset dynamically.
# NOTE: this loop runs at the top level of the cell, so `X`, `y`, `name` and
# `loader` are notebook globals and keep the values of the last iteration.
for name, loader in dataset_loaders.items():
    print(f"Loading dataset: {name}")
    try:
        X, y = loader()
        # Ensure labels are numeric for consistency
        # (only pandas Series are re-encoded; other label types are stored as returned)
        if isinstance(y, pd.Series):
            y = LabelEncoder().fit_transform(y)
        datasets[name] = {"data": np.array(X), "labels": np.array(y)}
        print(f"Dataset {name} loaded successfully.")
    except Exception as e:
        # Best effort: a dataset that fails to load is reported and skipped,
        # so it is simply absent from `datasets`.
        print(f"Failed to load dataset {name}: {e}")

# Access data and labels.
# NOTE: `data` and `labels` are also rebound at notebook level here.
for dataset_name, dataset_content in datasets.items():
    data = dataset_content["data"]
    labels = dataset_content["labels"]
    print(f"Dataset: {dataset_name}, Data Shape: {data.shape}, Labels Shape: {labels.shape}")


In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Fetch the Breast Cancer dataset bundle
data = load_breast_cancer()

# Feature matrix and target vector
X, y = data.data, data.target

# Tabular view: one column per feature, with the label appended as 'class'
breast_cancer_df = pd.DataFrame(X, columns=data.feature_names)
breast_cancer_df['class'] = y

# Quick summary of the table
print("Dataset shape:", breast_cancer_df.shape)
print("Classes:", breast_cancer_df['class'].unique())
print("First five rows:", breast_cancer_df.head(), sep="\n")

# Number of samples per class
class_counts = breast_cancer_df['class'].value_counts()
print("\nClass distribution:", class_counts, sep="\n")


In [None]:
from PIL import Image
import os

def load_images_from_folder(folder_path, limit=30):
    """
    Collect the paths of PNG files found directly inside a folder.

    The extension check ignores case. Paths are returned in sorted order and
    cut to the first `limit` entries.

    Parameters:
    - folder_path (str): Folder to scan.
    - limit (int): Maximum number of paths to return.

    Returns:
    - list of str: Sorted PNG file paths, at most `limit` of them.

    Raises:
    - FileNotFoundError: If `folder_path` does not exist.
    """
    if os.path.exists(folder_path):
        png_paths = [
            os.path.join(folder_path, entry)
            for entry in os.listdir(folder_path)
            if entry.lower().endswith(".png")
        ]
        png_paths.sort()
        return png_paths[:limit]

    raise FileNotFoundError(f"Folder not found: {folder_path}")


def create_collage(image_files, output_file, images_per_row=4):
    """
    Combine multiple PNG files into a single image with the original resolution.

    Images are pasted left to right, top to bottom, on a white canvas. The
    cell size is taken from the first image; all images are assumed to share
    that size.

    Parameters:
    - image_files (list of str): Paths of the images to combine.
    - output_file (str): Path the collage is saved to.
    - images_per_row (int): Maximum number of images per row.

    Raises:
    - ValueError: If `image_files` is empty or `images_per_row` is less than 1.
    """
    if not image_files:
        raise ValueError("image_files must contain at least one image path")
    if images_per_row < 1:
        raise ValueError("images_per_row must be at least 1")

    images = []
    try:
        # Load images (inside the try so already-opened files are closed even
        # if a later one fails to open)
        for file in image_files:
            images.append(Image.open(file))

        # Get the size of the images (assuming all have the same size)
        image_width, image_height = images[0].size

        # Calculate the dimensions of the collage
        rows = (len(images) + images_per_row - 1) // images_per_row
        cols = min(len(images), images_per_row)
        collage_width = cols * image_width
        collage_height = rows * image_height

        # Create a blank canvas for the collage
        collage = Image.new("RGB", (collage_width, collage_height), "white")

        # Paste images into the collage
        for index, image in enumerate(images):
            row, col = divmod(index, images_per_row)
            x = col * image_width
            y = row * image_height
            collage.paste(image, (x, y))

        # Save the collage
        collage.save(output_file)
    finally:
        # Release the file handles held by the source images
        for image in images:
            image.close()

    print(f"Collage saved as {output_file}")

# Result set to assemble; the folder is relative to the working directory
dataset, method, num_dim = 'tetrahedron_eq_2_close', 'tsne', 200
# (for dataset = 'high_dim' the folder name is "{dataset}_{num_dim}" instead of "{dataset}")
output_folder = f"thesis_reproduced/testing_new/new_final_results/{dataset}/{method}_plots_new_model"

# Collect the PNG files; raises if the folder itself is missing
image_files = load_images_from_folder(output_folder)

if image_files:
    output_file = f"collage_{dataset}_{method}.png"
    create_collage(image_files, output_file, images_per_row=2)
else:
    print("No PNG files found in the specified folder!")


In [None]:
from PIL import Image
import os

def load_images_from_folder(folder_path, prefixes, limit=30):
    """
    Load PNG files from the specified folder that start with any of the given prefixes.

    File names are visited in sorted order. The ".png" check ignores case;
    the prefix check is case-sensitive.

    Parameters:
    - folder_path (str): Path to the folder containing PNG files.
    - prefixes (list of str): List of words that filenames should start with.
    - limit (int): Maximum number of images to load; 0 or less returns no files.

    Returns:
    - list of str: List of file paths to selected PNG images (up to the specified limit).

    Raises:
    - FileNotFoundError: If `folder_path` does not exist.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder not found: {folder_path}")

    # Checking the limit only after an append let limit=0 return one file.
    if limit <= 0:
        return []

    wanted = tuple(prefixes)  # str.startswith accepts a tuple of alternatives
    selected_files = []

    for file_name in sorted(os.listdir(folder_path)):  # Sort for consistency
        if file_name.lower().endswith(".png") and file_name.startswith(wanted):
            selected_files.append(os.path.join(folder_path, file_name))
            if len(selected_files) >= limit:  # Stop when limit is reached
                break

    return selected_files

from PIL import Image, ImageDraw, ImageFont

from PIL import Image, ImageDraw, ImageFont

def create_collage(image_files, output_file, images_per_row=4, title="Collage Title"):
    """
    Combine multiple PNG files into a single image with uniform size and add a title.

    Every image is resized to the smallest width and height found among the
    inputs and pasted row by row below a title band. The band is at least
    80 pixels high and grows when the rendered title needs more room, so the
    first image row never paints over the title.

    Parameters:
    - image_files (list of str): Paths of the images to combine.
    - output_file (str): Path the collage is saved to (written as PNG).
    - images_per_row (int): Number of columns in the collage.
    - title (str): Text centred in the title band.

    Raises:
    - ValueError: If `image_files` is empty or `images_per_row` is less than 1.
    """
    if not image_files:
        raise ValueError("image_files must contain at least one image path")
    if images_per_row < 1:
        raise ValueError("images_per_row must be at least 1")

    # Keep the source files open only long enough to make resized in-memory copies.
    opened = []
    try:
        for file in image_files:
            opened.append(Image.open(file))

        # Find the smallest width and height among all images
        min_width = min(img.width for img in opened)
        min_height = min(img.height for img in opened)

        # Resize images to the smallest found size
        images = [img.resize((min_width, min_height)) for img in opened]
    finally:
        for img in opened:
            img.close()

    # Compute collage dimensions
    num_images = len(images)
    cols = images_per_row
    rows = (num_images + cols - 1) // cols

    collage_width = cols * min_width
    collage_height = rows * min_height

    try:
        font = ImageFont.truetype("arial.ttf", 80)  # Load a font (adjust size)
    except IOError:
        font = ImageFont.load_default()  # Use default font if not found

    # Measure the title on a throwaway 1x1 canvas so the band can be sized
    # before the real canvas exists.
    text_y = 20  # Padding from the top
    probe = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    text_bbox = probe.textbbox((0, 0), title, font=font)
    text_width = text_bbox[2] - text_bbox[0]

    # text_bbox[3] is how far the ink reaches below the drawing origin; leave the
    # same padding under the text as above it, and never go below 80 pixels.
    title_height = max(80, text_y + text_bbox[3] + text_y)
    total_height = collage_height + title_height

    # Create a blank canvas with extra space for title
    collage = Image.new("RGB", (collage_width, total_height), "white")

    # Add the title, centred horizontally
    draw = ImageDraw.Draw(collage)
    text_x = (collage_width - text_width) // 2
    draw.text((text_x, text_y), title, fill="black", font=font)

    # Paste images below the title
    for index, image in enumerate(images):
        row, col = divmod(index, cols)
        x = col * min_width
        y = title_height + row * min_height  # Offset by title height
        collage.paste(image, (x, y))

    # Save the final collage with high quality
    collage.save(output_file, format="PNG", dpi=(300, 300))

    print(f"✅ Collage saved as {output_file}")



# def create_collage(image_files, output_file, images_per_row=4):
#     """
#     Combine multiple PNG files into a single image with uniform size.
#     """
#     # Load images and resize to the smallest found size (to avoid overlapping)
#     images = [Image.open(file) for file in image_files]
    
#     # Find the smallest width and height among all images (to maintain uniformity)
#     min_width = min(img.width for img in images)
#     min_height = min(img.height for img in images)
    
#     # Resize all images to the smallest found size
#     images = [img.resize((min_width, min_height)) for img in images]

#     # Calculate the collage dimensions
#     num_images = len(images)
#     cols = images_per_row  # Fixed number of columns
#     rows = (num_images + cols - 1) // cols  # Compute necessary rows

#     collage_width = cols * min_width
#     collage_height = rows * min_height

#     # Create a blank white canvas
#     collage = Image.new("RGB", (collage_width, collage_height), "white")

#     # Paste images into the collage
#     for index, image in enumerate(images):
#         row, col = divmod(index, cols)
#         x = col * min_width
#         y = row * min_height
#         collage.paste(image, (x, y))

#     # Save the final collage
#     # collage.save(output_file, quality=95)
#     collage.save(output_file, format="PNG", dpi=(300, 300))

#     print(f"✅ Collage saved as {output_file}")


# Result set to assemble; the folder is relative to the working directory
dataset, method, num_dim = 'tetrahedron_eq_2_close', 'tsne', 200
output_folder = f"thesis_reproduced/testing_new/new_final_results/{dataset}/{method}_plots_new_model"

# Only plots whose file name starts with one of these words are included
prefix_list = [
    "HD_clust_distance",
    "spectral_norm",
    "barycentric_interpolation",
    'ftle_interpolate',
    'delaunay_tri_MAX',
    'Fully_connected',
]

# Collect the matching PNG files; raises if the folder itself is missing
image_files = load_images_from_folder(output_folder, prefix_list)

if image_files:
    output_file = f"collage_{dataset}_{method}.png"
    create_collage(image_files, output_file, images_per_row=4, title=dataset)  # 4 columns
else:
    print("❌ No PNG files found in the specified folder!")


In [14]:
from PIL import Image, ImageDraw, ImageFont
import os

from PIL import Image
import os

def load_images_from_folder(folder_path, prefixes, limit=30):
    """
    Load PNG files from the specified folder that start with any of the given prefixes.

    File names are considered in sorted order. The ".png" check ignores case;
    the prefix check is case-sensitive.

    Parameters:
    - folder_path (str): Path to the folder containing PNG files.
    - prefixes (list of str): List of words that filenames should start with.
    - limit (int): Maximum number of images to load; 0 or less returns no files.

    Returns:
    - list of str: List of file paths to selected PNG images (up to the specified limit).

    Raises:
    - FileNotFoundError: If `folder_path` does not exist.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder not found: {folder_path}")

    wanted = tuple(prefixes)  # str.startswith accepts a tuple of alternatives
    matches = [
        os.path.join(folder_path, file_name)
        for file_name in sorted(os.listdir(folder_path))  # Sort for consistency
        if file_name.lower().endswith(".png") and file_name.startswith(wanted)
    ]

    # Slicing enforces the limit exactly. The earlier append-then-check loop
    # returned one file even for limit=0.
    return matches[:max(limit, 0)]

def create_collage(image_files, output_file, images_per_row=4, title="Collage Title"):
    """
    Create a collage from images and add a title and individual figure labels.

    Every image is resized to the smallest width and height found among the
    inputs, pasted row by row under a 100-pixel title band, and captioned
    with its file name without the extension (cut to 20 characters plus
    "..." when longer than 23).

    Parameters:
    - image_files (list): List of image file paths.
    - output_file (str): Path to save the final collage.
    - images_per_row (int): Number of images per row.
    - title (str): Title for the collage.

    Raises:
    - ValueError: If `image_files` is empty or `images_per_row` is less than 1.
    """
    if not image_files:
        raise ValueError("image_files must contain at least one image path")
    if images_per_row < 1:
        raise ValueError("images_per_row must be at least 1")

    # Keep the source files open only long enough to make resized in-memory copies.
    opened = []
    try:
        for file in image_files:
            opened.append(Image.open(file))

        # Find the smallest width and height among all images
        min_width = min(img.width for img in opened)
        min_height = min(img.height for img in opened)

        # Resize images to uniform size
        images = [img.resize((min_width, min_height)) for img in opened]
    finally:
        for img in opened:
            img.close()

    # Compute collage dimensions
    num_images = len(images)
    cols = images_per_row
    rows = (num_images + cols - 1) // cols

    collage_width = cols * min_width
    title_height = 100  # Space for the main title
    figure_label_height = 40  # Space for each figure label
    collage_height = rows * (min_height + figure_label_height)  # Add space for labels
    total_height = collage_height + title_height  # Total height including main title

    # Create a blank canvas with space for title and labels
    collage = Image.new("RGB", (collage_width, total_height), "white")

    # Draw title
    draw = ImageDraw.Draw(collage)
    try:
        font_title = ImageFont.truetype("arial.ttf", 40)  # Title font
        font_label = ImageFont.truetype("arial.ttf", 30)  # Label font
    except IOError:
        font_title = ImageFont.load_default()
        font_label = ImageFont.load_default()

    # Get text size for the main title
    text_bbox = draw.textbbox((0, 0), title, font=font_title)
    text_width = text_bbox[2] - text_bbox[0]
    text_x = (collage_width - text_width) // 2
    text_y = 20  # Title at the top
    draw.text((text_x, text_y), title, fill="black", font=font_title)

    # Paste images and add labels
    for index, (image, file_path) in enumerate(zip(images, image_files)):
        row, col = divmod(index, cols)
        x = col * min_width
        y = title_height + row * (min_height + figure_label_height)  # Adjust for label

        collage.paste(image, (x, y))

        # File name without its extension. splitext removes only the final
        # suffix, so names containing dots (e.g. "..._0.5.png") stay intact.
        label = os.path.splitext(os.path.basename(file_path))[0]
        label = label[:20] + "..." if len(label) > 23 else label  # Trim long names

        # Get label text size and center it
        label_bbox = draw.textbbox((0, 0), label, font=font_label)
        label_width = label_bbox[2] - label_bbox[0]
        label_x = x + (min_width - label_width) // 2
        label_y = y + min_height + 5  # Position below image
        draw.text((label_x, label_y), label, fill="black", font=font_label)

    # Save the final collage
    collage.save(output_file, format="PNG", dpi=(300, 300))
    print(f"✅ Collage saved as {output_file}")

# # Example Usage
# output_file = f"collage_{dataset}_{method}.png"
# create_collage(image_files, output_file, images_per_row=4, title=dataset)


# Result set to assemble; the folder is relative to the working directory
dataset = 'digits'  # alternative: 'high_dim'
method = 'tsne'
num_dim = 500

# Results for 'high_dim' live in a folder suffixed with the dimension, and the
# collage title and file name carry the same suffix. Every other dataset uses
# its plain name.
if dataset != 'high_dim':
    output_folder = f"thesis_reproduced/testing_new/new_final_results/{dataset}/{method}_plots_new_model"
else:
    output_folder = f"thesis_reproduced/testing_new/new_final_results/{dataset}_{num_dim}/{method}_plots_new_model"
    dataset = f'{dataset}_{num_dim}'

# Only plots whose file name starts with one of these words are included
prefix_list = [
    "HD_clust_distance",
    "spectral_norm_t-SNE",
    "barycentric",
    'Delanay_with_selected_sub',
    'Fully_sub_connected',
]

# Collect the matching PNG files; raises if the folder itself is missing
image_files = load_images_from_folder(output_folder, prefix_list)

if image_files:
    output_file = f"collage_{dataset}_{method}.png"
    create_collage(image_files, output_file, images_per_row=4, title=dataset)  # 4 columns
else:
    print("❌ No PNG files found in the specified folder!")



✅ Collage saved as collage_digits_tsne.png


In [None]:
dataset = 'gaussian'
method = 'tsne'
prj_metric_path_1 = f"thesis_reproduced/testing_new/final_results/{dataset}/{method}_plots_old_model/hd_ld_metrics/{dataset}_{method}_prj_metrics_hd_ld.pkl"
prj_metric_path_2 = f"thesis_reproduced/testing_new/final_results/Upload_documents_final_results/{dataset}/{method}_plots_old_model/hd_ld_metrics/{dataset}_{method}_prj_metrics_hd_ld.pkl"
jacob_path_1 = f"thesis_reproduced/testing_new/final_results/{dataset}/{method}_plots_old_model/jacobian_norms/{dataset}_{method}_jacobian_norm.pkl"
jacob_path_2 = f"thesis_reproduced/testing_new/final_results/Upload_documents_final_results/{dataset}/{method}_plots_old_model/jacobian_norms/{dataset}_{method}_jacobian_norm.pkl"

In [None]:
# NOTE: the path below was pasted into this cell as a bare line, which is a
# syntax error. It is kept as a comment; presumably it is the folder on the
# original machine that holds this metrics file (confirm before relying on it).
# E:\inverse_projection_visual_analytics\thesis_reproduced\testing_new\final_results\Upload_documents_final_results\gaussian\tsne_plots_old_model\hd_ld_metrics
prj_metric_path_2

In [None]:
from utility import *

prj_metric_1 = load_metrics(prj_metric_path_1)
jacob_norm_1 = load_metrics(jacob_path_1)
prj_metric_2 = load_metrics(prj_metric_path_2)
jacob_norm_2 = load_metrics(jacob_path_2)

In [None]:
jacob_norm

In [None]:
prj_metric

In [None]:
import numpy as np
dist= np.array([[0.        , 0.0612345 , 0.06683991, 0.08736833, 0.0391083 ],
       [0.0612345 , 0.        , 0.0586919 , 0.04931701, 0.07375501],
       [0.06683991, 0.0586919 , 0.        , 0.03876263, 0.05326245],
       [0.08736833, 0.04931701, 0.03876263, 0.        , 0.08626064],
       [0.0391083 , 0.07375501, 0.05326245, 0.08626064, 0.        ]])

In [None]:
np.mean(dist)

In [None]:
list = [[0.        , 0.0612345 , 0.06683991, 0.08736833, 0.0391083 ,
        1.06945709, 1.05681305, 1.07188269, 1.03027446],
       [0.0612345 , 0.        , 0.0586919 , 0.04931701, 0.07375501,
        1.0862432 , 1.07366997, 1.08710053, 1.04968977],
       [0.06683991, 0.0586919 , 0.        , 0.03876263, 0.05326245,
        1.07375474, 1.06170735, 1.07692937, 1.03563016],
       [0.08736833, 0.04931701, 0.03876263, 0.        , 0.08626064,
        1.06964567, 1.05753809, 1.07147128, 1.03337137],
       [0.0391083 , 0.07375501, 0.05326245, 0.08626064, 0.        ,
        1.0895343 , 1.07721707, 1.09320604, 1.04960552]]

In [None]:
np.mean(list)