In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
dim = 3
n_gauss = 3
n_pts_per_gauss = 300
np.random.seed(5)

centers = np.zeros((n_gauss,dim))
for i in range(1,n_gauss):
    centers[i] = np.random.randint(0,2,3)
    
print(centers)

cov_m = [np.diag([0.01 for i in range(dim)]),np.diag([0.1 if i%2 !=0 else 0.1 for i in range(dim)])]

D = np.zeros((n_pts_per_gauss*n_gauss,dim))
c = np.zeros(n_pts_per_gauss*n_gauss)
for i in range(dim):
    k = np.random.randint(0,2,1)[0]
    D[i*n_pts_per_gauss:(i+1)*n_pts_per_gauss] = np.random.multivariate_normal(centers[i],cov_m[k],n_pts_per_gauss)
    breakpoint()
    c[i*n_pts_per_gauss:(i+1)*n_pts_per_gauss] = i
breakpoint()    
D = (D-np.min(D,axis=0))/(np.max(D,axis=0)-np.min(D,axis=0))
print(D.shape)
print(c.shape)

[[0. 0. 0.]
 [1. 0. 1.]
 [1. 0. 0.]]
(900, 3)
(900,)


# Apply Projection 3D --> 2D

### PCA

In [3]:
# Apply PCA to the 3D Gaussian data
pca_model = PCA(
    n_components=2,
    random_state=0,
    )


# Apply PCA on the 3D Gaussian data `D`
p = pca_model.fit_transform(D)

In [4]:
print(D.shape)
print(p.shape)

(900, 3)
(900, 2)


In [5]:
# Plotting the PCA results with the same color scheme
%matplotlib qt

colors = ['r', 'g', 'b']  # Red, Green, Blue
plt.figure(figsize=(10, 8))
for i in range(n_gauss):
    plt.scatter(p[c == i, 0], p[c == i, 1], color=colors[i], label=f'Gaussian {i+1}')

plt.title('PCA Visualization of 3D Gaussian Distributions into 2D')
plt.legend()
plt.show()

# Inverse Projection 2D --> 3D

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split

## Inverse Projection using Neural Network

In [7]:
# Define the MLP model
class NNinv(nn.Module):
    def __init__(self, input_size, output_size):
        super(NNinv, self).__init__()
        
        # Define the layers
        self.layers = nn.Sequential(
            nn.Linear(input_size, 64),  # Input to first hidden layer
            nn.ReLU(),
            nn.Linear(64, 128),  # First hidden layer to second hidden layer
            nn.ReLU(),
            nn.Linear(128, 256),  # Second hidden layer to third hidden layer
            nn.ReLU(),
            nn.Linear(256, 512),  # Third hidden layer to fourth hidden layer
            nn.ReLU(),
            nn.Linear(512, output_size),  # Fifth hidden layer to output
            nn.Sigmoid()  # Output layer with sigmoid activation
        )
    
    def forward(self, x):
        return self.layers(x)


In [8]:
# X_train, X_test, y_train, y_test = train_test_split(S, D, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test, c_train, c_test = train_test_split(p, D,c, test_size=0.33, random_state=42)

### Model Training

In [9]:
# Example usage
input_size = 2  # Example input size (can be changed)
output_size = dim   # Binary classification (sigmoid output for single output)
model = NNinv(input_size, output_size)

# Create DataLoader for batch processing
batch_size = 64
t_X_train = torch.tensor(X_train)
t_y_train = torch.tensor(y_train)
dataset = TensorDataset(t_X_train, t_y_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the model, loss function, and optimizer
model = NNinv(input_size, output_size)
loss_fn = nn.L1Loss()  # Mean Absolute Error (MAE)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of epochs to train
num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, targets) in enumerate(dataloader):
        # Forward pass
        outputs = model(inputs.float())
        loss = loss_fn(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    
    # Print the average loss for the epoch
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

print("Training complete.")

Epoch [1/5], Loss: 0.2185
Epoch [2/5], Loss: 0.1690
Epoch [3/5], Loss: 0.1216
Epoch [4/5], Loss: 0.0893
Epoch [5/5], Loss: 0.0688
Training complete.


### Model Testing

In [10]:
t_X_test = torch.tensor(X_test)
t_y_test = torch.tensor(y_test)
outputs_test = model(t_X_test.float())
loss_test = loss_fn(outputs_test, t_y_test)
print(loss_test/y_test.shape[0])

tensor(0.0002, dtype=torch.float64, grad_fn=<DivBackward0>)


## Visualizing Inverse Projection

Comparing original datapoints with the predictions

In [11]:
%matplotlib qt

# Create a figure and 3D axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection='3d')

# Define colors for each Gaussian distribution
# colors = ['r', 'g', 'b']  # Red, Green, Blue


output_fin = outputs_test.detach().numpy()
# Loop through each Gaussian to plot points with corresponding color
for i in range(n_gauss):
    ax.scatter(t_y_test[c_test == i, 0], t_y_test[c_test == i, 1], t_y_test[c_test == i, 2], color=colors[i], label=f'Actual_Gaussian {i+1}')
    # ax.scatter(output_fin[c_test == i, 0], output_fin[c_test == i, 1], output_fin[c_test == i, 2], color='orange', label=f'Predicted_Gaussian {i+1}')

ax.scatter(output_fin[:, 0], output_fin[:, 1], output_fin[:, 2], color='orange', label=f'Predicted_Gaussians')

# Set labels and title
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_zlabel('Z-axis')
ax.set_title('PCA \n Actual Vs Prediction')

# Add a legend
ax.legend()

# Show the plot
plt.show()

## Validating 2D projection

#### **generate_spread_points()** is a function to generate new points around the spread of the Gaussian clusters in 2D space.

In [12]:
def generate_spread_points(S, labels, num_new_points_per_cluster=5, spread_factor=0.5):
    """
    Generate new points around the spread of the Gaussian clusters in 2D space.
    
    Parameters:
    S (np.array): 2D points (original).
    labels (np.array): Labels for the original points, corresponding to Gaussian clusters.
    num_new_points_per_cluster (int): Number of new points to generate per Gaussian cluster.
    spread_factor (float): Spread factor controlling the variance of new points.
    
    Returns:
    new_points (np.array): Newly generated points spread around each cluster.
    new_labels (np.array): Labels corresponding to the new points.
    """
    new_points = []
    new_labels = []
    
    # Get the unique labels (each label corresponds to one Gaussian)
    unique_labels = np.unique(labels)

    for label in unique_labels:
        # Get the points that belong to the current Gaussian cluster
        cluster_points = S[labels == label]
        
        # Calculate covariance matrix for the current cluster
        cluster_cov = np.cov(cluster_points.T)

        for _ in range(num_new_points_per_cluster):
            # Randomly choose a point within the cluster
            random_point = cluster_points[np.random.randint(len(cluster_points))]
            
            # Generate a random offset using the covariance matrix to create a spread
            offset = np.random.multivariate_normal([0, 0], spread_factor * cluster_cov)

            # Add the offset to the selected random point to create a new point
            new_point = random_point + offset
            new_points.append(new_point)
            new_labels.append(label)  # Assign the same label as the original points
    
    return np.array(new_points), np.array(new_labels)

In [13]:
new_points, new_labels = generate_spread_points(p, c, num_new_points_per_cluster=20, spread_factor=0.3)

### Generate new points in 2D Space (Projection)

In [14]:
# pred_color = ['y', 'purple', 'k']
pred_color = ['cyan', 'magenta', 'yellow']
markers = ['$G1$', '$G2$', '$G3$']

In [15]:
plt.figure(figsize=(10, 8))
for i in range(n_gauss):
    plt.scatter(p[c == i, 0], p[c == i, 1], color=colors[i], label=f'Gaussian {i+1}')

    # Plot new points
    plt.scatter(new_points[new_labels == i, 0], new_points[new_labels == i, 1], color=pred_color[i],marker = markers[i] , s = 100, edgecolors='black', label= f'New Points_Gaussian {i+1}')

# plt.scatter(new_points[:, 0], new_points[:, 1], color='brown', label="New Points")

plt.legend()
plt.title("Original and Generated Points in 2D PCA Space")
plt.show()


### Apply trained model on new points

In [16]:
new_points_test = torch.tensor(new_points).float()
outputs_new_points = model(new_points_test)
outputs_new_points =outputs_new_points.detach().numpy()

### Visualize new points (2D) in original distribution (3D)

In [17]:
%matplotlib qt

# Create a figure and 3D axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection='3d')

# Loop through each Gaussian to plot points with corresponding color
for i in range(n_gauss):
    ax.scatter(D[c == i, 0], D[c == i, 1], D[c == i, 2], color=colors[i], alpha=0.7, label=f'Gaussian {i+1}')
    ax.scatter(outputs_new_points[new_labels == i, 0], outputs_new_points[new_labels == i, 1], outputs_new_points[new_labels == i, 2], color=pred_color[i], marker=markers[i],alpha=1.0, s=150, edgecolors='black', label=f'New_points_Gaussian {i+1}')

# ax.scatter(outputs_new_points[:, 0], outputs_new_points[:, 1], outputs_new_points[:, 2], color='k', label=f'Predicted_Gaussian {i+1}')

# Set labels and title
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_zlabel('Z-axis')
ax.set_title('New points (2D PCA) mapping into 3D Gaussian Distributions')

# Add a legend
ax.legend()

# Show the plot
plt.show()