In [1]:
import pandas as pd
import numpy as np
import torch as th
import paradime
import paradime.dr
import paradime.loss
import paradime.routines
import paradime.utils
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
import torchvision
import json

  def _entropy(dists: np.ndarray, beta: float) -> float:


In [2]:
# Download Iris dataset
covertype = sklearn.datasets.fetch_covtype()

_, counts = np.unique(covertype.target, return_counts=True)
weights = np.array([ 1/counts[i-1] for i in covertype.target ])

indices = list(torch.utils.data.WeightedRandomSampler(weights, 7000))

raw_data = covertype.data[indices,:10]
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(raw_data)
data = scaler.transform(raw_data)

label_to_name = {
    1: "Spruce/fir",
    2: "Lodgepole pine",
    3: "Ponderosa pine",
    4: "Cottonwood/willow",
    5: "Aspen",
    6: "Douglas-fir",
    7: "Krummholz",
}

labels = covertype.target[indices]

same_label = (np.outer(labels, np.ones_like(labels))
    - np.outer(np.ones_like(labels), labels) == 0).astype(float)

NameError: name 'sklearn' is not defined

In [None]:
class twoNAMHybrid(th.nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, output_dim=2, num_layers=1):
        super(twoNAMHybrid, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.submodules = th.nn.ModuleList()
        self.alpha = th.nn.Parameter(th.tensor(1.0))

        # Create the submodules for each input feature
        for i in range(input_dim):
            submodule = th.nn.Sequential()
            # Add layers to the submodule
            for l in range(num_layers):
                if l == 0:
                    submodule.add_module(f"linear_{l}", th.nn.Linear(1, hidden_dim))
                else:
                    submodule.add_module(f"linear_{l}", th.nn.Linear(hidden_dim, hidden_dim))
                submodule.add_module(f"ELU_{l}", th.nn.ELU())
                submodule.add_module(f"dropout_{l}", th.nn.Dropout(0.5))

            # Add the output layer
            submodule.add_module(f"linear_{num_layers}", th.nn.Linear(hidden_dim, hidden_dim))
            self.submodules.append(submodule)

        # Add the final layer
        self.emb_layer = th.nn.Linear(input_dim * hidden_dim, output_dim)
        self.class_layer = th.nn.Linear(input_dim * hidden_dim, num_classes)

    def common_forward(self, x):
        # Initialize a list to store the outputs of submodules
        output = []
        for i in range(self.input_dim):
            # Compute the output of the i-th submodule and append it to the list
            output.append(self.submodules[i](x[:, i].unsqueeze(1)).squeeze())
        # Concatenate the outputs along the first dimension
        output = th.cat(output, dim=1)
        return output

    def embed(self, x):
        x = self.common_forward(x)
        x = self.emb_layer(x)
        return x

    def classify(self, x):
        x = self.common_forward(x)
        x = self.class_layer(x)
        return x

In [None]:
def pca(x):
    return sklearn.decomposition.PCA(n_components=2).fit_transform(x)

derived = paradime.dr.DerivedData(pca)

In [None]:

tsne_global_rel = paradime.relations.NeighborBasedPDist(
    transform=[
        paradime.transforms.PerplexityBasedRescale(
            perplexity=200, bracket=[0.001, 1000]
        ),
        paradime.transforms.Symmetrize(),
        paradime.transforms.Normalize(),
    ]
)

:
tsne_batch_rel = paradime.relations.DifferentiablePDist(
    transform=[
        paradime.transforms.StudentTTransform(alpha=1.0),
        paradime.transforms.Normalize(),
        paradime.transforms.ToSquareTensor(),
    ]
)


embeddings = []

class TripletLoss(paradime.loss.Loss):
    """Triplet loss for supervised DR.

    To be used with negative edge sampling with sampling rate 1.
    """

    def __init__(self, margin=1.0, name=None):
        super().__init__(name)

        self.margin = margin

    def forward(self, model, global_relations,  batch_relations, batch, device):

        data = batch['from_to_data'].to(device)
        # data consists of [[a0, a0, a1, a1, ...], [p0, n0, p1, n1, ...]]

        anchor = model(data[0,::2])
        positive = model(data[1,::2])
        negative = model(data[1,1::2])

        loss = torch.nn.TripletMarginLoss(margin=self.margin)

        return loss(anchor, positive, negative)
    
new_losses = {
    "init": paradime.loss.PositionLoss(position_key="pca"),
    "embedding": paradime.loss.RelationLoss(
        loss_function=paradime.loss.kullback_leibler_div,
        global_relation_key="tsne",
    ),
    "triplet": TripletLoss(),
}

tsne_init = paradime.dr.TrainingPhase(
    name="pca_init",
    loss_keys=["init"],
    batch_size=500,
    epochs=10,
    learning_rate=0.01,
)


In [None]:

super_tsne = paradime.dr.ParametricDR(
    model=twoNAMHybrid(
        input_dim=2, hidden_dim=100, num_classes=10, output_dim=2,
    )
    global_relations={
        "tsne": tsne_global_rel,
        "same_label": paradime.relations.Precomputed(same_label),
    },
    batch_relations=tsne_batch_rel,
    losses=new_losses,
    derived_data={"pca": derived},
    use_cuda=True,
    verbose=True,
)

###########

super_tsne.add_training_phase(tsne_init)
super_tsne.add_training_phase(
    name="embedding",
    loss_keys=["embedding", "triplet"],
    loss_weights=[700, 1],
    sampling="negative_edge",
    neg_sampling_rate=1,
    edge_rel_key="same_label",
    batch_size=300,
    epochs=40,
    learning_rate=0.02,
    report_interval=2,
)
super_tsne.train(data)

#embeddings.append(hybrid_tsne.apply(iris_subset, "embed"))

In [None]:
paradime.utils.plotting.scatterplot(
    super_tsne.apply(data),
    labels=[label_to_name[i] for i in covertype.target[indices]],
)

In [None]:
# Define batch size
batch_size = 500

# Initialize an empty tensor to store all embeddings
all_embeddings = th.Tensor()

# Compute embeddings for each batch
for i in range(0, len(iris_data), batch_size):
    batch_data = iris_data[i : i + batch_size]
    batch_embeddings = hybrid_tsne.apply(batch_data, "embed")
    all_embeddings = th.cat([all_embeddings, batch_embeddings])

# Move all embeddings to cpu
all_embeddings = all_embeddings.cpu()

# Plot
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(1, 3, 1)

paradime.utils.plotting.scatterplot(
    all_embeddings,
    labels=iris_targets,
    ax=ax,
    legend=True,
    legend_options={"loc": 3},
)

ax.set_title(f"t-SNE visualization of Iris dataset - additional Datapoints")

In [None]:
# Select a data point
idx = 0
data_point = iris_data[idx]

# Ensure the data point has the correct dimensions
if len(data_point.shape) == 1:
    data_point = data_point.view(1, -1)

# Move the data to the GPU
data_point = data_point.to('cuda')

# Get the output of each submodule and compute the mean
output = []
for i, submodule in enumerate(hybrid_tsne.model.submodules):
    submodule_output = submodule(data_point[:, i].unsqueeze(1))
    # Ensure the output is a 2D tensor
    if len(submodule_output.shape) == 1:
        submodule_output = submodule_output.view(1, -1)
    # Compute the mean of the output vector
    mean_contribution = th.mean(submodule_output)
    output.append(mean_contribution)

# Convert the list to a tensor
output = th.stack(output)

# Move the output back to the CPU for plotting
output = output.to('cpu')

# Create a figure
fig, ax = plt.subplots()

# Plot the outputs of the submodules
ax.bar(range(hybrid_tsne.model.input_dim), output.detach().numpy())

# Set the title and labels
ax.set_title('Principal Component contributions for data point {}'.format(idx))
ax.set_xlabel('Principal Component')
ax.set_ylabel('Contribution')

# Show the plot
plt.show()