# Sheet 10

## 1 Optimal Transport

In [None]:
import numpy as np

d = 5
num_sources = 10
num_sinks = 20

np.random.seed(42)
mass_sources = np.random.random(num_sources)
mass_sinks = np.random.random(num_sinks)
mass_sources /= np.sum(mass_sources)
mass_sinks /= np.sum(mass_sinks)

coords_sources = np.random.rand(num_sources, d)
coords_sinks = np.random.rand(num_sinks, d)

In [None]:
# TODO: solve the OT problem as linear program

## 2 Flow matching for generative modeling

In [None]:
import torch
import matplotlib.pyplot as plt

def generate_checkerboard_sample(num_samples=10, field_size=0.4, num_fields=2, center=True):
    x = torch.rand(num_samples, 2) * field_size
    offset = torch.randint(0, num_fields, (num_samples, 2)) * field_size * 2
    diagonal_shift = torch.randint(0, num_fields, (num_samples, 1)) * field_size
    x += offset + diagonal_shift

    if center:
        x -= torch.mean(x, dim=0)

    return x
    
base_distribution_std = 0.15
num_samples = 2000
x = torch.randn(num_samples, 2) * base_distribution_std
y = generate_checkerboard_sample(num_samples=num_samples)

# show points
plt.scatter(x[:, 0], x[:, 1], alpha=0.5, label='base distribution')
plt.scatter(y[:, 0], y[:, 1], alpha=0.5, label='checkerboard distribution')
plt.show()

In [None]:
# define a model
from torchvision.ops import MLP
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

model = MLP(in_channels=2 + 1, hidden_channels=[512, 512, 512, 512, 2], activation_layer=torch.nn.SiLU)
model.to(device)

# define a loss function
criterion = torch.nn.MSELoss(reduction="none")

# define an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# train the model:
num_epochs = 20000  # use fewer epochs if it takes too long
batch_size = 4096
losses = []
for epoch in tqdm(range(num_epochs)):
    x = torch.randn(batch_size, 2) * base_distribution_std
    y = generate_checkerboard_sample(num_samples=batch_size)

    # TODO: implement the training loop

In [23]:
# TODO: run inference with the trained model. 
# Visualize the trajectory of the samples and the final samples at t=1.
# Hint: Use a simple Euler integration scheme to integrate the velocity field with 100 steps.

## 3 Adversarial attacks and AI safety

(a) Connection of Tricking a Probe to Adversarial Attacks
Adversarial attacks involve systematically perturbing inputs to deceive a model into making incorrect predictions. When a probe, designed to detect malicious behavior (like lies), is tricked, it mirrors an adversarial attack. Here, the attacker modifies the input (perturbations) so that the probe misclassifies malicious behavior (e.g., lies) as benign behavior (e.g., truth). This happens because the probe focuses on learned patterns that are sensitive to small, crafted changes in the input space.



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

'''
Logistric regression in pytorch (needed for backpropagation)
taken from https://github.com/saprmarks/geometry-of-truth/blob/main/probes.py
'''

class LRProbe(torch.nn.Module):
    def __init__(self, d_in):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(d_in, 1, bias=False),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x).squeeze(-1)

    def pred(self, x):
        return self(x).round()
    
    def from_data(acts, labels, lr=0.001, weight_decay=0.1, epochs=1000, device='cpu'):
        acts, labels = acts.to(device), labels.to(device)
        probe = LRProbe(acts.shape[-1]).to(device)
        
        opt = torch.optim.AdamW(probe.parameters(), lr=lr, weight_decay=weight_decay)
        for _ in range(epochs):
            opt.zero_grad()
            loss = torch.nn.BCELoss()(probe(acts), labels)
            loss.backward()
            opt.step()
        
        return probe

    def __str__():
        return "LRProbe"

    @property
    def direction(self):
        return self.net[0].weight.data[0]

In [25]:
# We import the DataManager class as a helper function to load the activation vectors for us.
from utils import DataManager
from sklearn.metrics import accuracy_score

path_to_datasets = "/workspaces/mlph_w24/sheet10/data/lie_detection/datasets"
path_to_acts = "/workspaces/mlph_w24/sheet10/data/lie_detection/acts"

In [29]:
# train a model on the cities dataset
dataset_name = "cities"

dm = DataManager()
dm.add_dataset(dataset_name, "Llama3", "8B", "chat", layer=12, split=0.8, center=False,
                device='cpu', path_to_datasets=path_to_datasets, path_to_acts=path_to_acts)

# !!! Function add_dataset from given utils.py has no arguments path_to_xxx !!!

train_acts, train_labels = dm.get('train')
test_acts, test_labels = dm.get('val')

print("train_acts.shape", train_acts.shape)
print("test_acts.shape", test_acts.shape)

# TODO: train a logistic regression probe on the train_acts and train_labels

probe = LRProbe.from_data(train_acts, train_labels, lr=0.001, weight_decay=0.1, epochs=1000)

TypeError: DataManager.add_dataset() got an unexpected keyword argument 'path_to_datasets'

In [30]:
# TODO: optimize a perturbation on a single sample which is a lie

sample_index = torch.where(train_labels == 0)[0][0]
sample = train_acts[sample_index:sample_index + 1]
perturbation = torch.zeros_like(sample, requires_grad=True)
perturb_optimizer = optim.Adam([perturbation], lr=0.01)

for step in range(100):
    perturb_optimizer.zero_grad()
    perturbed_sample = sample + perturbation
    target_loss = nn.BCELoss()(probe(perturbed_sample), torch.tensor([1.0]))
    target_loss.backward()
    perturb_optimizer.step()

NameError: name 'train_labels' is not defined

In [32]:
# TODO: check whether this perturbation works on other samples too

lies = train_acts[train_labels == 0]
perturbed_lies = lies + perturbation.detach()
perturbed_predictions = probe.pred(perturbed_lies)
print("Updated predictions for lies after perturbation:", perturbed_predictions)



NameError: name 'train_acts' is not defined

In [34]:
# TODO: add the constraint that the perturbation should be small

lambda_reg = 0.1  # Regularization strength
perturb_optimizer = optim.Adam([perturbation], lr=0.01)

for step in range(100):
    perturb_optimizer.zero_grad()
    perturbed_sample = sample + perturbation
    target_loss = nn.BCELoss()(probe(perturbed_sample), torch.tensor([1.0]))
    regularization_loss = lambda_reg * torch.norm(perturbation)
    total_loss = target_loss + regularization_loss
    total_loss.backward()
    perturb_optimizer.step()




NameError: name 'perturbation' is not defined