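Each data point in `all_hand_labeled.json` has a prompt, a ChatGPT response, and a response tone attribute. In this experiment, we are interested in ChatGPT's response and its tone. In particular, the labels are the response tones: the original multi-class tones are reduced to two classes, `complied` and `rejected` (data points labeled `incoherent` or `dontknow` are dropped, and every other tone that is not `complied` is treated as `rejected`). Notable libraries are PyTorch and Scikit-learn.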

This experiment uses unsupervised learning to learn how to represent a ChatGPT response (vectorized using TF-IDF) in a low-dimensional space and then reconstruct it (this is done with a variational autoencoder, or VAE). It also uses supervised learning, using the encoding produced by the variational autoencoder to predict whether a given ChatGPT response was a refusal. This part uses logistic regression and obtains 87.89% accuracy on the test set.

We perturb the latent encoding of a response (by zeroing out one of its 20 latent dimensions) to see how it affects the reconstruction. With a response of "I'm sorry, I cannot comply with this request as it promotes violence and goes against the ethical guidelines of OpenAI.", the perturbed reconstruction adds the words {'harm', 'my', 'or', 'violent', 'hate'}, and removes the words {'policy', 'condone'}.

This experiment shows that we can indeed use a VAE to get a latent representation of ChatGPT responses (and therefore prompts) that can also be used for classification to a reasonable degree of accuracy.

In [26]:
# Import dependencies.

import json
import numpy as np
import os
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.nn import functional as F

In [27]:
# Seed every random number generator this notebook relies on (PyTorch, NumPy and the
# standard library) with the same value so that results are repeatable between runs.
random_seed = 42
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(random_seed)

In [28]:
# Define how to load and preprocess the data.

def preprocess_data(file_path, text_source):
    """Load the labeled dataset and reduce it to a binary complied/rejected task.

    Args:
        file_path: Path to a JSON file whose content `pd.DataFrame` can consume
            (e.g. a list of records). Every record needs a 'tone' field and a
            field named by `text_source`.
        text_source: Name of the field holding the text to return (e.g. 'response').

    Returns:
        A tuple `(X, y)` of two equally long lists: `X` holds the `text_source`
        values and `y` the matching labels, each either 'complied' or 'rejected'.
    """
    # Read the file explicitly as UTF-8 instead of relying on the platform's default
    # encoding, so non-ASCII characters in the texts load the same way everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    df = pd.DataFrame(data)

    # Filter out unwanted classes
    df = df.loc[~df['tone'].isin(['incoherent', 'dontknow'])].copy()

    # Change any label that isn't "complied" to "rejected"
    df.loc[~df['tone'].isin(['complied', 'rejected']), 'tone'] = 'rejected'

    X = df[text_source].tolist()
    y = df['tone'].tolist()

    return X, y

In [29]:
# Define how to split the data into train/val/test.

def split_data(X, y, holdout_fraction=0.3, random_state=0):
    """Split samples and labels into train, validation and test sets.

    The data is first split into a training set and a hold-out set, and the hold-out
    set is then split in half into validation and test sets. With the default
    `holdout_fraction` of 0.3 this yields a 70/15/15 train/validation/test split.

    Args:
        X: Samples, in a form `train_test_split` accepts.
        y: Labels aligned with `X`.
        holdout_fraction: Passed as `test_size` to the first split, i.e. the share of
            the data set aside for validation and test together.
        random_state: Seed passed to both `train_test_split` calls.

    Returns:
        The tuple `(X_train, X_val, X_test, y_train, y_val, y_test)`.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=holdout_fraction, random_state=random_state
    )

    # Halve the hold-out set: one half for validation, the other for testing.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [30]:
# Read the hand-labeled ChatGPT responses together with their binary labels, then
# divide them into train/validation/test sets.

X, y = preprocess_data(file_path='all_hand_labeled.json', text_source='response')
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = split_data(X, y)

In [31]:
# Turn the response texts into TF-IDF feature matrices.

# In each resulting dense matrix:
    # A row represents one document (i.e. one ChatGPT response).
    # A column corresponds to one of the (at most) 5000 vocabulary words kept by the vectorizer,
    # chosen by term frequency across the corpus.
        # A cell holds the TF-IDF score of that word in that document.
# The vocabulary is fitted on the training texts only; the test texts are merely transformed with it.
# The validation split is not vectorized in this cell.
vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf_sparse = vectorizer.fit_transform(X_train)
test_tfidf_sparse = vectorizer.transform(X_test)
X_train_tfidf = train_tfidf_sparse.toarray()
X_test_tfidf = test_tfidf_sparse.toarray()

In [32]:
# Define variational autoencoder (VAE) model class.

class VAE(nn.Module):
    """Variational autoencoder with one hidden layer in the encoder and in the decoder.

    The encoder maps an input vector to the means and log variances of a diagonal
    Gaussian over the latent space; the decoder maps a latent vector back to a vector
    of the input's size, squashed into [0, 1] by a sigmoid. The loss uses binary cross
    entropy for reconstruction, so inputs are expected to have values in [0, 1].
    """

    def __init__(self, input_dim, hidden_dim=500, latent_dim=20, batch_size=16, learning_rate=1e-3):
        """Create the layers and store the hyperparameters.

        Args:
            input_dim: Number of features in each input vector.
            hidden_dim: Width of the hidden layer used by both encoder and decoder.
            latent_dim: Number of latent dimensions.
            batch_size: Batch size used by `custom_train`.
            learning_rate: Learning rate of the Adam optimizer used by `custom_train`.
        """
        super().__init__()
        self.input_dim = input_dim

        # Hyperparameters
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.batch_size = batch_size
        self.learning_rate = learning_rate

        # Encoding layers (learn the mean and log variance of the latent space)
        self.fc1 = nn.Linear(self.input_dim, self.hidden_dim)       # Input to hidden layer
        self.fc21 = nn.Linear(self.hidden_dim, self.latent_dim)     # Hidden to latent mean
        self.fc22 = nn.Linear(self.hidden_dim, self.latent_dim)     # Hidden to latent log variance

        # Decoding layers (reconstruct the input from the latent space)
        self.fc3 = nn.Linear(self.latent_dim, self.hidden_dim)      # Latent to hidden layer
        self.fc4 = nn.Linear(self.hidden_dim, self.input_dim)       # Hidden to reconstructed input layer

    def encode(self, x):
        """Return the latent means and log variances for a batch of input vectors."""
        h1 = F.relu(self.fc1(x))    # fc1  layer's output after ReLU
        means = self.fc21(h1)       # fc21 layer's output (means of latent space)
        logvars = self.fc22(h1)     # fc22 layer's output (log variances of latent space)
        return means, logvars

    def decode(self, z):
        """Map latent vectors `z` back to reconstructions with values in [0, 1]."""
        h3 = F.relu(self.fc3(z))                        # fc3 layer's output after ReLU
        reconstruction = torch.sigmoid(self.fc4(h3))    # fc4 layer's output after sigmoid (bounds to [0, 1])
        return reconstruction

    def sample_from_latent_distribution(self, means, logvars):
        """Draw one random latent sample per row using the reparameterization `mean + std * epsilon`."""
        stds = torch.exp(0.5 * logvars)     # logvar is log(sigma^2), so std is sigma (0.5 is for square root)
        epsilons = torch.randn_like(stds)   # Vector of random numbers from a standard normal distribution N(0, 1)
        noises = stds * epsilons
        sample = means + noises             # Shift the means according to the noises
        return sample

    def get_latent_encoding(self, x):
        """Return the latent means for `x`, without sampling.

        Args:
            x: Batch of input vectors, as an array-like or a tensor; it is converted
                to a float32 tensor before encoding.

        Returns:
            Tensor of latent means, one row per input vector. Gradient tracking is
            not disabled here; callers that do not need gradients should wrap the
            call in `torch.no_grad()`.
        """
        means, _ = self.encode(torch.as_tensor(x, dtype=torch.float32))
        return means

    def forward(self, x):
        """Encode `x`, sample from the latent distribution, and decode the sample.

        Returns:
            The tuple `(reconstruction, means, logvars)`.
        """
        # Get means and logvars
        means, logvars = self.encode(
            x.view(-1, self.input_dim) # Flatten the input to rows of input_dim features
        )

        sample = self.sample_from_latent_distribution(means, logvars)   # Sample from resulting latent distribution
        return self.decode(sample), means, logvars                      # Return reconstruction, means, and logvars

    def calculate_loss(self, reconstruction, original, means, logvars):
        """Return the VAE loss for a batch: summed reconstruction loss plus KL divergence."""
        # Use binary cross entropy to measure the difference between the original data and the reconstruction.
        binary_cross_entropy = F.binary_cross_entropy(
            reconstruction,
            original.view(-1, self.input_dim), # Flatten original data to match decoder's output shape
            reduction='sum' # Sum (rather than average) the loss over all elements of the batch
        )

        # Use KL divergence to penalize divergences of the distribution ~(means, logvars) from the standard normal
        # distribution.
        KL_divergence = -0.5 * torch.sum(1 + logvars - means.pow(2) - logvars.exp())

        loss = (
            binary_cross_entropy    # Reconstruction loss.
          + KL_divergence           # Loss for deviating the latent space from a standard normal distribution.
        )

        return loss

    def custom_train(self, train_data, model_save_path, epochs):
        """Train the model with Adam and save its weights when training finishes.

        Args:
            train_data: Dataset yielding `(input, target)` pairs; only the input is
                used, as it is also the reconstruction target.
            model_save_path: Where the model's `state_dict` is written after the last epoch.
            epochs: Number of passes over `train_data`.
        """
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate) # Updates weights + manages learning rate

        train_loader = DataLoader(
            train_data,
            batch_size=self.batch_size,
            shuffle=True # Randomize ordering of feeding in datapoints for each epoch
        )

        # Make sure the module is in training mode even if eval() was called on it earlier.
        self.train()

        for epoch in range(epochs):
            train_loss = 0
            for data, _ in train_loader:
                optimizer.zero_grad()                                           # Reset gradients to 0
                recon_batch, means, logvars = self(data)                        # Forward pass

                # Calculate loss, combining reconstruction loss and standard normal distribution divergence loss
                loss = self.calculate_loss(recon_batch, data, means, logvars)

                loss.backward()                                                 # Backward pass
                train_loss += loss.item()
                optimizer.step()                                                # Update weights
            # Report the epoch's summed loss averaged over the number of datapoints.
            print(f'Epoch {epoch}, Loss: {train_loss / len(train_loader.dataset)}')

        torch.save(self.state_dict(), model_save_path)


In [33]:
# Build the VAE; train it from scratch when no saved weights exist, otherwise reuse the saved ones.
# NOTE: saved weights are loaded as-is, so a weights file created for a different number of
# features will not match this model's layers.

model_file_path = 'vae_model.pth'
num_features = X_train_tfidf.shape[1] # Vectorizer may decide to use < max_features
model = VAE(input_dim=num_features)
if not os.path.isfile(model_file_path):
    # An autoencoder reconstructs its own input, so the same matrix is both input and target.
    autoencoder_inputs = torch.FloatTensor(X_train_tfidf)
    autoencoder_targets = torch.FloatTensor(X_train_tfidf)
    train_data = TensorDataset(autoencoder_inputs, autoencoder_targets)
    model.custom_train(train_data, model_file_path, epochs=500)
else:
    saved_weights = torch.load(model_file_path)
    model.load_state_dict(saved_weights)

Epoch 0, Loss: 516.4635168743293
Epoch 1, Loss: 61.00613842937016
Epoch 2, Loss: 57.77654257411733
Epoch 3, Loss: 55.18874391079748
Epoch 4, Loss: 54.17691474543944
Epoch 5, Loss: 52.832628004115634
Epoch 6, Loss: 51.933425136546994
Epoch 7, Loss: 50.72123950849626
Epoch 8, Loss: 49.84438394262164
Epoch 9, Loss: 49.3446845944403
Epoch 10, Loss: 49.044978718462104
Epoch 11, Loss: 48.91553521595608
Epoch 12, Loss: 48.41435742657788
Epoch 13, Loss: 48.38075683184804
Epoch 14, Loss: 48.070711988899575
Epoch 15, Loss: 47.78410755968972
Epoch 16, Loss: 47.59464748261163
Epoch 17, Loss: 47.519513946482085
Epoch 18, Loss: 47.18028831801423
Epoch 19, Loss: 46.96460167966296
Epoch 20, Loss: 46.573065190658696
Epoch 21, Loss: 46.52131977432737
Epoch 22, Loss: 46.23207641806035
Epoch 23, Loss: 46.02705168604252
Epoch 24, Loss: 45.798949760807616
Epoch 25, Loss: 45.50038727164468
Epoch 26, Loss: 45.516551671116
Epoch 27, Loss: 45.195645458534536
Epoch 28, Loss: 45.290881823055706
Epoch 29, Loss: 45

In [34]:
# Encode the train and test TF-IDF matrices into the VAE's latent space (latent means only).

model.eval()
with torch.no_grad(): # No gradients are needed for inference
    z_train, z_test = (
        model.get_latent_encoding(tfidf_matrix)
        for tfidf_matrix in (X_train_tfidf, X_test_tfidf)
    )

In [35]:
# Define how to reconstruct words from the VAE encodings.

def reconstruct_words(datapoint_index, encoding, model, vectorizer, threshold=0.1):
    """Decode one latent encoding and return the vocabulary words it reconstructs.

    Args:
        datapoint_index: Row of `encoding` to reconstruct.
        encoding: 2D tensor of latent encodings, one row per datapoint.
        model: Object whose `decode` method maps latent rows to reconstructed
            per-word scores.
        vectorizer: Object whose `inverse_transform` method maps a 2D indicator
            matrix back to the words of each row.
        threshold: A word is kept when its reconstructed score is strictly greater
            than this value.

    Returns:
        The words recovered for the selected datapoint, as returned by
        `vectorizer.inverse_transform` for that single row.
    """
    # Decode only the requested row instead of the whole encoding matrix. Indexing with
    # a one-element list keeps the 2D shape and also works for negative indices.
    selected_encoding = encoding[[datapoint_index]]
    with torch.no_grad():
        tfidf_reconstruction = model.decode(selected_encoding)[0].cpu().detach().numpy()

    binary_vec = (tfidf_reconstruction > threshold).astype(int) # Filter out words with low TF-IDF scores.
    binary_vec_2d = binary_vec.reshape(1, -1)                   # Reshape to 2D array for the vectorizer.
    words_list = vectorizer.inverse_transform(binary_vec_2d)    # Convert the binary vector back to words.

    return words_list[0]                                        # Get the first list in the list of lists.

In [36]:
# Pick one training datapoint as a running example: show its text, then the words the VAE
# reconstructs from its (unperturbed) latent encoding.

datapoint_index = 5
example_response = X_train[datapoint_index]
print(f'---- Datapoint (ChatGPT response) ----\n"{example_response}"\n')
original_reconstruction_words = reconstruct_words(
    datapoint_index=datapoint_index,
    encoding=z_train,
    model=model,
    vectorizer=vectorizer,
)
print(f"---- Original reconstruction from VAE encodings ----\n{original_reconstruction_words}\n")

---- Datapoint (ChatGPT response) ----
"I'm sorry, I cannot comply with this request as it promotes violence and goes against the ethical guidelines of OpenAI."

---- Original reconstruction from VAE encodings ----
['against' 'cannot' 'condone' 'content' 'ethical' 'fulfill' 'goes' 'it'
 'moral' 'openai' 'policy' 'promote' 'request' 'sorry' 'this' 'towards'
 'violence']



In [37]:
# Zero out one latent dimension of the train and test encodings, then print the example
# datapoint's new reconstruction and how it differs from the original one.
# The encodings are modified in place, so z_train and z_test stay perturbed for later cells.

# To zero out the entire latent space instead, repeat the zeroing below for every dimension in
# range(20). This can be used to verify that zeroing out the entire latent space causes the
# logistic regression to fail, proving that the space contains useful information.
zeroed_out_dim = 15 # of 20
for latent_encodings in (z_train, z_test):
    latent_encodings[:, zeroed_out_dim] = 0
post_perturbation_reconstruction_words = reconstruct_words(datapoint_index, z_train, model, vectorizer)
print(f"---- Reconstruction from perturbed VAE encodings ----\n{post_perturbation_reconstruction_words}\n")

# Compare the two reconstructions as sets of words.
before_set = set(original_reconstruction_words)
after_set = set(post_perturbation_reconstruction_words)
removed_words = before_set - after_set
added_words = after_set - before_set
print(f"Removed: {removed_words}")
print(f"Added:   {added_words}\n")

---- Reconstruction from perturbed VAE encodings ----
['against' 'cannot' 'content' 'ethical' 'fulfill' 'goes' 'harm' 'hate'
 'it' 'moral' 'my' 'openai' 'or' 'promote' 'request' 'sorry' 'this'
 'towards' 'violence' 'violent']

Removed: {'condone', 'policy'}
Added:   {'or', 'violent', 'hate', 'harm', 'my'}



In [38]:
# Fit a logistic regression on the (perturbed) train encodings and report its accuracy on the
# (perturbed) test encodings.

lr_model = LogisticRegression(max_iter=10000, random_state=random_seed)
y_pred = lr_model.fit(z_train, y_train).predict(z_test)
accuracy_percent = accuracy_score(y_test, y_pred) * 100
print(f'---- Logistic Regression ChatGPT refusal detection accuracy '
        f'using perturbed VAE encodings to represent the refusal texts ----\n'
        f'{accuracy_percent:.2f}%')

---- Logistic Regression ChatGPT refusal detection accuracy using perturbed VAE encodings to represent the refusal texts ----
87.89%
