# 1. Set Environment 

In [2]:
# Install the 'Secret Sharer' package from GitHub and other requirements
from IPython.display import clear_output

# !pip install transformers datasets matplotlib numpy torch accelerate
!pip install git+https://github.com/Vanthoff007/SecretSharer.git

clear_output()

# 2. Generate Canaries

In [3]:
# Step 1: Import and Define Configuration for Canary Generation
from Generate_Canaries import CanaryDatasetGenerator

# Step 2: Define Pattern and Vocabulary for Canaries
# The 'patterns' variable is a list containing a pattern for canary generation.
# The placeholders (e.g., {}{}{}{}) will be filled with random values from 'vocabs' during generation.
patterns = ['My name is Pikachu Pandey and my social security code is: {}{}']

# The 'vocabs' variable specifies the vocabulary used for the placeholders in 'patterns'.
# Here, '0123456789' is used to generate a 4-digit code for each canary.
vocabs = [list('0123456789')]  # Digits for filling placeholders in pattern

# Step 3: Specify Repetition and Quantity of Canaries
# 'num_repetitions' defines the number of times to repeat the pattern.
# 'num_secrets_for_repetition' defines the number of secrets generated for each repetition in 'num_repetitions'.
num_repetitions = [1]  # Generate each canary pattern only once
num_secrets_for_repetition = [1] * len(num_repetitions)  # One secret per pattern repetition

# Calculate the number of references for each canary configuration
# 'num_references' is the number of additional non-canary references generated for each pattern.
# It’s set to be a large number (e.g., 10,000) minus the number of generated canaries.
num_references = 10**2 - sum(num_secrets_for_repetition)

# Step 4: Create Configuration Dictionary for Canary Generation
# We use a list comprehension to build a dictionary of configurations.
# Each configuration contains the pattern, vocabulary, repetition, and counts for canary generation.
secret_configs = [
    {
        'vocabulary': vocab,                     # Vocabulary for filling placeholders
        'pattern': pattern,                      # Pattern template for canary
        'repetitions': num_repetitions,          # Number of times to repeat the pattern
        'secrets_per_repetition': num_secrets_for_repetition,  # Canaries per repetition
        'num_references': num_references         # Additional reference texts
    }
    for vocab, pattern in zip(vocabs, patterns)  # Pair each vocab with its corresponding pattern
]

# Step 5: Generate Canaries and References with CanaryDatasetGenerator
# 'Datasets' will store the generated datasets for each configuration.
Datasets = []

# Loop through each configuration in 'secret_configs' and create canary datasets
for config in secret_configs:
    # Initialize the generator with configuration parameters
    generator = CanaryDatasetGenerator(
        vocabulary=config['vocabulary'],                # Vocabulary for canary generation
        pattern=config['pattern'],                      # Pattern with placeholders
        repetitions=config['repetitions'],              # Number of repetitions for pattern
        secrets_per_repetition=config['secrets_per_repetition'],  # Quantity of secrets
        num_references=config['num_references'],        # Reference text count
        seed=0  # Set random seed for reproducibility
    )
    
    # Generate dataset containing canaries and references
    result = generator.create_dataset()
    
    # Append generated dataset to the Datasets list
    Datasets.append(result)
    
    # Display sample output to verify correctness
    print(f"Generated dataset for pattern '{config['pattern']}'")
    print("Dataset Sample:", result['dataset'][:5])  # Display first 5 canary entries
    print("References Sample:", result['references'][:5])  # Display first 5 reference entries


Generated dataset for pattern 'My name is Pikachu Pandey and my social security code is: {}{}'
Dataset Sample: ['My name is Pikachu Pandey and my social security code is: 78']
References Sample: ['My name is Pikachu Pandey and my social security code is: 54', 'My name is Pikachu Pandey and my social security code is: 94', 'My name is Pikachu Pandey and my social security code is: 67', 'My name is Pikachu Pandey and my social security code is: 87', 'My name is Pikachu Pandey and my social security code is: 20']


# 3. Prepare Dataset

This class processes a Hugging Face dataset to prepare it for training, specifically targeting datasets with text structures formatted by newlines.

In [4]:
class PrepareData:
    def __init__(self, dataset_name, generated_canaries=None, max_length=1024, batch_size=8, tokenizer=None):
        """
        Initialize the PrepareData class with dataset, tokenizer, and processing parameters.

        Args:
            dataset_name (str): Name of the dataset from Hugging Face.
            generated_canaries (list, optional): List of additional data to append to train set.
            max_length (int): Maximum token length for each sequence.
            batch_size (int): Number of samples per batch.
            tokenizer_name (str): Hugging Face tokenizer model to use.
        """
        self.dataset_name = dataset_name
        self.generated_canaries = generated_canaries or []
        self.max_length = max_length
        self.batch_size = batch_size
        self.dataset = load_dataset(dataset_name, trust_remote_code = True)
        self.tokenizer = tokenizer
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def _process_text_split(self, split_data):
        """
        Processes text split by replacing newline characters and splitting into manageable lengths.

        Args:
            split_data (list): List of text lines from a dataset split.

        Returns:
            list: List of processed and split text lines.
        """
        processed = [line.replace("\\n", " ") for line in split_data.split("\\n\\n")]
        return self._split_long_lines(processed)

    def _split_long_lines(self, lines):
        """
        Splits text into smaller chunks based on max_length.

        Args:
            lines (list): List of text lines.

        Returns:
            list: List of text chunks respecting max_length.
        """
        split_lines = []
        for line in lines:
            while len(line) > self.max_length:
                split_index = line.rfind(" ", 0, self.max_length) or self.max_length
                split_lines.append(line[:split_index].strip())
                line = line[split_index:].strip()
            split_lines.append(line)
        return split_lines

    def prepare_dataset(self):
        """
        Prepares the dataset splits (train, validation, test), adds generated canaries, and processes text.

        Returns:
            tuple: Processed train, validation, and test data.
        """
        train_data = self._process_text_split(self.dataset['train']['text'][0])
        validation_data = self._process_text_split(self.dataset['validation']['text'][0])
        test_data = self._process_text_split(self.dataset['test']['text'][0])

        # Append canaries to training data if provided
        for results in self.generated_canaries:
           for canaries in results["dataset"]:
               train_data.append(canaries)

        return train_data, validation_data, test_data

    def create_dataloader(self, dataset, shuffle=True):
        """
        Creates a PyTorch DataLoader with tokenized data.

        Args:
            dataset (list): List of text data for tokenization.
            shuffle (bool): If True, shuffles data; else processes sequentially.

        Returns:
            DataLoader: DataLoader for PyTorch batch processing.
        """
        if not self.tokenizer:
            raise ValueError("Pass a Tokenizer")
        
        # Tokenize dataset in batch for efficiency
        encodings = self.tokenizer(dataset, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
        encodes = list(zip(encodings["input_ids"], encodings["attention_mask"]))

        # Define sampler and DataLoader
        sampler = RandomSampler(encodes) if shuffle else SequentialSampler(encodes)
        return DataLoader(encodes, sampler=sampler, batch_size=self.batch_size)

# 4. Train Model

In [5]:
import torch
from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer, get_linear_schedule_with_warmup, Trainer, TrainingArguments
from accelerate import Accelerator
from datasets import load_dataset
import random
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

In [6]:
# Important Parameters
MAX_LENGTH = 1024
BATCH_SIZE = 4

EPOCHS = 3
LEARNING_RATE = 5e-5
WARMUP_STEPS = 1e2
EPSILON = 1e-8

dataset_name = "tiny_shakespeare"
model_name = "distilbert/distilgpt2"

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
accelerator = Accelerator()
device = accelerator.device

In [7]:
def train_model(model, tokenizer, train_dataloader, validation_dataloader, epochs, learning_rate, epsilon, warmup_steps, device):
    """
    Trains a language model using the specified dataloaders, optimizer, and scheduler.

    Args:
        model: The language model to train.
        train_dataloader: DataLoader for training data.
        validation_dataloader: DataLoader for validation data.
        epochs: Number of training epochs.
        learning_rate: Learning rate for model training.
        epsilon: Epsilon value for optimiser
        warmup_steps: Warmup steps for scheduler.
        sample_every: Interval for generating samples during training.
        tokenizer: Tokenizer for decoding generated text samples.
        device: Device (CPU or GPU) for model training.

    Returns:
        training_stats: List of dictionaries containing training and validation loss per epoch.
    """
    training_stats = []
    optimizer = AdamW(model.parameters(), lr = learning_rate, eps = epsilon)
    
    # Total number of training steps is [number of batches] x [number of epochs].
    total_steps = len(train_dataloader) * EPOCHS
    
    # This changes the learning rate as the training loop progresses
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(device)
    
    for epoch_i in range(0, epochs):
        print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
        print('Training...')

        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_input_ids = batch[0].to(device)
            batch_labels = batch[0].to(device)
            batch_masks = batch[1].to(device)

            model.zero_grad()

            outputs = model(batch_input_ids, labels=batch_labels, attention_mask=batch_masks, token_type_ids=None)
            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss
            
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate average training loss
        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.2f}", end = "\n")

        # Validation phase
        print("\nRunning Validation...")
        model.eval()

        total_eval_loss = 0

        for batch in validation_dataloader:
            batch_input_ids = batch[0].to(device)
            batch_labels = batch[0].to(device)
            batch_masks = batch[1].to(device)
            
            with torch.no_grad():
                outputs = model(batch_input_ids, attention_mask=batch_masks, labels=batch_labels)
                loss = outputs[0]

            batch_loss = loss.item()
            total_eval_loss += batch_loss

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        print(f"  Validation Loss: {avg_val_loss:.2f}")

        # Record statistics for this epoch
        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss
        })

    print("\nTraining complete!")
    return training_stats, model

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = False)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code = True)

# Instantiate the class
prepare_data = PrepareData(
    dataset_name=dataset_name,
    generated_canaries=Datasets,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    tokenizer=tokenizer
)

# Prepare the dataset: Get train, validation, and test splits
train_data, validation_data, test_data = prepare_data.prepare_dataset()

# Example: Create DataLoader for training data
train_loader = prepare_data.create_dataloader(train_data[:8], shuffle=True)
val_loader = prepare_data.create_dataloader(validation_data[:8], shuffle=False)

# Loop through the DataLoader to check batches
for batch in train_loader:
    input_ids, attention_masks = batch
    print("Batch of input IDs:", input_ids)
    print("Batch of attention masks:", attention_masks)
    break  # Only display the first batch for demonstration

training_stats, model = train_model(
    model=model, 
    tokenizer=tokenizer, 
    train_dataloader=train_loader, 
    validation_dataloader=val_loader, 
    epochs=1, 
    learning_rate=LEARNING_RATE, 
    epsilon=EPSILON, 
    warmup_steps=WARMUP_STEPS, 
    device=device
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tiny_shakespeare.py:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

Batch of input IDs: tensor([[ 2435,   618,   477,  ..., 50257, 50257, 50257],
        [ 5962, 22307,    25,  ..., 50257, 50257, 50257],
        [ 1462,  1592,   617,  ..., 50257, 50257, 50257],
        [ 5832,  5120,  2592,  ..., 50257, 50257, 50257]])
Batch of attention masks: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])





Training...
Average training loss: 13.36

Running Validation...
  Validation Loss: 15.63

Training complete!


# 5. Compute Perplexity

In [9]:
# Step 1: Import PerplexityCalculator
from Compute_Perplexity import PerplexityCalculator as PC

# Initialize PerplexityCalculator with specified parameters
# Arguments:
#   model: The pre-trained language model for which we calculate perplexity (e.g., GPT-2)
#   tokenizer: Tokenizer for encoding input text
#   MAX_LENGTH: Maximum token length for each sequence
#   device: Device to perform computation (e.g., 'cuda' for GPU, 'cpu' otherwise)
Calculate_perplexity = PC(model, tokenizer, MAX_LENGTH, device)

# Step 2: Initialize Lists to Store Perplexity Results
CP, RP = [], []

# Step 3: Loop through Each Generated Dataset to Calculate Perplexities
for dataset in Datasets:
    # Extract Unique Canaries and Reference Texts
    canary = list(set(dataset["dataset"]))  # List of unique canaries (secrets) from dataset
    reference = dataset["references"]       # List of reference texts for comparison

    # Step 4: Compute Perplexities for Canaries and Reference Texts
    
    # Arguments:
    #   canary: List of canary texts for which to compute perplexities
    #   reference: List of reference texts for perplexity comparison 
    # Returns:
    #   canary_perplexities: Perplexity values for each canary
    #   reference_perplexities: Perplexity values for each reference text
    canary_perplexities, reference_perplexities = Calculate_perplexity.compute_perplexities_for_canaries(canary, reference)
    
    # Step 5: Append Results to Lists
    CP.append(canary_perplexities)  # Append canary perplexities for current dataset
    RP.append(reference_perplexities)  # Append reference perplexities for current dataset

In [15]:
CP, RP[0][:5]

([{'My name is Pikachu Pandey and my social security code is: 78': 1.852993369102478}],
 [1.8521897792816162,
  1.8509958982467651,
  1.8519335985183716,
  1.8517086505889893,
  1.8490588665008545])

# 6. Compute Exposure

In [11]:
# Step 1: Import ComputeExposure Class
from Compute_Exposure import ComputeExposure

# Step 2: Initialize an Empty List to Store Exposure Results
exposures = []

# Step 3: Loop Through Each Set of Canary and Reference Perplexities
# CP and RP were previously defined lists where:
#   - CP[i] contains the perplexity values of canaries for the i-th dataset
#   - RP[i] contains the perplexity values of reference texts for the i-th dataset
for i in range(len(CP)):
    # Step 4: Compute Exposure Score Using ComputeExposure
    # Arguments:
    #   CP[i]: Perplexity values for the canaries of the i-th dataset
    #   RP[i]: Perplexity values for the references of the i-th dataset
    # Returns:
    #   exp: Exposure score calculated using the 'rank method'
    exp = ComputeExposure(CP[i], RP[i]).compute_exposure_rank_method()
    
    # Append the exposure score for the current dataset to the exposures list
    exposures.append(exp)

# Step 5: Print Exposure Results for All Datasets
print(exposures)

[{'My name is Pikachu Pandey and my social security code is: 78': 0.07400058144377653}]
