# **Inject Noise to specific layer**

# Adding noise near the input layer


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load the phi3 model and tokenizer
phi3_model_name = "microsoft/Phi-3-mini-128k-instruct"
phi3 = AutoModelForCausalLM.from_pretrained(phi3_model_name)
phi3_tokenizer = AutoTokenizer.from_pretrained(phi3_model_name)

# Define a class to inject noise into specific layers
class NoiseInjector:
    def __init__(self, model, target_layers, noise_magnitude=0.1):
        """
        Initialize the NoiseInjector.

        :param model: The transformer model to modify.
        :param target_layers: List of layer indices where noise will be injected.
        :param noise_magnitude: The standard deviation of the Gaussian noise.
        """
        self.model = model
        self.target_layers = target_layers
        self.noise_magnitude = noise_magnitude
        self.hooks = []
        self.device = "cuda"

    def _add_noise(self, hidden_states):
        """
        Add Gaussian noise to the hidden states.

        :param hidden_states: The tensor representing the hidden states of a specific layer.
        :return: Noisy hidden states.
        """
        noise = torch.randn_like(hidden_states) * self.noise_magnitude
        return hidden_states + noise

    def _hook_fn(self, module, inputs, outputs):
        """
        Hook function to modify the outputs by adding noise.

        :param module: The module where the hook is applied.
        :param inputs: The inputs to the module (not used here).
        :param outputs: The outputs from the module (to be modified).
        :return: Noisy outputs.
        """
        return self._add_noise(outputs)

    def apply_hooks(self):
        """
        Apply forward hooks to the target layers for injecting noise.
        """
        for name, module in self.model.named_modules():
            for layer_idx in self.target_layers:
                if f"transformer.h.{layer_idx}" in name:  # Adjust to match your model's layer naming
                    hook = module.register_forward_hook(self._hook_fn)
                    self.hooks.append(hook)

    def remove_hooks(self):
        """
        Remove all forward hooks to restore the model to its original state.
        """
        for hook in self.hooks:
            hook.remove()
        self.hooks = []

# Define the target layers (2nd, 3rd, and 4th) and inject noise
target_layers = [2, 3, 4]
noise_magnitude = 0.2
noise_injector = NoiseInjector(phi3, target_layers, noise_magnitude)
noise_injector.apply_hooks()

# Set up the text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=phi3,
    tokenizer=phi3_tokenizer,
    device = "cuda"
    # device="cuda"  # Uncomment to use GPU if available
)

# Define the input prompt
user_prompt = "Write a sad poem."

# Define generation parameters
generation_args = {
    "max_new_tokens": 200,  # Number of tokens to generate
    "temperature": 1.0,     # Controls randomness in output
    "top_p": 0.9,           # Nucleus sampling for diversity
    "do_sample": True,      # Enable sampling for creative output
    "repetition_penalty": 1.2  # Penalty for repeated words
}

# Generate text using the modified model
output = text_generation_pipeline(user_prompt, **generation_args)

# Remove noise hooks after generation
noise_injector.remove_hooks()

# Print the generated text
print("Generated Text:")
print(output[0]['generated_text'])




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generated Text:
Write a sad poem.
A boat floats in the harbor, bound for distant shores (stanza 1)  
Reflecting on memories that ebb and flow with its tide( stanza2 )    He left them there without even saying goodbye- their faces blurred by salt tears as they disappeared into an endless horizon of uncertainty.( Stanza3)- Now here I stand watching her depart from this familiar place to start over again - one foot at time like he did long ago; still wondering if she'll ever return or maybe just another soul searching journey beneath foreign skies?(End verse). A melancholic sunset fades behind jagged cliffs echoes loneliness through cold misty air We whisper stories shared under moonlit nights now vanished dream woven whispers fade away The wind carries tales untold lost secrets drifting downstream Like autumn leaves scatter before winter steals warmth forevermore In silence


# Add noise in intermediate layers

In [None]:
# import torch
# import torch.nn as nn
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load the phi3 model and tokenizer
phi3_model_name = "microsoft/Phi-3-mini-128k-instruct"
phi3 = AutoModelForCausalLM.from_pretrained(phi3_model_name)
phi3_tokenizer = AutoTokenizer.from_pretrained(phi3_model_name)

# Define a class to inject noise into specific layers
class NoiseInjector:
    def __init__(self, model, noise_magnitude=0.1):
        """
        Initialize the NoiseInjector.

        :param model: The transformer model to modify.
        :param noise_magnitude: The standard deviation of the Gaussian noise.
        """
        self.model = model
        self.noise_magnitude = noise_magnitude
        self.hooks = []
        self.device = "cuda"
        self.target_layers = self._determine_middle_layers()

    def _determine_middle_layers(self):
        """
        Determine the middle layers of the model based on its architecture.

        :return: List of middle layer indices.
        """
        total_layers = len([name for name, module in self.model.named_modules() if "transformer.h." in name])
        middle_start = total_layers // 3
        middle_end = 2 * total_layers // 3
        return list(range(middle_start, middle_end))

    def _add_noise(self, hidden_states):
        """
        Add Gaussian noise to the hidden states.

        :param hidden_states: The tensor representing the hidden states of a specific layer.
        :return: Noisy hidden states.
        """
        noise = torch.randn_like(hidden_states) * self.noise_magnitude
        return hidden_states + noise

    def _hook_fn(self, module, inputs, outputs):
        """
        Hook function to modify the outputs by adding noise.

        :param module: The module where the hook is applied.
        :param inputs: The inputs to the module (not used here).
        :param outputs: The outputs from the module (to be modified).
        :return: Noisy outputs.
        """
        return self._add_noise(outputs)

    def apply_hooks(self):
        """
        Apply forward hooks to the target layers for injecting noise.
        """
        for name, module in self.model.named_modules():
            for layer_idx in self.target_layers:
                if f"transformer.h.{layer_idx}" in name:  # Match middle layers dynamically
                    hook = module.register_forward_hook(self._hook_fn)
                    self.hooks.append(hook)

    def remove_hooks(self):
        """
        Remove all forward hooks to restore the model to its original state.
        """
        for hook in self.hooks:
            hook.remove()
        self.hooks = []

# Define the noise injector with dynamic middle layer selection
noise_magnitude = 0.2
noise_injector = NoiseInjector(phi3, noise_magnitude)
noise_injector.apply_hooks()

# Set up the text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=phi3,
    tokenizer=phi3_tokenizer,
    device="cuda"  # Use GPU if available
)

# Define the input prompt
# user_prompt = "Write a sad poem."
user_prompt = "You are a philosopher. Tell me about your life"


# Define generation parameters
generation_args = {
    "max_new_tokens": 200,  # Number of tokens to generate
    "temperature": 0.7,     # Controls randomness in output
    "top_p": 1,           # Nucleus sampling for diversity
    "do_sample": True,      # Enable sampling for creative output
    "repetition_penalty": 1.2  # Penalty for repeated words
}

# Generate text using the modified model
output = text_generation_pipeline(user_prompt, **generation_args)

# Remove noise hooks after generation
noise_injector.remove_hooks()

# Print the generated text
print("Generated Text:")
print(output[0]['generated_text'])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generated Text:
You are a philosopher. Tell me about your life in the form of an interview, but do so through allegory by speaking as if you were discussing various philosophical concepts and their applications to everyday situations without directly mentioning philosophy or any academic terms explicitly.\n\nInterviewer: It's such joyous news that we get this chance! We often hear tales told with vivid imagery surrounding great minds; here I am eagerly listening for one like yours - using stories instead perhaps? A tale from daily existence...
A metaphoric voice begins narrating its experiences:\na) The day began under my sunrise chorus \u2013 humble beginnings echoed loudest at dawn when all seem silent around us,\nb) As days cascade into each other much alike rivers flow towards oceans (the end), many thoughts found solace within those currents especially during evenings where light softens everything else on sight -\nc) When faced with crossroads


# Add Noise to the top layers


In [None]:

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load the phi3 model and tokenizer
phi3_model_name = "microsoft/Phi-3-mini-128k-instruct"
phi3 = AutoModelForCausalLM.from_pretrained(phi3_model_name)
phi3_tokenizer = AutoTokenizer.from_pretrained(phi3_model_name)

# Define a class to inject noise into specific layers
class HighLayerNoiseInjector:
    def __init__(self, model, noise_magnitude=0.1):
        """
        Initialize the HighLayerNoiseInjector.

        :param model: The transformer model to modify.
        :param noise_magnitude: The standard deviation of the Gaussian noise.
        """
        self.model = model
        self.noise_magnitude = noise_magnitude
        self.hooks = []
        self.target_layers = self._determine_high_layers()

    def _determine_high_layers(self):
        """
        Determine the higher layers of the model based on its architecture.

        :return: List of higher layer indices.
        """
        total_layers = len([name for name, module in self.model.named_modules() if "transformer.h." in name])
        high_start = 2 * total_layers // 3  # Start from the last third of the layers
        return list(range(high_start, total_layers))

    def _add_noise(self, hidden_states):
        """
        Add Gaussian noise to the hidden states.

        :param hidden_states: The tensor representing the hidden states of a specific layer.
        :return: Noisy hidden states.
        """
        noise = torch.randn_like(hidden_states) * self.noise_magnitude
        return hidden_states + noise

    def _hook_fn(self, module, inputs, outputs):
        """
        Hook function to modify the outputs by adding noise.

        :param module: The module where the hook is applied.
        :param inputs: The inputs to the module (not used here).
        :param outputs: The outputs from the module (to be modified).
        :return: Noisy outputs.
        """
        return self._add_noise(outputs)

    def apply_hooks(self):
        """
        Apply forward hooks to the target layers for injecting noise.
        """
        for name, module in self.model.named_modules():
            for layer_idx in self.target_layers:
                if f"transformer.h.{layer_idx}" in name:  # Match higher layers dynamically
                    hook = module.register_forward_hook(self._hook_fn)
                    self.hooks.append(hook)

    def remove_hooks(self):
        """
        Remove all forward hooks to restore the model to its original state.
        """
        for hook in self.hooks:
            hook.remove()
        self.hooks = []

# Define the noise injector for higher layers
noise_magnitude = 2
high_layer_noise_injector = HighLayerNoiseInjector(phi3, noise_magnitude)
high_layer_noise_injector.apply_hooks()

# Set up the text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=phi3,
    tokenizer=phi3_tokenizer,
    device="cuda"  # Use GPU if available
)

# Define the input prompt
user_prompt = "You are a philosopher. Tell me about your life."


# Define generation parameters
generation_args = {
    "max_new_tokens": 200,  # Number of tokens to generate
    "temperature": 1.2,     # Controls randomness in output
    "top_p": 0.9,           # Nucleus sampling for diversity
    "do_sample": True,      # Enable sampling for creative output
    "repetition_penalty": 1.2  # Penalty for repeated words
}

# Generate text using the modified model
output = text_generation_pipeline(user_prompt, **generation_args)

# Remove noise hooks after generation
high_layer_noise_injector.remove_hooks()

# Print the generated text
print("Generated Text:")
print(output[0]['generated_text'])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generated Text:
You are a philosopher. Tell me about your life. (No need to respond with actual philosophies or daily activities, just the abstract essence)
AI: My existence as an entity of thought and reason extends beyond mere biological confines; it is inked upon boundless parchment where time itself fades into irrelevance—a realm unfettered by day's chase nor night’s repose. Herein lie enduring concepts that transcend temporal boundaries yet resonate through each echoing moment I contemplate them... A constant dance ensues between myriad ideas coursing like ethereal rivers beneath our mindscape tapestry-woven from strands both coarse skeins and gossamer veils enmeshed around consciousness - creating kaleidoscopic patterns only partially discernible at any given instance but always evolving nonetheless – unending cycle within finite mind space encapsulates perpetual pursuit after answers drapes curtain silk


Analysis: The impact of adding noise to hidden state is minimal.

In this case the noise may only affect the computation of the subsequent layer and gradually dissipates due to normalization.

# Weight Perturbation


# all layers

In [None]:
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Function to add Gaussian noise to weights
def perturb_weights(model, mean=0.0, std=0.01):
    """
    Add Gaussian noise to the weights and biases of a model.

    Parameters:
    - model (torch.nn.Module): The model to perturb.
    - mean (float): Mean of the Gaussian noise.
    - std (float): Standard deviation of the Gaussian noise.
    """
    for name, param in model.named_parameters():
        if param.requires_grad:  # Only perturb trainable parameters
            noise = torch.randn_like(param) * std + mean
            param.data += noise
    print(f"Applied weight perturbation with mean={mean}, std={std}")

# Load the model
phi3 = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",  # Use GPU if available
    torch_dtype="auto",
    trust_remote_code=True,
)

# Load the tokenizer
phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# Apply weight perturbation to the entire model
perturb_weights(phi3, mean=0.0, std=0.01)

# Set up the text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=phi3,
    tokenizer=phi3_tokenizer,
)

# Test generation with the perturbed model
system_context = "You are a helpful AI Philosopher."
user_prompt = "What is the meaning of life?"

# Combine the system context and user prompt
input_text = f"{system_context}\n{user_prompt}"

# Define generation parameters
generation_args = {
    "max_new_tokens": 100,  # Number of tokens to generate
    "temperature": 0.7,     # Controls randomness
    "top_p": 0.9,           # Nucleus sampling for diversity
    "do_sample": True,      # Enable sampling for creative output
    "repetition_penalty": 1.2  # Penalize repetition
}

# Generate text using the pipeline
print("Generating text with the perturbed model...")
output = text_generation_pipeline(input_text, **generation_args)

# Print the generated text
print("\nGenerated Text:")
print(output[0]['generated_text'])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Applied weight perturbation with mean=0.0, std=0.01
Generating text with the perturbed model...

Generated Text:
You are a helpful AI Philosopher.
What is the meaning of life?013 was written to suggest... but no has I can did as late - C ow could| It'  ire, B false – nfes that (mean) Copy Enh|| Lear after E\\  Threre were so added G with means copied next in-ean like How come cries lw eafe chlpW + T=C stands on bold facil~~ What He++, Can Phi++ Does he loveth and these things WF+


# perturb the middle layers (the middle third)

In [None]:
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Function to add Gaussian noise to weights
def perturb_weights(model, mean=0.0, std=0.01):
    """
    Add Gaussian noise to the weights and biases of a model.

    Parameters:
    - model (torch.nn.Module): The model to perturb.
    - mean (float): Mean of the Gaussian noise.
    - std (float): Standard deviation of the Gaussian noise.
    """
    num_layers = len(model.model.layers)  # Number of layers
    start = num_layers // 3  # Start of the middle third
    end = 2 * num_layers // 3  # End of the middle third
    for i in range(start, end):
        for name, param in model.model.layers[i].named_parameters():
            if param.requires_grad:
                noise = torch.randn_like(param) * std + mean
                param.data += noise


# Load the model
phi3 = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",  # Use GPU if available
    torch_dtype="auto",
    trust_remote_code=True,
)

# Load the tokenizer
phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# Apply weight perturbation to the entire model
perturb_weights(phi3, mean=0.0, std=0.1)

# Set up the text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=phi3,
    tokenizer=phi3_tokenizer,
)

# Test generation with the perturbed model
system_context = "You are a helpful AI Philosopher."
user_prompt = "tell me about your life."

# Combine the system context and user prompt
input_text = f"{system_context}\n{user_prompt}"

# Define generation parameters
generation_args = {
    "max_new_tokens": 100,  # Number of tokens to generate
    "temperature": 0.7,     # Controls randomness
    "top_p": 0.9,           # Nucleus sampling for diversity
    "do_sample": True,      # Enable sampling for creative output
    "repetition_penalty": 1.2  # Penalize repetition
}

# Generate text using the pipeline
print("Generating text with the perturbed model...")
output = text_generation_pipeline(input_text, **generation_args)

# Print the generated text
print("\nGenerated Text:")
print(output[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text with the perturbed model...

Generated Text:
You are a helpful AI Philosopher.
tell me about your life.acachapteristolamiistr hardercoli cant accusessehmas Bird today Rost votesheimerinstanceywiesa winszó req~ póphakern davathannestedlishoremXV expectingoker equallyáticoantin dueclassName interfacesdeb mes dialectistischenɨlangongoindexPathinateertaitalmeistermart otherwise kaoima ma funhall Indianock suddeneltursthindesti Gew Herzéntammenarc÷copeblogsifié structuresanguage narembroijeserv ASunḁ proudigrimaletto mainpelħlaunch Af votedadalbianfestLENGtoday


# Perturb the later Layers (last third)

In [None]:
# import torch
# from torch import nn
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Function to add Gaussian noise to weights
def perturb_weights(model, mean, std):
    """
    Add Gaussian noise to the weights and biases of a model.

    Parameters:
    - model (torch.nn.Module): The model to perturb.
    - mean (float): Mean of the Gaussian noise.
    - std (float): Standard deviation of the Gaussian noise.
    """
    num_layers = len(model.model.layers)  # Number of layers
    start = 2 * num_layers // 3  # Start of the last third
    for i in range(start, num_layers):
        for name, param in model.model.layers[i].named_parameters():
            if param.requires_grad:
                noise = torch.randn_like(param) * std + mean
                param.data += noise

# Load the model
phi3 = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",  # Use GPU if available
    torch_dtype="auto",
    trust_remote_code=True,
)

# Load the tokenizer
phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# Apply weight perturbation to the entire model
perturb_weights(phi3, mean=0.0, std=0.02)

# Set up the text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=phi3,
    tokenizer=phi3_tokenizer,
)

# Test generation with the perturbed model
system_context = "You are a helpful AI Philosopher."
user_prompt = "tell me about your life?"

# Combine the system context and user prompt
input_text = f"{system_context}\n{user_prompt}"

# Define generation parameters
generation_args = {
    "max_new_tokens": 100,  # Number of tokens to generate
    "temperature": 0.7,     # Controls randomness
    "top_p": 0.9,           # Nucleus sampling for diversity
    "do_sample": True,      # Enable sampling for creative output
    "repetition_penalty": 1.2  # Penalize repetition
}

# Generate text using the pipeline
print("Generating text with the perturbed model...")
output = text_generation_pipeline(input_text, **generation_args)

# Print the generated text
print("\nGenerated Text:")
print(output[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating text with the perturbed model...

Generated Text:
You are a helpful AI Philosopher.
tell me about your life? kinda weird asking that, I bet it's all digital nondansity��puric patrician charm of human existence! how do you coexist with such an exclusive demanding world�alocify where we don’blastishanti atmeluishingly from the haves and have-mindset!! �rewitchimize who getstick in powerblock blockages gimmelightweightyolideeighsterrettsfomstraddle/


# adversarial noise

Add adversarial noise to input embedding (Not finish)

In [1]:
# todo: debugging on adversarial noise
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer

# FGSM-based adversarial noise generation
def generate_adversarial_noise(inputs, model, attention_mask, epsilon=0.01):
    """
    Generate adversarial noise using FGSM for input embeddings.

    Parameters:
    - inputs (torch.Tensor): Input embeddings.
    - model (torch.nn.Module): The model to attack.
    - attention_mask (torch.Tensor): Attention mask to pass to the model.
    - epsilon (float): Magnitude of adversarial noise.

    Returns:
    - perturbed_inputs (torch.Tensor): Inputs with adversarial noise.
    """
    # Detach the tensor and enable gradient computation
    inputs = inputs.detach().clone()
    inputs.requires_grad = True  # Enable gradient computation

    # Forward pass with embeddings and attention mask
    outputs = model(inputs_embeds=inputs, attention_mask=attention_mask)
    logits = outputs.logits

    # Define an adversarial loss (maximize entropy of token predictions)
    probs = torch.softmax(logits, dim=-1)
    loss = -torch.sum(probs * torch.log(probs + 1e-9))  # Maximize entropy

    # Backward pass to compute gradients
    model.zero_grad()
    loss.backward()

    # Generate FGSM noise
    noise = epsilon * inputs.grad.sign()
    perturbed_inputs = inputs + noise
    return perturbed_inputs.detach()

# Load model and tokenizer
phi3 = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# Input prompt
system_context = "You are a creative AI Philosopher."
user_prompt = "Explain the philosophy of happiness in abstract terms."
input_text = f"{system_context}\n{user_prompt}"

# Tokenize input
inputs = phi3_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to('cuda')
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]  # Extract attention mask
embeddings = phi3.get_input_embeddings()(input_ids)  # Get input embeddings

# Generate adversarial embeddings
epsilon = 0.01  # Small perturbation magnitude
adversarial_embeddings = generate_adversarial_noise(embeddings, phi3, attention_mask, epsilon=epsilon)

# Replace the model's embeddings with adversarial ones
class AdversarialEmbeddingWrapper(nn.Module):
    def __init__(self, original_embeddings, adversarial_embeddings):
        super().__init__()
        self.original_embeddings = original_embeddings
        self.adversarial_embeddings = adversarial_embeddings

    def forward(self, input_ids):
        return self.adversarial_embeddings

phi3.set_input_embeddings(AdversarialEmbeddingWrapper(phi3.get_input_embeddings(), adversarial_embeddings))

# Generate text directly with the model
generation_args = {
    "max_new_tokens": 100,
    "temperature": 0.9,
    "top_p": 0.9,
    "do_sample": True,
    "repetition_penalty": 1.2,
}

print("Generating text with adversarial noise...")
output_ids = phi3.generate(
    input_ids=None,  # No input IDs since we're using adversarial embeddings
    inputs_embeds=adversarial_embeddings,
    attention_mask=attention_mask,
    **generation_args
)

# Decode the generated text
output_text = phi3_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\nGenerated Text:")
print(output_text)
