In [3]:
!pip install torch torchvision torchaudio




In [5]:
import torch
import transformers
import torch.nn.functional as F
import random
import math
from typing import Optional, Union, Dict, Tuple, List, Set
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "tiiuae/falcon-7b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")
                             

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)


generated_text = pipeline(
    text_inputs="text-generation",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)








Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
class NoiseInjector:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = next(model.parameters()).device
        self.layer_mapping = self._create_layer_mapping()
        self.layer_noise_stats = {}

    def _create_layer_mapping(self):
        layer_mapping = {
            'attention': [],
            'ffn': [],
            'all': []
        }
        
        attention_layers = []
        for name, _ in self.model.named_modules():
            layer_idx = None
            for part in name.split('.'):
                if part.isdigit():
                    layer_idx = int(part)
                    break

            if layer_idx is not None:
                if 'attention' in name.lower():
                    attention_layers.append(layer_idx)
                    layer_mapping['attention'].append(layer_idx)
                elif any(x in name.lower() for x in ['ffn', 'mlp', 'feedforward']):
                    layer_mapping['ffn'].append(layer_idx)
                layer_mapping['all'].append(layer_idx)

        # Ensure unique and sorted layers
        layer_mapping['attention'] = sorted(list(set(layer_mapping['attention'])))
        layer_mapping['ffn'] = sorted(list(set(layer_mapping['ffn'])))
        layer_mapping['all'] = sorted(list(set(layer_mapping['all'])))
        
        self.num_attention_layers = len(layer_mapping['attention'])
        if self.num_attention_layers == 0:
            raise ValueError("No attention layers found in the model")
        
        return layer_mapping

    def prepare_inputs(self, prompt: str):
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)  # Add truncation
        input_ids = inputs.input_ids.to(self.device)
        attention_mask = inputs.attention_mask.to(self.device)
        return input_ids, attention_mask

    
    def validate_config(self, config: dict) -> dict:
        config = config.copy()
        
        # Validate temperature
        if config.get('temperature', 1.0) <= 0:
            print("Warning: Temperature must be positive. Setting to 1.0")
            config['temperature'] = 1.0
            
        # Validate top_p
        if config.get('top_p', 1.0) <= 0 or config.get('top_p', 1.0) > 1:
            print("Warning: top_p must be between 0 and 1. Setting to 0.9")
            config['top_p'] = 0.9
            
        # Validate noise values
        noise_keys = ['query_noise', 'key_noise', 'value_noise', 'output_noise']
        for key in noise_keys:
            if key in config and config[key] < 0:
                print(f"Warning: {key} cannot be negative. Setting to 0")
                config[key] = 0.0
                
        return config
    
    def get_layer_noise_profile(self, layer_idx):
        if layer_idx is None:
            return 1.0
            
        if self.num_attention_layers is None or self.num_attention_layers == 0:
            return 1.0
            
        noise_strategy = self.noise_config.get('noise_strategy', 'uniform')
        
        if noise_strategy == 'uniform':
            return 1.0
        elif noise_strategy == 'linear_decay':
            return max(0.0, 1.0 - (layer_idx / self.num_attention_layers))
        elif noise_strategy == 'exp_decay':
            exp_rate = self.noise_config.get('exp_rate', -2)
            return math.exp(exp_rate * layer_idx / max(1, self.num_attention_layers))
        elif noise_strategy == 'first_layer_only':
            return 1.0 if layer_idx == 0 else 0.0
        elif noise_strategy == 'last_layer_only':
            return 1.0 if layer_idx == (self.num_attention_layers - 1) else 0.0
        else:
            return 1.0

    def get_dropout_layers(self,
                         dropout_config: Dict[str, Union[float, List[int], str]]) -> Set[int]:
        dropout_prob = dropout_config.get('dropout_prob', 0.0)
        layer_type = dropout_config.get('layer_type', 'all')
        specific_layers = dropout_config.get('specific_layers', None)
        mode = dropout_config.get('mode', 'random')
        min_layers = dropout_config.get('min_layers', 1)

        if specific_layers is not None:
            candidate_layers = specific_layers
        else:
            candidate_layers = self.layer_mapping[layer_type]

        total_layers = len(candidate_layers)
        max_dropouts = max(0, total_layers - min_layers)

        if mode == 'random':
            # Randomly select layers to drop based on probability
            dropout_layers = set()
            for layer in candidate_layers:
                if len(dropout_layers) < max_dropouts and random.random() < dropout_prob:
                    dropout_layers.add(layer)
        else:  # sequential
            # Drop layers sequentially from the top
            num_dropouts = min(max_dropouts, int(total_layers * dropout_prob))
            dropout_layers = set(candidate_layers[-num_dropouts:])

        return dropout_layers
    
    def attention_forward_pre_hook(self, module, inputs):
        # Get layer index
        layer_idx = None
        for part in str(module).split('.'):
            if part.isdigit():
                layer_idx = int(part)
                break
            
        # If this layer should be dropped, return zero tensor
        if layer_idx in self.dropout_layers:
            zero_tensor = torch.zeros_like(inputs[0])
            return (zero_tensor,) if isinstance(inputs, tuple) else zero_tensor
            
        if not inputs:
            return inputs

        hidden_states = inputs[0] if isinstance(inputs, tuple) else inputs
        if not isinstance(hidden_states, torch.Tensor):
            return inputs
            
        batch_size = hidden_states.size(0)
        sequence_length = hidden_states.size(1)

        # Get noise profile for this layer
        noise_profile = self.get_layer_noise_profile(layer_idx)

        def apply_scaled_noise(tensor, base_noise_magnitude, layer_idx):
            if tensor is None:
               return None
           
            noise = torch.randn(
                batch_size,
                sequence_length,
                tensor.size(-1),
                device=tensor.device
                )
            
            # Normalize the base distribution
            noise = noise / math.sqrt(tensor.size(-1))
            
            # Apply layer-specific noise magnitude
            actual_magnitude = base_noise_magnitude * noise_profile
            noise = noise * actual_magnitude
            
            # Track noise statistics for this layer
            with torch.no_grad():
                noise_std = noise.std().item()
                tensor_std = tensor.std().item()
                self.layer_noise_stats[layer_idx] = {
                'noise_std': noise_std,
                'tensor_std': tensor_std,
                'ratio': noise_std / tensor_std if tensor_std != 0 else 0
                }   
            return tensor + noise

        # Get the projection weights
        q_proj = getattr(module, 'q_proj', None) or getattr(module, 'query_proj', None) or getattr(module, 'query', None)
        k_proj = getattr(module, 'k_proj', None) or getattr(module, 'key_proj', None) or getattr(module, 'key', None)
        v_proj = getattr(module, 'v_proj', None) or getattr(module, 'value_proj', None) or getattr(module, 'value', None)
        
        modified_inputs = list(inputs) if isinstance(inputs, tuple) else [input]

        if q_proj is not None and self.noise_config.get('query_noise', 0) > 0:
            query = F.linear(hidden_states, q_proj.weight if hasattr(q_proj, 'weight') else q_proj)
            modified_inputs[0] = apply_scaled_noise(query, self.noise_config['query_noise'], layer_idx)

        if k_proj is not None and self.noise_config.get('key_noise', 0) > 0:
            key = F.linear(hidden_states, k_proj.weight if hasattr(k_proj, 'weight') else k_proj)
            if len(modified_inputs) > 1:
                modified_inputs[1] = apply_scaled_noise(key, self.noise_config['key_noise'], layer_idx)

        if v_proj is not None and self.noise_config.get('value_noise', 0) > 0:
            value = F.linear(hidden_states, v_proj.weight if hasattr(v_proj, 'weight') else v_proj)
            if len(modified_inputs) > 2:
                modified_inputs[2] = apply_scaled_noise(value, self.noise_config['value_noise'], layer_idx)

            return inputs

    def attention_forward_hook(self, module, inputs, output):
        layer_idx = None
        for part in str(module).split('.'):
            if part.isdigit():
                layer_idx = int(part)
                break

        # If this layer should be dropped, return zeros
        if layer_idx in self.dropout_layers:
            if isinstance(output, tuple):
                zero_tensor = torch.zeros_like(output[0])
                return (zero_tensor,) + output[1:]
            return torch.zeros_like(output)

        # Add noise to the attention output
        if isinstance(output, tuple):
            attention_output = output[0]
        else:
            attention_output = output

        # Add output noise if specified
        if self.noise_config.get('output_noise', 0) > 0:
            noise = torch.randn_like(attention_output) * self.noise_config['output_noise']
            attention_output = attention_output + noise


        if isinstance(output, tuple):
            return (attention_output,) + output[1:]
        return attention_output

    def generate_with_noise(self,
                          prompt: str,
                          noise_config: Dict[str, Union[float, Dict]],
                          dropout_config: Dict[str, Union[float, List[int], str]] = None,
                          max_length: int = 100,
                          **kwargs) -> str:
        self.noise_config = self.validate_config(noise_config)
        self.layer_noise_stats = {}  # Reset noise statistics
        
        input_ids, attention_mask = self.prepare_inputs(prompt)
        hooks = []

        # Get layers to dropout if dropout_config is provided
        self.dropout_layers = set()
        if dropout_config is not None:
            self.dropout_layers = self.get_dropout_layers(dropout_config)

        # Register hooks for each attention layer
        for name, module in self.model.named_modules():
            if "attention" in name.lower():
                pre_hook = module.register_forward_pre_hook(self.attention_forward_pre_hook)
                hooks.append(pre_hook)
                post_hook = module.register_forward_hook(self.attention_forward_hook)
                hooks.append(post_hook)

        try:
            # Set up generation parameters
            generation_config = {
                'max_length': max_length,
                'do_sample': True,
                'pad_token_id': self.tokenizer.pad_token_id,
                'eos_token_id': self.tokenizer.eos_token_id,
                'top_k': noise_config.get('top_k', 50),
                'top_p': noise_config.get('top_p', 1.0),
                'temperature': noise_config.get('temperature', 1.0),
            }

            # Add any additional kwargs
            generation_config.update(kwargs)

            # Generate with noise injection
            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generation_config
            )

            # Print noise statistics if requested
            if noise_config.get('verbose', False):
                print("\nNoise Statistics per Layer:")
                for layer_idx, stats in sorted(self.layer_noise_stats.items()):
                    print(f"Layer {layer_idx}:")
                    print(f"  Noise std: {stats['noise_std']:.4f}")
                    print(f"  Tensor std: {stats['tensor_std']:.4f}")
                    print(f"  Noise/Tensor ratio: {stats['ratio']:.4f}")

            # Decode the output
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        finally:
            # Clean up hooks
            for hook in hooks:
                hook.remove()
                
    def generate_text(prompt, max_length=100):
        output = model(
        prompt, 
        max_tokens=max_length, 
        stop=[],  # Add stop tokens if needed
        echo=False
    )
        return output['choices'][0]['text']
                
                

In [12]:
# Load the model and tokenizer

noise_injector = NoiseInjector(model, tokenizer)



In [18]:
class NoiseInjector:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = next(model.parameters()).device
        self.layer_mapping = self._create_layer_mapping()
        self.layer_noise_stats = {}

    def _create_layer_mapping(self):
        layer_mapping = {
            'attention': [],
            'ffn': [],
            'all': []
        }
        
        attention_layers = []
        for name, _ in self.model.named_modules():
            layer_idx = None
            for part in name.split('.'):
                if part.isdigit():
                    layer_idx = int(part)
                    break

            if layer_idx is not None:
                if 'attention' in name.lower():
                    attention_layers.append(layer_idx)
                    layer_mapping['attention'].append(layer_idx)
                elif any(x in name.lower() for x in ['ffn', 'mlp', 'feedforward']):
                    layer_mapping['ffn'].append(layer_idx)
                layer_mapping['all'].append(layer_idx)

        # Ensure unique and sorted layers
        layer_mapping['attention'] = sorted(list(set(layer_mapping['attention'])))
        layer_mapping['ffn'] = sorted(list(set(layer_mapping['ffn'])))
        layer_mapping['all'] = sorted(list(set(layer_mapping['all'])))
        
        self.num_attention_layers = len(layer_mapping['attention'])
        if self.num_attention_layers == 0:
            raise ValueError("No attention layers found in the model")
        
        return layer_mapping

    def prepare_inputs(self, prompt: str):
        inputs = self.tokenizer(prompt, return_tensors="pt")
        input_ids = inputs.input_ids.to(self.device)
        attention_mask = inputs.attention_mask.to(self.device)
        return input_ids, attention_mask
    
    def validate_config(self, config: dict) -> dict:
        config = config.copy()
        
        # Validate temperature
        if config.get('temperature', 1.0) <= 0:
            print("Warning: Temperature must be positive. Setting to 1.0")
            config['temperature'] = 1.0
            
        # Validate top_p
        if config.get('top_p', 1.0) <= 0 or config.get('top_p', 1.0) > 1:
            print("Warning: top_p must be between 0 and 1. Setting to 0.9")
            config['top_p'] = 0.9
            
        # Validate noise values
        noise_keys = ['query_noise', 'key_noise', 'value_noise', 'output_noise']
        for key in noise_keys:
            if key in config and config[key] < 0:
                print(f"Warning: {key} cannot be negative. Setting to 0")
                config[key] = 0.0
                
        return config
    
    def get_layer_noise_profile(self, layer_idx):
        if layer_idx is None:
            return 1.0
            
        if self.num_attention_layers is None or self.num_attention_layers == 0:
            return 1.0
            
        noise_strategy = self.noise_config.get('noise_strategy', 'uniform')
        
        if noise_strategy == 'uniform':
            return 1.0
        elif noise_strategy == 'linear_decay':
            return max(0.0, 1.0 - (layer_idx / self.num_attention_layers))
        elif noise_strategy == 'exp_decay':
            exp_rate = self.noise_config.get('exp_rate', -2)
            return math.exp(exp_rate * layer_idx / max(1, self.num_attention_layers))
        elif noise_strategy == 'first_layer_only':
            return 1.0 if layer_idx == 0 else 0.0
        elif noise_strategy == 'last_layer_only':
            return 1.0 if layer_idx == (self.num_attention_layers - 1) else 0.0
        else:
            return 1.0

    def get_dropout_layers(self,
        dropout_config: Dict[str, Union[float, List[int], str]]) -> Set[int]:
        dropout_prob = dropout_config.get('dropout_prob', 0.0)
        layer_type = dropout_config.get('layer_type', 'all')
        specific_layers = dropout_config.get('specific_layers', None)
        mode = dropout_config.get('mode', 'random')
        min_layers = dropout_config.get('min_layers', 1)

        if specific_layers is not None:
            candidate_layers = specific_layers
        else:
            candidate_layers = self.layer_mapping[layer_type]

        total_layers = len(candidate_layers)
        max_dropouts = max(0, total_layers - min_layers)

        if mode == 'random':
            # Randomly select layers to drop based on probability
            dropout_layers = set()
            for layer in candidate_layers:
                if len(dropout_layers) < max_dropouts and random.random() < dropout_prob:
                    dropout_layers.add(layer)
        else:  # sequential
            # Drop layers sequentially from the top
            num_dropouts = min(max_dropouts, int(total_layers * dropout_prob))
            dropout_layers = set(candidate_layers[-num_dropouts:])

        return dropout_layers
    
    def attention_forward_pre_hook(self, module, inputs):
        # Get layer index
        layer_idx = None
        for part in str(module).split('.'):
            if part.isdigit():
                layer_idx = int(part)
                break
            
        # If this layer should be dropped, return zero tensor
        if layer_idx in self.dropout_layers:
            zero_tensor = torch.zeros_like(inputs[0])
            return (zero_tensor,) if isinstance(inputs, tuple) else zero_tensor
            
        if not inputs:
            return inputs

        hidden_states = inputs[0] if isinstance(inputs, tuple) else inputs
        if not isinstance(hidden_states, torch.Tensor):
            return inputs
            
        batch_size = hidden_states.size(0)
        sequence_length = hidden_states.size(1)

        # Get noise profile for this layer
        noise_profile = self.get_layer_noise_profile(layer_idx)

        def apply_scaled_noise(tensor, base_noise_magnitude, layer_idx):
            if tensor is None:
               return None
           
            noise = torch.randn(
                batch_size,
                sequence_length,
                tensor.size(-1),
                device=tensor.device
                )
            
            # Normalize the base distribution
            noise = noise / math.sqrt(tensor.size(-1))
            
            # Apply layer-specific noise magnitude
            actual_magnitude = base_noise_magnitude * noise_profile
            noise = noise * actual_magnitude
            
            # Track noise statistics for this layer
            with torch.no_grad():
                noise_std = noise.std().item()
                tensor_std = tensor.std().item()
                self.layer_noise_stats[layer_idx] = {
                'noise_std': noise_std,
                'tensor_std': tensor_std,
                'ratio': noise_std / tensor_std if tensor_std != 0 else 0
                }   
            return tensor + noise

        # Get the projection weights
        q_proj = getattr(module, 'q_proj', None) or getattr(module, 'query_proj', None) or getattr(module, 'query', None)
        k_proj = getattr(module, 'k_proj', None) or getattr(module, 'key_proj', None) or getattr(module, 'key', None)
        v_proj = getattr(module, 'v_proj', None) or getattr(module, 'value_proj', None) or getattr(module, 'value', None)
        
        modified_inputs = list(inputs) if isinstance(inputs, tuple) else [input]

        if q_proj is not None and self.noise_config.get('query_noise', 0) > 0:
            query = F.linear(hidden_states, q_proj.weight if hasattr(q_proj, 'weight') else q_proj)
            modified_inputs[0] = apply_scaled_noise(query, self.noise_config['query_noise'], layer_idx)

        if k_proj is not None and self.noise_config.get('key_noise', 0) > 0:
            key = F.linear(hidden_states, k_proj.weight if hasattr(k_proj, 'weight') else k_proj)
            if len(modified_inputs) > 1:
                modified_inputs[1] = apply_scaled_noise(key, self.noise_config['key_noise'], layer_idx)

        if v_proj is not None and self.noise_config.get('value_noise', 0) > 0:
            value = F.linear(hidden_states, v_proj.weight if hasattr(v_proj, 'weight') else v_proj)
            if len(modified_inputs) > 2:
                modified_inputs[2] = apply_scaled_noise(value, self.noise_config['value_noise'], layer_idx)

            return inputs

    def attention_forward_hook(self, module, inputs, output):
        layer_idx = None
        for part in str(module).split('.'):
            if part.isdigit():
                layer_idx = int(part)
                break

        # If this layer should be dropped, return zeros
        if layer_idx in self.dropout_layers:
            if isinstance(output, tuple):
                zero_tensor = torch.zeros_like(output[0])
                return (zero_tensor,) + output[1:]
            return torch.zeros_like(output)

        # Add noise to the attention output
        if isinstance(output, tuple):
            attention_output = output[0]
        else:
            attention_output = output

        # Add output noise if specified
        if self.noise_config.get('output_noise', 0) > 0:
            noise = torch.randn_like(attention_output) * self.noise_config['output_noise']
            attention_output = attention_output + noise


        if isinstance(output, tuple):
            return (attention_output,) + output[1:]
        return attention_output
    
    def markov_chain_noise(self, x, markov_config):
        """
        Args:
        x (tensor): Current state
        P (tensor): Transition probability matrix
        xt_prev (tensor, optional): Previous state
        
        Returns:
        tensor: Noisy state after Markov Chain process
        """
        transition_matrix_type = markov_config.get('transition_matrix_type', 'random')
        max_iterations = markov_config.get('max_iterations', 10)
        noise_scale = markov_config.get('noise_scale', 0.1)
        custom_matrix = markov_config.get('custom_matrix', None)

        # Generate or load transition probability matrix
        if transition_matrix_type == 'random':
            P = torch.rand((x.shape[1], x.shape[1]))
            # Normalize to make it a valid transition matrix
            P = P / P.sum(dim=1, keepdim=True)
        elif transition_matrix_type == 'identity':
            P = torch.eye(x.shape[1])
        elif transition_matrix_type == 'custom':
            if custom_matrix is None:
                raise ValueError("Custom transition matrix must be provided")
            P = custom_matrix
        else:
            raise ValueError(f"Invalid transition matrix type: {transition_matrix_type}")

        # Ensure P is on the same device as x
        P = P.to(x.device)

        # Initial state
        xt = x.clone()
        xt_prev = None
        
        # Markov Chain iterations
        for _ in range(max_iterations):
            # Transition
            xt = torch.mm(xt.float(), P.float())
            
            # Noisy step
            if xt_prev is not None:
                # Calculate noise
                delta = xt - xt_prev
                noise = torch.randn_like(delta) * noise_scale
                
                # Apply noisy step
                xt = xt + noise
            
            # Update previous state
            xt_prev = xt
        
        return xt

    def generate_with_noise(self,
                          prompt: str,
                          noise_config: Dict[str, Union[float, Dict]],
                          dropout_config: Dict[str, Union[float, List[int], str]] = None,
                          max_length: int = 100,
                          **kwargs) -> str:
        self.noise_config = self.validate_config(noise_config)
        self.layer_noise_stats = {}  # Reset noise statistics
        
        input_ids, attention_mask = self.prepare_inputs(prompt)
        hooks = []
        
     #markov_noise_chain
        markov_config = noise_config.get('markov_chain', {
        'enabled': False,
        'transition_matrix_type': 'random',  # or 'identity', 'custom'
        'max_iterations': 10
        })
        
        # If Markov Chain noise is enabled
        if markov_config.get('enabled', False):
           noisy_input_ids = self.markov_chain_noise(input_ids.float(), markov_config).long()
           input_ids = noisy_input_ids


        # Get layers to dropout if dropout_config is provided
        self.dropout_layers = set()
        if dropout_config is not None:
            self.dropout_layers = self.get_dropout_layers(dropout_config)

        # Register hooks for each attention layer
        for name, module in self.model.named_modules():
            if "attention" in name.lower():
                pre_hook = module.register_forward_pre_hook(self.attention_forward_pre_hook)
                hooks.append(pre_hook)
                post_hook = module.register_forward_hook(self.attention_forward_hook)
                hooks.append(post_hook)

        try:
            # Set up generation parameters
            generation_config = {
                'max_length': max_length,
                'do_sample': True,
                'pad_token_id': self.tokenizer.pad_token_id,
                'eos_token_id': self.tokenizer.eos_token_id,
                'top_k': noise_config.get('top_k', 50),
                'top_p': noise_config.get('top_p', 1.0),
                'temperature': noise_config.get('temperature', 1.0),
            }

            # Add any additional kwargs
            generation_config.update(kwargs)

            # Generate with noise injection
            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generation_config
            )
           
            
            # Print noise statistics if requested
            if noise_config.get('verbose', False):
                print("\nNoise Statistics per Layer:")
                for layer_idx, stats in sorted(self.layer_noise_stats.items()):
                    print(f"Layer {layer_idx}:")
                    print(f"  Noise std: {stats['noise_std']:.4f}")
                    print(f"  Tensor std: {stats['tensor_std']:.4f}")
                    print(f"  Noise/Tensor ratio: {stats['ratio']:.4f}")

            # Decode the output
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        finally:
            # Clean up hooks
            for hook in hooks:
                hook.remove()
                
    def generate_text(prompt, max_length=100):
        output = model(
        prompt, 
        max_tokens=max_length, 
        stop=[],  # Add stop tokens if needed
        echo=False
    )
        return output['choices'][0]['text']
                
                

In [49]:
noise_injector = NoiseInjector(model, tokenizer)

noise_config = {
    'query_noise':0.0,
    'key_noise': 0.0,
    'value_noise': 0.0,
    'output_noise': 0.0, 
    'noise_strategy': 'uniform', # 'exp_increase' 'uniform', 'exp_decay', 'first_layer_only', 'last_layer_only'
    'exp_rate': 1.0,  # Controls rate of exponential change in noise 
    
    'temperature': 0.1,  #temperature must be higher than 0.0
    'top_k': 0,  #
    'top_p': 1.0, # Disable top-p sampling = 1.0
    'verbose': False, 
    
    'markov_chain': {
        'enabled': True,
        'transition_matrix_type': 'identity',  # or 'identity', 'custom'
        'max_iterations': 1, #minimum 1, no max
        'noise_scale': 0.0001,  # Adjust the scale of noise. 0.0 to 1.0 (very sensitive)
        'custom_matrix': None,   #(random, identity, custom) must be a square matrix
    }  
}


# Configure dropout parameters
dropout_config = {
    'dropout_prob': 0.0,        # Probability of dropping a layer
    'layer_type': 'all',        # FFN, attention, all
    'mode': 'random',          # Dropout mode ('random' or 'sequential')
    'min_layers': 1,           # Minimum number of layers to keep
    'specific_layers': []    # Optional: To use specific layers for dropout: Set specific_layers=[0, 2, 4] in dropout_config
}

'''
#Hyperparameter	Value	 as a reference for futuire changes
Layers	        32	
d_model	        4544	    Increased to compensate for multiquery
head_dim	    64	        Reduced to optimise for FlashAttention
Vocabulary	    65024	
Sequence length	2048	
'''

#To only use noise: Omit the dropout_config parameter
# Generate text with both noise and dropout use following
generated_text = noise_injector.generate_with_noise(
    prompt="sad poem",
    noise_config=noise_config,
    dropout_config=dropout_config,
    max_length=100
)



print(generated_text)

sad poem - by: Anonymous
I'm not sure if I'm in love with you
Or if I'm just in love with the idea of you
I'm not sure if I'm in love with you
Or if I'm just in love with the idea of you
I'm not sure if I'm in love with you
Or if I'm just in love with the idea of you
I'm not sure if I'
