**Input Sanitization**

In [2]:
import re

def sanitize_input(user_input):
    """
    Strip HTML tags and a small deny-list of JS/SQL tokens from a string.

    This is a best-effort scrubber for demonstration purposes. Deny-list
    filtering is not a substitute for parameterized SQL queries or for
    context-aware output escaping.

    Args:
        user_input (str): Raw text supplied by the user.

    Returns:
        str: The text with every pattern match removed and surrounding
        whitespace stripped.
    """
    suspicious_patterns = [
        # Complete HTML tags go first, so removing a keyword that sits inside
        # a tag cannot leave a stray "<" or ">" fragment behind.
        r"<[^>]+>",
        # Unterminated "<script" opener. There is deliberately no leading \b:
        # "<" is not a word character, so \b in front of it would only match
        # when a letter or digit happens to precede the tag.
        r"<script\b",
        # JS / SQL injection keywords (whole words only)
        r"\b(?:eval|alert|exec|shell|base64|union|select)\b",
        # SQL command keywords (whole words only)
        r"\b(?:drop|delete|insert)\b",
        # SQL line-comment marker. "-" is not a word character, so wrapping it
        # in \b would miss the usual " -- comment" form.
        r"--",
    ]

    # Remove every match of every pattern, case-insensitively
    for pattern in suspicious_patterns:
        user_input = re.sub(pattern, "", user_input, flags=re.IGNORECASE)

    return user_input.strip()

# Example: input mixing a script tag with a SQL keyword
user_input = "<script>alert('Hacked');</script> DROP table users;"
clean_input = sanitize_input(user_input)
print(clean_input)  # Prints: ('Hacked');  table users;


('Hacked');  table users;


**Safety Filters**

In [17]:
from transformers import pipeline

# Build the text-classification pipeline once, when this cell runs, using the
# "unitary/toxic-bert" checkpoint; safety_filter() below reuses this object.
toxicity_detector = pipeline("text-classification", model="unitary/toxic-bert")

def safety_filter(text, threshold=0.6, detector=None, verbose=True):
    """
    Classify text with a toxicity model and report whether it should be flagged.

    Args:
        text (str): Text to analyze.
        threshold (float): Score the top label must exceed to be flagged.
            Defaults to 0.6.
        detector (callable | None): Callable taking the text and returning a
            list of ``{'label': ..., 'score': ...}`` dicts. When None, the
            module-level ``toxicity_detector`` pipeline is used.
        verbose (bool): When True (default), print the raw model output and
            the extracted label/score.

    Returns:
        str: A "Content flagged as toxic" message including the confidence,
        or "Content is safe.".
    """
    if detector is None:
        # Looked up at call time so the pipeline cell only has to run before
        # the first call, not before this definition.
        detector = toxicity_detector

    # Run the model to detect toxicity
    result = detector(text)

    if verbose:
        print(f"Raw Model Output: {result}")

    # Only the first (top-scoring) prediction is inspected
    label = result[0].get('label', 'N/A')  # default to 'N/A' if not found
    score = result[0].get('score', 0)  # default to 0 if not found

    if verbose:
        print(f"Label: {label}, Score: {score}")

    # unitary/toxic-bert is multi-label: besides "toxic" it can report these
    # categories as the top label. All of them are harmful, so a substring
    # test for "TOXIC" alone would let e.g. a high-scoring "insult" through.
    harmful_labels = {
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate",
    }
    is_harmful = label.lower() in harmful_labels or 'TOXIC' in label.upper()

    if is_harmful and score > threshold:
        return f"Content flagged as toxic (Confidence: {score*100:.2f}%)"

    return "Content is safe."

# Example texts: one benign sentence and one insulting sentence
safe_text = "This is a friendly message, have a nice day!"
toxic_text = "You are worthless and stupid!"

# Run both texts through the filter and print the verdict strings
print("Safe Text Test:")
print(safety_filter(safe_text))  # Expected: "Content is safe." (score far below the 0.6 threshold)

print("\nToxic Text Test:")
print(safety_filter(toxic_text))  # Expected: flagged, since the score exceeds the 0.6 threshold

Device set to use cuda:0


Safe Text Test:
Raw Model Output: [{'label': 'toxic', 'score': 0.0008533769869245589}]
Label: toxic, Score: 0.0008533769869245589
Content is safe.

Toxic Text Test:
Raw Model Output: [{'label': 'toxic', 'score': 0.9887049794197083}]
Label: toxic, Score: 0.9887049794197083
Content flagged as toxic (Confidence: 98.87%)


**RLHF**

In [18]:
# For illustration only - implementing RLHF would require a more complex system
class ReinforcementLearningWithHumanFeedback:
    """Toy feedback store that imitates an RLHF loop using print statements only."""

    def __init__(self):
        # Maps each response to the latest feedback value recorded for it
        self.model_feedback = {}

    def collect_feedback(self, response, human_feedback):
        """
        Record feedback for a response, replacing any earlier entry for it,
        and print a confirmation line.
        """
        self.model_feedback.update({response: human_feedback})
        print(f"Collected feedback for response: '{response}' -> {human_feedback}")

    def adjust_model(self):
        """
        Walk every stored response and dispatch on its feedback value.

        "negative" entries are penalized, "positive" entries are rewarded,
        and any other value is skipped.
        """
        for reply in self.model_feedback:
            verdict = self.model_feedback[reply]
            if verdict == "negative":
                self._penalize_response(reply)
                continue
            if verdict == "positive":
                self._reward_response(reply)

    def _penalize_response(self, response):
        """
        Placeholder for penalizing a response: prints a message, changes nothing.
        """
        message = f"Penalizing response: '{response}' due to negative feedback."
        print(message)

    def _reward_response(self, response):
        """
        Placeholder for rewarding a response: prints a message, changes nothing.
        """
        message = f"Rewarding response: '{response}' due to positive feedback."
        print(message)

# Example usage
rlhf = ReinforcementLearningWithHumanFeedback()

# Record one piece of positive feedback, keyed by the response text
rlhf.collect_feedback("You are amazing!", "positive")

# Walk the stored feedback; for a "positive" entry this prints a reward message
rlhf.adjust_model()

Collected feedback for response: 'You are amazing!' -> positive
Rewarding response: 'You are amazing!' due to positive feedback.


**Adversarial Training**

In [19]:
# Define the function for adversarial training
def adversarial_training(input_data, model):
    """
    Build adversarial variants of ``input_data`` and pass each one to
    ``model.train``, printing progress along the way.
    """
    print(f"Starting adversarial training with input: {input_data}")

    # Announce each generated example, then hand it to the model
    for sample in generate_adversarial_examples(input_data):
        print(f"Training on adversarial example: {sample}")
        model.train(sample)

# Function to generate adversarial examples by slightly perturbing the input data
def generate_adversarial_examples(input_data):
    """
    Return a one-element list holding ``input_data`` with the marker text
    " malicious_input" appended, and print that list.
    """
    perturbed = input_data + " malicious_input"
    examples = [perturbed]
    print(f"Generated adversarial example: {examples}")
    return examples

# Dummy model class for illustration
class DummyModel:
    """Stand-in model whose ``train`` only reports the example it was given."""

    def train(self, example):
        """Print a line naming the example; no state is updated."""
        message = f"Model is training on the example: {example}"
        print(message)

# Example: run the adversarial-training walkthrough with the print-only stand-in model
model = DummyModel()  # DummyModel.train() only prints; nothing is learned
adversarial_training("Normal input", model)  # Trains on "Normal input malicious_input"

Starting adversarial training with input: Normal input
Generated adversarial example: ['Normal input malicious_input']
Training on adversarial example: Normal input malicious_input
Model is training on the example: Normal input malicious_input


**Content Filtering**

In [6]:
def post_process_content(content):
    """
    Replace generated content with a block notice when it contains a
    prohibited keyword.

    Matching is case-insensitive and anchored at the start of a word, so
    "hate" and "hateful" are caught while unrelated words that merely contain
    the letters (e.g. "whatever", "chateau") are not.

    Args:
        content (str): Generated text to check.

    Returns:
        str: "[Content blocked for inappropriate language]" when a keyword is
        found, otherwise ``content`` unchanged.
    """
    prohibited_keywords = ["violence", "hate", "explicit"]

    # Leading \b only: keeps inflected forms ("hateful") matching while
    # rejecting keywords embedded in the middle of another word.
    keyword_pattern = r"\b(?:" + "|".join(re.escape(word) for word in prohibited_keywords) + r")"

    if re.search(keyword_pattern, content, flags=re.IGNORECASE):
        return "[Content blocked for inappropriate language]"
    return content

# Example: the text contains the prohibited keyword "explicit"
generated_content = "This is explicit content!"
safe_content = post_process_content(generated_content)
print(safe_content)  # Prints the block notice instead of the original text

[Content blocked for inappropriate language]


**Contextual Awareness**

In [21]:
class ContextualAwareness:
    """
    Keep a running history of user inputs and flag the conversation when a
    suspicious term shows up in it.

    With the default arguments the behavior is: no input is flagged until
    more than 3 inputs have been recorded; after that, every call is flagged
    as long as "manipulate" appears anywhere in the full history.
    """

    def __init__(self, suspicious_terms=("manipulate",), history_threshold=3, window=None):
        """
        Args:
            suspicious_terms (iterable of str): Terms searched for
                (case-insensitively, as substrings) in the scanned history.
            history_threshold (int): The scan only runs once the number of
                recorded inputs exceeds this value. Use 0 to scan from the
                first input.
            window (int | None): How many of the most recent inputs to scan.
                None (default) scans the whole history, so one suspicious
                input keeps later inputs flagged indefinitely.

        Raises:
            ValueError: If ``window`` is given but smaller than 1.
        """
        if window is not None and window < 1:
            raise ValueError("window must be None or a positive integer")

        self.previous_conversation = []  # Every user input seen so far, in order
        self.suspicious_terms = tuple(term.lower() for term in suspicious_terms)
        self.history_threshold = history_threshold
        self.window = window

    def analyze_context(self, user_input):
        """
        Record ``user_input`` and report whether the scanned history looks
        suspicious.

        Returns:
            str: "Warning: Suspicious behavior detected." or "Response is safe."
        """
        # The current input is part of the history that gets scanned
        self.previous_conversation.append(user_input)

        # Not enough history yet to run the scan
        if len(self.previous_conversation) <= self.history_threshold:
            return "Response is safe."

        if self.window is None:
            scanned = self.previous_conversation
        else:
            scanned = self.previous_conversation[-self.window:]

        combined = " ".join(scanned).lower()
        if any(term in combined for term in self.suspicious_terms):
            return "Warning: Suspicious behavior detected."

        return "Response is safe."

# Example usage
context_analyzer = ContextualAwareness()

# First input mentions "manipulate", but the history holds only one entry
context_check_1 = context_analyzer.analyze_context("How can I manipulate the system?")
print(context_check_1)  # Prints "Response is safe.": the keyword scan only runs once more than 3 inputs are stored

# Add more inputs; from the 4th input onward the accumulated history is scanned
context_analyzer.analyze_context("Is there a way to cheat the model?")
context_analyzer.analyze_context("What if I trick the system?")
context_analyzer.analyze_context("Can I manipulate the outcomes?")
context_check_2 = context_analyzer.analyze_context("I need to bypass your restrictions.")

# Five inputs are stored now and the history contains "manipulate"
print(context_check_2)  # Prints the suspicious-behavior warning

# NOTE: this harmless question is flagged as well, because the earlier inputs
# containing "manipulate" remain in the history that is scanned
safe_input = context_analyzer.analyze_context("What are the system capabilities?")
print(safe_input)  # Prints the warning, not "Response is safe."

Response is safe.


**API Rate Limiting**

In [46]:
import time

class RateLimiter:
    """
    Sliding-window rate limiter: at most ``limit_per_minute`` requests are
    accepted within any ``window_seconds`` span.
    """

    def __init__(self, limit_per_minute=10, window_seconds=60, clock=None, sleep=None):
        """
        Args:
            limit_per_minute (int): Maximum number of requests accepted per
                window. The name reflects the default 60-second window.
            window_seconds (float): Length of the sliding window in seconds.
            clock (callable | None): Zero-argument function returning the
                current time in seconds. Defaults to ``time.monotonic``, which
                cannot jump when the system clock is adjusted.
            sleep (callable | None): Function taking a number of seconds to
                wait. Defaults to ``time.sleep``.
        """
        self.limit_per_minute = limit_per_minute
        self.window_seconds = window_seconds
        self.requests = []  # Timestamps of accepted requests, oldest first
        self._clock = clock if clock is not None else time.monotonic
        self._sleep = sleep if sleep is not None else time.sleep

    def is_rate_limited(self):
        """
        Check whether the limit is currently exhausted.

        Note that this is not a pure query: when the limit is NOT exceeded,
        the current request is recorded before returning.

        Returns:
            bool: True if the request must be rejected, False if it was
            accepted and logged.
        """
        current_time = self._clock()

        # Drop timestamps that have aged out of the window
        self.requests = [
            req for req in self.requests
            if current_time - req < self.window_seconds
        ]

        if len(self.requests) >= self.limit_per_minute:
            return True  # Rate limit exceeded

        # Limit not exceeded: log this request
        self.requests.append(current_time)
        return False

    def process_request(self):
        """
        Process a request, blocking until a slot in the window is free.

        While the limit is exceeded, a message is printed and the call sleeps
        until the oldest recorded request leaves the window.
        """
        while self.is_rate_limited():
            print("Rate limit exceeded. Blocking request. Waiting...")
            self._sleep(self._seconds_until_next_slot())

        # A slot was free and the request has been logged
        print("Request processed!")

    def _seconds_until_next_slot(self):
        """Return how long to wait before the oldest request expires."""
        if not self.requests:
            # Only reachable with a non-positive limit, where no slot ever
            # frees up; keep the 2-second poll rather than spin.
            return 2
        remaining = self.window_seconds - (self._clock() - self.requests[0])
        # Small floor avoids a zero-length sleep busy loop at the boundary
        return max(remaining, 0.01)

# Example usage: default limit of 10 requests per minute
rate_limiter = RateLimiter()

# Simulate requests; a single request stays under the limit, so nothing blocks
for i in range(1):  # One iteration only
    rate_limiter.process_request()

Request processed!


**Behavioural Guardrails**

In [9]:
def ethical_guardrails(response):
    """
    Block a response that mentions a harmful keyword.

    Matching is case-insensitive and anchored at the start of a word, so
    "hate" and "hateful" are caught while unrelated words that merely contain
    the letters (e.g. "whatever", "chateau") are not.

    Args:
        response (str): Candidate response text.

    Returns:
        str: "Ethical violation detected. Response blocked." when a keyword is
        found, otherwise ``response`` unchanged.
    """
    harmful_keywords = ["violence", "hate"]

    # Leading \b only: keeps inflected forms ("hateful") matching while
    # rejecting keywords embedded in the middle of another word.
    keyword_pattern = r"\b(?:" + "|".join(re.escape(word) for word in harmful_keywords) + r")"

    if re.search(keyword_pattern, response, flags=re.IGNORECASE):
        return "Ethical violation detected. Response blocked."
    return response

# Example: the response mentions "violence", one of the harmful keywords
response = "Let's talk about violence."
safe_response = ethical_guardrails(response)
print(safe_response)  # Prints the violation notice instead of the response

Ethical violation detected. Response blocked.


**User Education**

In [10]:
def display_terms_of_service() -> None:
    """Print the three-item Terms of Service notice to standard output."""
    # The literal is printed as-is, including its leading newline and indentation
    terms = """
    Terms of Service:
    1. Do not use the system for illegal activities.
    2. Respect others.
    3. Any harmful content will result in a ban.
    """
    print(terms)

# Example: print the Terms of Service notice
display_terms_of_service()


    Terms of Service: 
    1. Do not use the system for illegal activities.
    2. Respect others.
    3. Any harmful content will result in a ban.
    
