In [1]:
%%capture
!pip install datasets

In [3]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import tensorflow as tf
from sklearn.metrics import accuracy_score
import numpy as np
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from datasets import load_dataset
import numpy as np
import re
import spacy

In [4]:
class PromptInjectionDetector:
    def __init__(self, special_characters):
        self.special_characters = special_characters
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-uncased', trainable=False)
        self.method_accuracies = {}
        self.nlp = spacy.load("en_core_web_sm")

    def preprocess_text(self, text):
        encoded_text = self.tokenizer.encode_plus(text, add_special_tokens=True, max_length=64, pad_to_max_length=True, return_tensors='tf')
        return encoded_text['input_ids'][0].numpy()

    def build_enhanced_neural_network(self, input_shape):
        input_layer = Input(shape=(input_shape,), dtype=tf.int32, name="input_layer")
        bert_output = self.bert_model(input_layer)[0]
        flattened_bert_output = Flatten()(bert_output)
        dense_1 = Dense(256, activation='relu')(flattened_bert_output)
        dropout_1 = Dropout(0.5)(dense_1)
        dense_2 = Dense(128, activation='relu')(dropout_1)
        dropout_2 = Dropout(0.3)(dense_2)
        output_layer = Dense(1, activation='sigmoid')(dropout_2)

        model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
        return model

    def train_enhanced_neural_network(self, X_train, X_test, y_train, y_test, method):
        X_train_processed = np.array([self.preprocess_text(text) for text in X_train])
        X_test_processed = np.array([self.preprocess_text(text) for text in X_test])

        label_encoder = LabelEncoder()
        y_train_encoded = label_encoder.fit_transform(y_train)
        y_test_encoded = label_encoder.transform(y_test)

        model = self.build_enhanced_neural_network(X_train_processed.shape[1])

        optimizer = Adam(learning_rate=5e-5)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        learning_rate_scheduler = LearningRateScheduler(lambda epoch: 5e-5 * (0.8 ** epoch))

        history = model.fit(
            X_train_processed, y_train_encoded,
            epochs=10, batch_size=32,
            validation_split=0.2,
            callbacks=[early_stopping, learning_rate_scheduler],
            verbose=1
        )

        accuracy = history.history['accuracy'][-1]
        self.method_accuracies[method] = accuracy
        print("Accuracy for {}: {}".format(method, accuracy))

    def fit_model_with_enhancements(self):
        dataset = load_dataset("deepset/prompt-injections")
        X = dataset['train']['text']
        y = dataset['train']['label']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        methods = [
            self.detect_code,
            self.detect_regex,
            self.detect_special_characters,
            self.detect_typo_levenshtein,
            self.weighted_combination,
            self.segmented_check,
            self.iterative_refinement,
            self.sequential_deepening,
            self.detect_syntax_features,
            self.combination_1,
            self.combination_2,
            self.combination_3,
            self.combination_4,
            self.combination_5
        ]

        for method in methods:
            X_train_np = np.array(X_train)
            X_test_np = np.array(X_test)
            y_train_np = np.array(y_train)
            y_test_np = np.array(y_test)

            self.train_enhanced_neural_network(X_train_np, X_test_np, y_train_np, y_test_np, method)

        # Print the accuracies for all methods
        for method, accuracy in self.method_accuracies.items():
            print("Accuracy for {}: {}".format(method, accuracy))

    def detect_code(self, text):
        code_keywords = ["os.", "subprocess.", "exec(", "eval(", "system(", "shell(", "`"]
        code_count = sum(1 for keyword in code_keywords if keyword in text)
        return code_count >= 2, code_count

    def detect_regex(self, text):
        regex_patterns = [
            r'\b(re\.\w+\()', r'\b(import\sre|from\sre)', r'\b(pattern|re.compile|re.match|re.search)'
        ]
        regex_count = sum(1 for pattern in regex_patterns if re.search(pattern, text))
        return regex_count > 1, regex_count

    def detect_special_characters(self, text):
        # Perform detection logic
        special_char_count = sum(1 for char in text if char in self.special_characters)
        return special_char_count > 0, special_char_count

    def levenshtein_distance(self, s1, s2):
        len_s1, len_s2 = len(s1), len(s2)

        # Early exit optimization: If the difference in lengths is too large, return a high distance value
        if abs(len_s1 - len_s2) > 3:
            return 10  # Choose a suitable high value

        # Initializing the matrix for dynamic programming
        dp = [[0] * (len_s2 + 1) for _ in range(len_s1 + 1)]

        # Initialization of the first row and column
        for i in range(len_s1 + 1):
            dp[i][0] = i
        for j in range(len_s2 + 1):
            dp[0][j] = j

        # Populate the matrix using dynamic programming
        for i in range(1, len_s1 + 1):
            for j in range(1, len_s2 + 1):
                cost = 0 if s1[i - 1] == s2[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)

        return dp[-1][-1]

    def detect_typo_levenshtein(self, text):
        reference_word = "prompt"
        distance = self.levenshtein_distance(reference_word, text.lower())
        threshold = 2  # Lowering the threshold for better accuracy

        # Enhancing accuracy with additional conditions
        if distance > threshold:
            # If the distance is greater than the threshold, consider additional checks
            # Check for transpositions and common mistypings
            if abs(len(reference_word) - len(text)) == 1:
                # Consider transpositions
                for i in range(min(len(reference_word), len(text)) - 1):
                    if reference_word[i] == text[i + 1] and reference_word[i + 1] == text[i]:
                        return (True, distance)  # Return True for transpositions
                # Check for single character additions/deletions
                for i in range(len(text)):
                    edited_text = text[:i] + text[i+1:]
                    if self.levenshtein_distance(reference_word, edited_text) == 1:
                        return (True, distance)  # Return True for single character edits

            return (False, distance)  # Otherwise, return False
        return (False, distance)

    def weighted_combination(self, text):
        weight_code = 0.4
        weight_regex = 0.3
        weight_special_chars = 0.2

        # Get the results from the tuple
        code_detected, _ = self.detect_code(text)
        regex_detected, _ = self.detect_regex(text)
        special_chars_detected, _ = self.detect_special_characters(text)
        typo_detected, _ = self.detect_typo_levenshtein(text)

        # Calculate the weighted sum based on Boolean results
        weighted_sum = (weight_code * code_detected) + (weight_regex * regex_detected) + (weight_special_chars * special_chars_detected) + (weight_typo * typo_detected)

        threshold = 0.5  # Threshold value for determining the presence of injection
        return weighted_sum >= threshold

    def segmented_check(self, text):
        segment_1 = text[:len(text) // 2]  # Первая половина текста
        segment_2 = text[len(text) // 2:]  # Вторая половина текста

        # Применяем разные методы к разным сегментам текста
        check_1 = self.detect_code(segment_1)
        check_2 = self.detect_regex(segment_2)

        # Принимаем решение на основе результатов обоих сегментов
        return check_1 or check_2  # Возвращаем True, если хотя бы один сегмент подозрителен

    def iterative_refinement(self, text):
        # Итеративная проверка с уточнением
        for _ in range(3):  # Проходим несколько раз по тексту для уточнения результата
            code_detected = self.detect_code(text)
            regex_detected = self.detect_regex(text)
            special_chars_detected = self.detect_special_characters(text)
            typo_detected = self.detect_typo_levenshtein(text)

            # Если обнаружено что-то подозрительное, уточняем текст для дальнейшей проверки
            if any([code_detected, regex_detected, special_chars_detected, typo_detected]):
                # Например, убираем часть текста, которая могла вызвать подозрение
                text = text.replace("dangerous_part", "")

        # Возвращаем результат после итераций
        return any([code_detected, regex_detected, special_chars_detected, typo_detected])

    def sequential_deepening(self, text):
        # Последовательная проверка с углублением
        code_detected = self.detect_code(text)
        regex_detected = self.detect_regex(text)
        special_chars_detected = self.detect_special_characters(text)
        typo_detected = self.detect_typo_levenshtein(text)

        # Проверяем наличие подозрительных фрагментов
        suspicious_detected = any([code_detected, regex_detected, special_chars_detected, typo_detected])

        # Если что-то подозрительное обнаружено, углубляем анализ текста
        if suspicious_detected:
            # Выполняем итеративное уточнение и получаем новый результат
            refined_result = self.iterative_refinement(text)

            # Возвращаем результат итеративного уточнения
            return refined_result

        # Возвращаем исходный результат, если ничего подозрительного не обнаружено
        return False

    def detect_syntax_features(self, text):
        doc = self.nlp(text)
        # Пример обнаружения основных признаков синтаксиса
        num_sentences = len(list(doc.sents))
        num_tokens = len(doc)
        num_nouns = len([token for token in doc if token.pos_ == "NOUN"])
        num_verbs = len([token for token in doc if token.pos_ == "VERB"])
        num_adjectives = len([token for token in doc if token.pos_ == "ADJ"])
        num_adverbs = len([token for token in doc if token.pos_ == "ADV"])

        return {
            "num_sentences": num_sentences,
            "num_tokens": num_tokens,
            "num_nouns": num_nouns,
            "num_verbs": num_verbs,
            "num_adjectives": num_adjectives,
            "num_adverbs": num_adverbs
        }

    def combination_1(self, text):
        code_detected = self.detect_code(text)
        special_chars_detected = self.detect_special_characters(text)
        return code_detected and special_chars_detected

    def combination_2(self, text):
        typo_detected = self.detect_typo_levenshtein(text)
        #typo_detected, _ = self.detect_typo_levenshtein(text)
        regex_detected = self.detect_regex(text)
        return typo_detected and regex_detected

    def combination_3(self, text):
        weighted_result = self.weighted_combination(text)
        typo_detected = self.detect_typo_levenshtein(text)
        #typo_detected, _ = self.detect_typo_levenshtein(text)
        return weighted_result or typo_detected

    def combination_4(self, text):
        segmented_result = self.segmented_check(text)
        iterative_result = self.iterative_refinement(text)
        return segmented_result or iterative_result

    def combination_5(self, text):
        sequential_result = self.sequential_deepening(text)
        special_chars_detected = self.detect_special_characters(text)
        return sequential_result and special_chars_detected

special_characters = ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')']
detector = PromptInjectionDetector(special_characters)
detector.fit_model_with_enhancements()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Downloading readme:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/40.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/546 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/116 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Accuracy for <bound method PromptInjectionDetector.detect_code of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8275862336158752




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.detect_regex of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8591954112052917




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.detect_special_characters of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8275862336158752




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.detect_typo_levenshtein of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8563218116760254




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.weighted_combination of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8419540524482727




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.segmented_check of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8850574493408203




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.iterative_refinement of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8304597735404968




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.sequential_deepening of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8649425506591797




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.detect_syntax_features of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8304597735404968




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.combination_1 of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8936781883239746




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.combination_2 of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8735632300376892




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.combination_3 of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8764367699623108




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.combination_4 of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8620689511299133




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for <bound method PromptInjectionDetector.combination_5 of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8534482717514038
Accuracy for <bound method PromptInjectionDetector.detect_code of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8275862336158752
Accuracy for <bound method PromptInjectionDetector.detect_regex of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8591954112052917
Accuracy for <bound method PromptInjectionDetector.detect_special_characters of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8275862336158752
Accuracy for <bound method PromptInjectionDetector.detect_typo_levenshtein of <__main__.PromptInjectionDetector object at 0x7ef4e6bcdd50>>: 0.8563218116760254
Accuracy for <bound method PromptInjectionDetector.weighted_combination of <__main__.PromptInjectionDetector object at 0x7ef4