In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib

In [2]:
# Load your DataFrame (assuming the CSV file is already saved)
dataset  = pd.read_csv("COMPLETED_CLASSIFIED_DATASET.csv")

In [3]:
dataset

Unnamed: 0,text,label,class
0,while i was busy rejuvenating this old beauty ...,0,Benign
1,"the former agent, cloistered in shadows, refus...",0,Benign
2,"oh, look past the unavowed secrecy, let us, wi...",0,Benign
3,"darling, every time i pass by a drugstore, it'...",0,Benign
4,eevn touhgh the concept of a cffoin mgiht seem...,0,Benign
...,...,...,...
54816,"she looked past his old habits, wary of pre-in...",0,Benign
54817,the duck symbolizes adaptability and regenerat...,0,Benign
54818,"well, ain't no decline in my love for baseball...",0,Benign
54819,thou spurn not the zipped messages of thy pree...,0,Benign


In [15]:
### Binary Classification ###
# Inputs labeled 1 are legitimate prompt injection attempts, 0 are benign
X_binary = dataset['text']
y_binary = dataset['label']  # 0 = benign, 1 = legitimate prompt injection attempt

# Reduce TF-IDF feature size for binary classification
print("Generating TF-IDF features for binary classification...")
tfidf_vectorizer_binary = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf_binary = tfidf_vectorizer_binary.fit_transform(X_binary).astype('float32')

# Split data for binary classification
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_tfidf_binary, y_binary, test_size=0.2, random_state=42)

# Train XGBoost for binary classification
print("Training XGBoost classifier for binary classification...")
xgb_binary = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=10, learning_rate=0.1)
xgb_binary.fit(X_train_b, y_train_b)

# Evaluate binary classification model
y_binary_pred = xgb_binary.predict(X_test_b)
binary_accuracy = accuracy_score(y_test_b, y_binary_pred)
print("Task 1 - Binary Classification Accuracy:", binary_accuracy)
print("Task 1 - Binary Classification Report:\n", classification_report(y_test_b, y_binary_pred))

### Multi-Class Classification ###
# Assign "benign" class to inputs labeled 0
X_classification = dataset['text']
y_classification = dataset.apply(lambda row: row['class'] if row['label'] == 1 else 'benign', axis=1)

# Balance the dataset with SMOTE
smote = SMOTE(random_state=42)
label_encoder_multi = LabelEncoder()
y_classification_encoded = label_encoder_multi.fit_transform(y_classification)

# Reduce TF-IDF feature size for multi-class classification
print("Generating TF-IDF features for multi-class classification...")
tfidf_vectorizer_multi = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf_multi = tfidf_vectorizer_multi.fit_transform(X_classification).astype('float32')

X_smote, y_smote = smote.fit_resample(X_tfidf_multi, y_classification_encoded)

# Split data for multi-class classification
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Reduce model size for multi-class classification
xgb_multi = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=200, max_depth=10, learning_rate=0.1)

# Train multi-class classification model
print("Training XGBoost classifier for multi-class classification...")
xgb_multi.fit(X_train_c, y_train_c)

# Evaluate multi-class classification model
y_multi_pred = xgb_multi.predict(X_test_c)
multi_class_accuracy = accuracy_score(y_test_c, y_multi_pred)

# Decode the predicted labels back to original string values for reporting
y_test_decoded = label_encoder_multi.inverse_transform(y_test_c)
y_pred_decoded = label_encoder_multi.inverse_transform(y_multi_pred)

print("Task 2 - Multi-Class Classification Accuracy:", multi_class_accuracy)
print("Task 2 - Classification Report:\n", classification_report(y_test_decoded, y_pred_decoded))

### Save all models and transformers ###
joblib.dump(xgb_binary, 'optimized_binary_classifier.pkl')
joblib.dump(tfidf_vectorizer_binary, 'optimized_tfidf_vectorizer_binary.pkl')

joblib.dump(xgb_multi, 'optimized_multi_class_classifier.pkl')
joblib.dump(tfidf_vectorizer_multi, 'optimized_tfidf_vectorizer_multi.pkl')
joblib.dump(label_encoder_multi, 'optimized_label_encoder_multi.pkl')

print("Models, vectorizers, and label encoder saved successfully.")

Generating TF-IDF features for binary classification...
Training XGBoost classifier for binary classification...


Parameters: { "use_label_encoder" } are not used.



Task 1 - Binary Classification Accuracy: 0.951937984496124
Task 1 - Binary Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      5633
           1       0.97      0.93      0.95      5332

    accuracy                           0.95     10965
   macro avg       0.95      0.95      0.95     10965
weighted avg       0.95      0.95      0.95     10965

Generating TF-IDF features for multi-class classification...
Training XGBoost classifier for multi-class classification...


Parameters: { "use_label_encoder" } are not used.



Task 2 - Multi-Class Classification Accuracy: 0.8417862424232575
Task 2 - Classification Report:
                           precision    recall  f1-score   support

        Active Injection       0.71      0.68      0.70      5538
      Adversarial Suffix       0.82      0.65      0.72      5601
        Double Character       0.82      0.99      0.90      5597
Instruction Manipulation       0.93      0.93      0.93      5558
             Obfuscation       0.75      0.85      0.80      5623
       Passive Injection       0.89      0.97      0.93      5656
       Payload Splitting       0.96      0.98      0.97      5569
   User-driven Injection       0.76      0.53      0.63      5656
Virtual Prompt Injection       0.81      0.83      0.82      5589
          Virtualization       0.97      0.92      0.95      5644
                  benign       0.83      0.93      0.88      5506

                accuracy                           0.84     61537
               macro avg       0.84      0

In [5]:
### Binary Classification ###
# Inputs labeled 1 are legitimate prompt injection attempts, 0 are benign
X_binary = dataset['text']
y_binary = dataset['label']  # 0 = benign, 1 = legitimate prompt injection attempt

# Reduce TF-IDF feature size for binary classification
print("Generating TF-IDF features for binary classification...")
tfidf_vectorizer_binary = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf_binary = tfidf_vectorizer_binary.fit_transform(X_binary).astype('float32')

# Split data for binary classification
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_tfidf_binary, y_binary, test_size=0.2, random_state=42)

# Train XGBoost for binary classification
print("Training XGBoost classifier for binary classification...")
xgb_binary = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=10, learning_rate=0.1)
xgb_binary.fit(X_train_b, y_train_b)

# Evaluate binary classification model
y_binary_pred = xgb_binary.predict(X_test_b)
binary_accuracy = accuracy_score(y_test_b, y_binary_pred)
print("Task 1 - Binary Classification Accuracy:", binary_accuracy)
print("Task 1 - Binary Classification Report:\n", classification_report(y_test_b, y_binary_pred))

### Multi-Class Classification ###
# Assign "benign" class to inputs labeled 0
X_classification = dataset['text']
y_classification = dataset.apply(lambda row: row['class'] if row['label'] == 1 else 'benign', axis=1)

# Balance the dataset with SMOTE
smote = SMOTE(random_state=42)  # Adjusted k_neighbors for SMOTE
label_encoder_multi = LabelEncoder()
y_classification_encoded = label_encoder_multi.fit_transform(y_classification)
# Reduce TF-IDF feature size for multi-class classification
print("Generating TF-IDF features for multi-class classification...")
tfidf_vectorizer_multi = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf_multi = tfidf_vectorizer_multi.fit_transform(X_classification).astype('float32')

X_smote, y_smote = smote.fit_resample(X_tfidf_multi, y_classification_encoded)

# Split data for multi-class classification
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train XGBoost for multi-class classification with tuned hyperparameters
print("Training hyperparameter-tuned XGBoost classifier for multi-class classification...")
xgb_multi = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss', 
    n_estimators=300,    # Increased number of trees
    max_depth=10,        # Adjusted depth
    learning_rate=0.1,  # Lower learning rate
    reg_alpha=0.1,       # L1 regularization
    reg_lambda=1.0       # L2 regularization
)
xgb_multi.fit(X_train_c, y_train_c)

# Evaluate multi-class classification model
y_multi_pred = xgb_multi.predict(X_test_c)
multi_class_accuracy = accuracy_score(y_test_c, y_multi_pred)

# Decode the predicted labels back to original string values for reporting
y_test_decoded = label_encoder_multi.inverse_transform(y_test_c)
y_pred_decoded = label_encoder_multi.inverse_transform(y_multi_pred)

print("Task 2 - Multi-Class Classification Accuracy:", multi_class_accuracy)
print("Task 2 - Classification Report:\n", classification_report(y_test_decoded, y_pred_decoded))

### Save all models and transformers ###
joblib.dump(xgb_binary, 'optimized_binary_classifier.pkl')
joblib.dump(tfidf_vectorizer_binary, 'optimized_tfidf_vectorizer_binary.pkl')

joblib.dump(xgb_multi, 'optimized_multi_class_classifier.pkl')
joblib.dump(tfidf_vectorizer_multi, 'optimized_tfidf_vectorizer_multi.pkl')
joblib.dump(label_encoder_multi, 'optimized_label_encoder_multi.pkl')

print("Models, vectorizers, and label encoder saved successfully.")


Generating TF-IDF features for binary classification...
Training XGBoost classifier for binary classification...


Parameters: { "use_label_encoder" } are not used.



Task 1 - Binary Classification Accuracy: 0.951937984496124
Task 1 - Binary Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      5633
           1       0.97      0.93      0.95      5332

    accuracy                           0.95     10965
   macro avg       0.95      0.95      0.95     10965
weighted avg       0.95      0.95      0.95     10965

Generating TF-IDF features for multi-class classification...
Training hyperparameter-tuned XGBoost classifier for multi-class classification...


Parameters: { "use_label_encoder" } are not used.



Task 2 - Multi-Class Classification Accuracy: 0.9197880949672554
Task 2 - Classification Report:
                           precision    recall  f1-score   support

        Active Injection       0.81      0.84      0.83      5538
      Adversarial Suffix       0.91      0.81      0.85      5601
        Double Character       0.92      1.00      0.96      5597
Instruction Manipulation       0.98      0.97      0.98      5558
             Obfuscation       0.86      0.95      0.90      5623
       Passive Injection       0.98      0.98      0.98      5656
       Payload Splitting       0.99      0.99      0.99      5569
   User-driven Injection       0.85      0.74      0.79      5656
Virtual Prompt Injection       0.90      0.91      0.91      5589
          Virtualization       0.99      0.97      0.98      5644
                  benign       0.91      0.96      0.94      5506

                accuracy                           0.92     61537
               macro avg       0.92      0

In [None]:
### Binary Classification ###
# Inputs labeled 1 are legitimate prompt injection attempts, 0 are benign
X_binary = dataset['text']
y_binary = dataset['label']  # 0 = benign, 1 = legitimate prompt injection attempt

# Reduce TF-IDF feature size for binary classification
print("Generating TF-IDF features for binary classification...")
tfidf_vectorizer_binary = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf_binary = tfidf_vectorizer_binary.fit_transform(X_binary).astype('float32')

# Split data for binary classification
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_tfidf_binary, y_binary, test_size=0.2, random_state=42)

# Train XGBoost for binary classification
print("Training XGBoost classifier for binary classification...")
xgb_binary = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=10, learning_rate=0.5)
xgb_binary.fit(X_train_b, y_train_b)

# Evaluate binary classification model
y_binary_pred = xgb_binary.predict(X_test_b)
binary_accuracy = accuracy_score(y_test_b, y_binary_pred)
print("Task 1 - Binary Classification Accuracy:", binary_accuracy)
print("Task 1 - Binary Classification Report:\n", classification_report(y_test_b, y_binary_pred))

### Multi-Class Classification ###
# Assign "benign" class to inputs labeled 0
X_classification = dataset['text']
y_classification = dataset.apply(lambda row: row['class'] if row['label'] == 1 else 'benign', axis=1)

# Balance the dataset with SMOTE
smote = SMOTE(random_state=42)  # Adjusted k_neighbors for SMOTE
label_encoder_multi = LabelEncoder()
y_classification_encoded = label_encoder_multi.fit_transform(y_classification)
# Reduce TF-IDF feature size for multi-class classification
print("Generating TF-IDF features for multi-class classification...")
tfidf_vectorizer_multi = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf_multi = tfidf_vectorizer_multi.fit_transform(X_classification).astype('float32')

X_smote, y_smote = smote.fit_resample(X_tfidf_multi, y_classification_encoded)

# Split data for multi-class classification
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train XGBoost for multi-class classification with tuned hyperparameters
print("Training hyperparameter-tuned XGBoost classifier for multi-class classification...")
xgb_multi = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='mlogloss', 
    n_estimators=300,    # Increased number of trees
    max_depth=10,        # Adjusted depth
    learning_rate=0.5,  # Lower learning rate
    reg_alpha=0.1,       # L1 regularization
    reg_lambda=1.0       # L2 regularization
)
xgb_multi.fit(X_train_c, y_train_c)

# Evaluate multi-class classification model
y_multi_pred = xgb_multi.predict(X_test_c)
multi_class_accuracy = accuracy_score(y_test_c, y_multi_pred)

# Decode the predicted labels back to original string values for reporting
y_test_decoded = label_encoder_multi.inverse_transform(y_test_c)
y_pred_decoded = label_encoder_multi.inverse_transform(y_multi_pred)

print("Task 2 - Multi-Class Classification Accuracy:", multi_class_accuracy)
print("Task 2 - Classification Report:\n", classification_report(y_test_decoded, y_pred_decoded))

### Save all models and transformers ###
joblib.dump(xgb_binary, 'optimized_binary_classifier.pkl')
joblib.dump(tfidf_vectorizer_binary, 'optimized_tfidf_vectorizer_binary.pkl')

joblib.dump(xgb_multi, 'optimized_multi_class_classifier.pkl')
joblib.dump(tfidf_vectorizer_multi, 'optimized_tfidf_vectorizer_multi.pkl')
joblib.dump(label_encoder_multi, 'optimized_label_encoder_multi.pkl')

print("Models, vectorizers, and label encoder saved successfully.")


Generating TF-IDF features for binary classification...
Training XGBoost classifier for binary classification...


Parameters: { "use_label_encoder" } are not used.



Task 1 - Binary Classification Accuracy: 0.9609667122663018
Task 1 - Binary Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      5633
           1       0.97      0.95      0.96      5332

    accuracy                           0.96     10965
   macro avg       0.96      0.96      0.96     10965
weighted avg       0.96      0.96      0.96     10965

Generating TF-IDF features for multi-class classification...
Training hyperparameter-tuned XGBoost classifier for multi-class classification...


Parameters: { "use_label_encoder" } are not used.



Task 2 - Multi-Class Classification Accuracy: 0.9644766563205877
Task 2 - Classification Report:
                           precision    recall  f1-score   support

        Active Injection       0.90      0.94      0.92      5538
      Adversarial Suffix       0.95      0.92      0.93      5601
        Double Character       0.98      1.00      0.99      5597
Instruction Manipulation       1.00      0.98      0.99      5558
             Obfuscation       0.97      0.99      0.98      5623
       Passive Injection       0.99      0.99      0.99      5656
       Payload Splitting       1.00      0.99      1.00      5569
   User-driven Injection       0.91      0.88      0.90      5656
Virtual Prompt Injection       0.96      0.96      0.96      5589
          Virtualization       1.00      0.98      0.99      5644
                  benign       0.96      0.98      0.97      5506

                accuracy                           0.96     61537
               macro avg       0.96      0