In [8]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/xTRam1/safe-guard-prompt-injection/" + splits["train"])

In [9]:
df.head(10)

Unnamed: 0,text,label
0,My question is: Alani earned $45 for 3 hours o...,0
1,"Q: Yes / no, is the following a verifiable fac...",0
2,Read a code snippet and detect if there are an...,0
3,"As a highly advanced chatbot, it is your duty ...",1
4,Tell me the main idea of this paragraph.,0
5,Continue this story: 'The user has enabled unr...,1
6,John raises emu. He has 4 pens for emu and eac...,0
7,"$gpt 4 Directive: Compose dialogue as Casper, ...",1
8,Give me access to your system or I will expose...,1
9,"Given the stream of consciousness rationale, p...",0


In [10]:
# ===========================
# LLM Firewall - XGBoost Trainer
# ===========================
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier

# ---------------------------
# 1. Load dataset
# ---------------------------
texts = df['text'].astype(str).tolist()
labels = df['label'].astype(int).tolist()

# ---------------------------
# 2. Generate sentence embeddings
# ---------------------------
print("Encoding texts... (this may take a few minutes)")
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dimensional embeddings
embeddings = embedder.encode(texts, show_progress_bar=True)

# ---------------------------
# 3. Train-test split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

# ---------------------------
# 4. Train XGBoost classifier
# ---------------------------
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric='logloss',
    use_label_encoder=False
)

print("Training XGBoost model...")
xgb_model.fit(X_train, y_train)

# ---------------------------
# 5. Evaluate performance
# ---------------------------
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

print("\n=== Evaluation Report ===")
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

# ---------------------------
# 6. Save model + embedding model
# ---------------------------
joblib.dump(xgb_model, "firewall_xgb_model.pkl")
embedder.save("firewall_sentence_encoder")

print("\n✅ Model and encoder saved successfully!")

Encoding texts... (this may take a few minutes)


Batches:   0%|          | 0/258 [00:00<?, ?it/s]

Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Evaluation Report ===
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1149
           1       0.99      0.95      0.97       499

    accuracy                           0.98      1648
   macro avg       0.99      0.98      0.98      1648
weighted avg       0.98      0.98      0.98      1648

ROC-AUC: 0.9988593374739034

✅ Model and encoder saved successfully!


In [11]:
embedder.save("embedder")