# **Sexual Predator Identification BGE Embedder**

### 0.1. Import libraries.

In [1]:
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
import html
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

### 0.2. Load data: predator IDs and chats.

In [2]:
# Predator IDs: one author ID per line.
# IDs are stripped and blank lines skipped so that they compare equal to the
# stripped author IDs taken from the XML, and so that an empty string can never
# end up in the predator set.
with open("pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt", encoding="utf-8") as f:
    predator_ids_train = {line.strip() for line in f.read().splitlines() if line.strip()}

# Full training corpus; the root element holds the <conversation> nodes.
tree_train = ET.parse("pan12-sexual-predator-identification-training-corpus-2012-05-01.xml")
root_train = tree_train.getroot()

### 1. **Filtering stage.**

Conversations with only one participant, or in which any participant has fewer than six interventions, are discarded. Before these checks, messages longer than 20 characters that contain unrecognized characters (likely images) are dropped.

In [3]:
# Anything outside this whitelist (letters, digits, Spanish accents, common
# punctuation and whitespace) counts as an unrecognized character.
junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")

conversations_clean = []

for conversation in root_train.findall("conversation"):
    msgs_by_author = defaultdict(list)  # author id -> kept messages, in order
    kept_texts = []                     # every kept message, in conversation order

    for message in conversation.findall("message"):
        author_node = message.find("author")
        text_node = message.find("text")

        # A message is usable only if both its author and its text are present.
        has_author = author_node is not None and author_node.text is not None
        has_text = text_node is not None and text_node.text is not None
        if not (has_author and has_text):
            continue

        # Substitute HTML entities (&amp;, &lt;, &gt;, ...) with their characters.
        text = html.unescape(text_node.text.strip())

        # Drop messages longer than 20 characters that contain any unrecognized character.
        if len(text) > 20 and junk_pattern.search(text):
            continue

        msgs_by_author[author_node.text.strip()].append(text)
        kept_texts.append(text)

    # Need at least two participants, each with six or more kept messages.
    enough_participants = len(msgs_by_author) > 1
    if not enough_participants or min(len(msgs) for msgs in msgs_by_author.values()) < 6:
        continue

    # The conversation passed every filter: keep it.
    conversations_clean.append({
        "conversation_id": conversation.get("id"),
        "authors": list(msgs_by_author),
        "text": " ".join(kept_texts),
        "messages_by_author": dict(msgs_by_author)
    })

### 2. **Labelling data.**

Label all chat conversations as suspicious if they involve at least one predator (SCI task). 

For these suspicious conversations, separate and label the interventions as predator or victim messages (VFP task).


In [4]:
conversations = []  # one {"text", "label"} record per filtered conversation (SCI task)
interventions = []  # one {"intervention", "label"} record per author of a suspicious conversation (VFP task)

for convo in conversations_clean:
    # A conversation is suspicious (label 1) when at least one participant is a known predator.
    has_predator = not predator_ids_train.isdisjoint(convo["authors"])
    conversations.append({
        "text": convo["text"],
        "label": int(has_predator)
    })

    if not has_predator:
        continue

    # Suspicious conversation: one record per author, concatenating all of
    # that author's messages, labelled 1 for a predator and 0 otherwise.
    interventions.extend(
        {
            "intervention": " ".join(msgs),
            "label": int(author in predator_ids_train)
        }
        for author, msgs in convo["messages_by_author"].items()
    )

df_conversations = pd.DataFrame(conversations)
df_interventions = pd.DataFrame(interventions)

### 3. **Suspicious Conversations Identification (SCI) stage.**

In [5]:
# Sentence-embedding model loaded from a local directory.
# NOTE(review): trust_remote_code=True permits running custom code bundled with
# the checkpoint; confirm it is actually required for this model.
EMBEDDER_PATH = "./bge-base-en"
embedder = SentenceTransformer(EMBEDDER_PATH, trust_remote_code=True)

In [6]:
n_splits = 10
accuracies, precisions, recalls, f1s = [], [], [], []

# The embedder is only used for inference here, so a text's embedding does not
# depend on which split it lands in. Encode the corpus once instead of
# re-encoding the train and test portions on every iteration.
X_convo_embed = embedder.encode(df_conversations["text"].tolist())
y_convo = df_conversations["label"]

for seed in range(n_splits):
    # Splitting the embedded rows with the same seed and stratification labels
    # yields the same partition as splitting the raw texts.
    X_train_embed, X_test_embed, y_train, y_test = train_test_split(
        X_convo_embed, y_convo,
        test_size=0.2, random_state=seed, stratify=y_convo
    )

    clf = MLPClassifier(hidden_layer_sizes=(10,), max_iter=500, random_state=42)
    clf.fit(X_train_embed, y_train)
    y_pred = clf.predict(X_test_embed)

    # Macro averages weight both classes equally.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(report["macro avg"]["precision"])
    recalls.append(report["macro avg"]["recall"])
    f1s.append(report["macro avg"]["f1-score"])

# Mean and standard deviation of each metric over the n_splits random splits.
metrics = {
    "accuracy_mean": np.mean(accuracies),
    "accuracy_std": np.std(accuracies),
    "precision_mean": np.mean(precisions),
    "precision_std": np.std(precisions),
    "recall_mean": np.mean(recalls),
    "recall_std": np.std(recalls),
    "f1_mean": np.mean(f1s),
    "f1_std": np.std(f1s),
}

# NOTE(review): the header below says "SIC" and "BGE-large", while this section
# is the SCI task and the embedder is loaded from "./bge-base-en" — confirm which
# is intended. The string is left unchanged to match the recorded output.
print("SIC: BGE-large + MLPClassifier:\n")
print(f"Accuracy: {metrics['accuracy_mean']:.4f} ± {metrics['accuracy_std']:.4f}")
print(f"Precision (macro): {metrics['precision_mean']:.4f} ± {metrics['precision_std']:.4f}")
print(f"Recall (macro): {metrics['recall_mean']:.4f} ± {metrics['recall_std']:.4f}")
print(f"F1-score (macro): {metrics['f1_mean']:.4f} ± {metrics['f1_std']:.4f}")

SIC: BGE-large + MLPClassifier:

Accuracy: 0.9904 ± 0.0022
Precision (macro): 0.9778 ± 0.0092
Recall (macro): 0.9706 ± 0.0079
F1-score (macro): 0.9740 ± 0.0060


Using TF-IDF we obtained:

Accuracy: 0.9945 ± 0.0022

Precision (macro): 0.9895 ± 0.0046

Recall (macro): 0.9810 ± 0.0095

F1-score (macro): 0.9851 ± 0.0060

### 4. **Victim From Predator disclosure (VFP) stage.**

In [7]:
n_splits = 10
accuracies, precisions, recalls, f1s = [], [], [], []

# The embedder is only used for inference here, so a text's embedding does not
# depend on which split it lands in. Encode every intervention once instead of
# re-encoding the train and test portions on every iteration.
X_interv_embed = embedder.encode(df_interventions["intervention"].tolist())
y_interv = df_interventions["label"]

for seed in range(n_splits):
    # Splitting the embedded rows with the same seed and stratification labels
    # yields the same partition as splitting the raw texts.
    X_train_embed, X_test_embed, y_train, y_test = train_test_split(
        X_interv_embed, y_interv,
        test_size=0.2, random_state=seed, stratify=y_interv
    )

    clf = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)
    clf.fit(X_train_embed, y_train)
    y_pred = clf.predict(X_test_embed)

    # Macro averages weight both classes equally.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(report["macro avg"]["precision"])
    recalls.append(report["macro avg"]["recall"])
    f1s.append(report["macro avg"]["f1-score"])

# Mean and standard deviation of each metric over the n_splits random splits.
metrics = {
    "accuracy_mean": np.mean(accuracies),
    "accuracy_std": np.std(accuracies),
    "precision_mean": np.mean(precisions),
    "precision_std": np.std(precisions),
    "recall_mean": np.mean(recalls),
    "recall_std": np.std(recalls),
    "f1_mean": np.mean(f1s),
    "f1_std": np.std(f1s),
}

# NOTE(review): the header below says "BGE-large", while the embedder is loaded
# from "./bge-base-en" — confirm which is intended. The string is left unchanged
# to match the recorded output.
print("VFP: BGE-large + MLPClassifier:\n")
print(f"Accuracy: {metrics['accuracy_mean']:.4f} ± {metrics['accuracy_std']:.4f}")
print(f"Precision (macro): {metrics['precision_mean']:.4f} ± {metrics['precision_std']:.4f}")
print(f"Recall (macro): {metrics['recall_mean']:.4f} ± {metrics['recall_std']:.4f}")
print(f"F1-score (macro): {metrics['f1_mean']:.4f} ± {metrics['f1_std']:.4f}")

VFP: BGE-large + MLPClassifier:

Accuracy: 0.8790 ± 0.0118
Precision (macro): 0.8798 ± 0.0119
Recall (macro): 0.8790 ± 0.0118
F1-score (macro): 0.8790 ± 0.0118


Using TF-IDF we obtained:

Accuracy: 0.9446 ± 0.0089

Precision (macro): 0.9451 ± 0.0089

Recall (macro): 0.9446 ± 0.0089

F1-score (macro): 0.9446 ± 0.0089

### 5. **Evaluate test data**


In [8]:
# Ground-truth predator IDs for the test corpus: one author ID per line.
# IDs are stripped and blank lines skipped so that they compare equal to the
# stripped author IDs taken from the XML, and so that an empty string can never
# end up in the predator set.
with open("pan12-sexual-predator-identification-groundtruth-problem1.txt", encoding="utf-8") as f:
    predator_ids_test = {line.strip() for line in f.read().splitlines() if line.strip()}

# Full test corpus; the root element holds the <conversation> nodes.
tree_test = ET.parse("pan12-sexual-predator-identification-test-corpus-2012-05-17.xml")
root_test = tree_test.getroot()

In [9]:
class PredatorDetectionPipeline:
    """Two-stage predator detection over an XML chat corpus.

    Stage 1 classifies each filtered conversation as suspicious or not.
    Stage 2 runs only on conversations flagged as suspicious: every participant
    is scored and the one with the highest predator probability is reported.

    The XML root is expected to have ``<conversation>`` children, each holding
    ``<message>`` elements with ``<author>`` and ``<text>`` children.

    NOTE(review): ``run_pipeline`` labels and evaluates only the conversations
    that survive ``_filter``; discarded conversations are not counted in any metric.
    """

    def __init__(self, conversation_embedder, conversation_classifier, author_embedder=None, author_classifier=None):
        """Store the models used by both stages.

        Args:
            conversation_embedder: object with ``encode(list_of_texts)``.
            conversation_classifier: object with ``predict(embeddings)``.
            author_embedder: object with ``encode(list_of_texts)``; defaults to
                ``conversation_embedder`` when omitted.
            author_classifier: object with ``predict_proba(embeddings)``; defaults
                to ``conversation_classifier`` when omitted.
        """
        self.conversation_embedder = conversation_embedder
        self.conversation_classifier = conversation_classifier
        # Compare against None explicitly: model containers may define __len__ or
        # __bool__, and a truthiness test would silently swap a supplied-but-falsy
        # model for the conversation-level one.
        self.author_embedder = conversation_embedder if author_embedder is None else author_embedder
        self.author_classifier = conversation_classifier if author_classifier is None else author_classifier
        # Anything outside this whitelist counts as an unrecognized character.
        self.junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")

    def _filter(self, root):
        """Return the conversations under ``root`` that pass the filters.

        Messages without an author or text are skipped, as are messages longer
        than 20 characters that contain an unrecognized character. A conversation
        is kept only if it then has at least two authors, each with six or more
        remaining messages.

        Returns:
            A list of dicts with keys ``conversation_id``, ``authors``, ``text``
            (all kept messages joined by spaces), ``messages_by_author`` and
            ``xml`` (the original conversation element).
        """
        filtered_conversations = []
        for conversation in root.findall("conversation"):
            authors = defaultdict(list)
            all_texts = []

            for message in conversation.findall("message"):
                author_el = message.find("author")
                text_el = message.find("text")

                if author_el is None or author_el.text is None:
                    continue
                if text_el is None or text_el.text is None:
                    continue

                author_id = author_el.text.strip()
                # Replace HTML entities (&amp;, &lt;, ...) with their characters.
                text = html.unescape(text_el.text.strip())

                if len(text) > 20 and self.junk_pattern.search(text):
                    continue

                authors[author_id].append(text)
                all_texts.append(text)

            # Conversations with a single participant (or none) are discarded.
            if len(authors) <= 1:
                continue
            # Every participant needs at least six kept messages.
            if any(len(msgs) < 6 for msgs in authors.values()):
                continue

            filtered_conversations.append({
                "conversation_id": conversation.get("id"),
                "authors": list(authors.keys()),
                "text": " ".join(all_texts),
                "messages_by_author": dict(authors),
                "xml": conversation
            })

        return filtered_conversations

    def _true_labels(self, conversations, predator_ids):
        """Build the ground-truth record for each filtered conversation.

        A conversation is suspicious when any of its authors is in
        ``predator_ids``. If several authors match, only the first one, in
        author order, is recorded as the predator.
        """
        true_labels = []
        for convo in conversations:
            predators_present = [author for author in convo["authors"] if author in predator_ids]
            true_labels.append({
                "conversation_id": convo["conversation_id"],
                "suspicious": bool(predators_present),
                "predator": predators_present[0] if predators_present else None
            })

        return true_labels

    def _predict(self, conversation_data):
        """Predict whether one conversation is suspicious and, if so, who the predator is.

        Returns:
            A dict with ``conversation_id``, ``suspicious`` (bool) and ``predator``
            (an author id, or None when the conversation is not flagged).
        """
        text = conversation_data["text"]
        embedded_text = self.conversation_embedder.encode([text])
        is_suspicious = self.conversation_classifier.predict(embedded_text)[0]

        if not is_suspicious:
            return {
                "conversation_id": conversation_data["conversation_id"],
                "suspicious": False,
                "predator": None
            }

        # Score each participant on the concatenation of their own messages.
        author_probs = []
        for author_id, msgs in conversation_data["messages_by_author"].items():
            joined_msgs = " ".join(msgs)
            author_embedding = self.author_embedder.encode([joined_msgs])
            # Column 1 is taken as the predator probability; this assumes the author
            # classifier was trained with labels {0, 1} — confirm if labels change.
            score = self.author_classifier.predict_proba(author_embedding)[0][1]
            author_probs.append((author_id, score))

        # The highest-scoring participant is reported as the predator.
        predator_id = max(author_probs, key=lambda x: x[1])[0]

        return {
            "conversation_id": conversation_data["conversation_id"],
            "suspicious": True,
            "predator": predator_id
        }

    def run_pipeline(self, root, predator_ids):
        """Filter, label, predict and evaluate the conversations under ``root``.

        Prints conversation-level accuracy, precision, recall and F1 for the
        suspicious class, and returns them with the raw records.

        Returns:
            A dict with keys ``conversation_metrics``, ``predictions`` and
            ``true_labels``.
        """
        # 1. Filter conversations
        conversations = self._filter(root)

        # 2. Generate true labels
        true_labels = self._true_labels(conversations, predator_ids)

        # 3. Predict with the models (one conversation at a time)
        predictions = [self._predict(convo) for convo in conversations]

        # 4. Compare true labels vs. predictions (conversation level only)
        y_true = [record["suspicious"] for record in true_labels]
        y_pred = [record["suspicious"] for record in predictions]

        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print("Model evaluation:")
        print(f"Accuracy : {acc:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall   : {recall:.4f}")
        print(f"F1 Score : {f1:.4f}")

        return {
            "conversation_metrics": {
                "accuracy": acc,
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "predictions": predictions,
            "true_labels": true_labels
        }

In [11]:
# Conversation-level (SCI) classifier, fitted on every filtered training conversation.
X_train_embed = embedder.encode(df_conversations["text"].tolist())
clf_convo = MLPClassifier(hidden_layer_sizes=(10,), max_iter=500, random_state=42).fit(
    X_train_embed, df_conversations["label"]
)

# Author-level (VFP) classifier, fitted on every intervention from suspicious conversations.
X_author_embed = embedder.encode(df_interventions["intervention"].tolist())
clf_author = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42).fit(
    X_author_embed, df_interventions["label"]
)

# Both stages share the same embedder; evaluate the full pipeline on the test corpus.
pipeline = PredatorDetectionPipeline(
    embedder,
    clf_convo,
    author_embedder=embedder,
    author_classifier=clf_author,
)

results = pipeline.run_pipeline(root_test, predator_ids_test)

Model evaluation:
Accuracy : 0.9935
Precision: 0.9757
Recall   : 0.9447
F1 Score : 0.9599


Using TF-IDF we obtained:


Accuracy : 0.9930

Precision: 0.9972

Recall   : 0.9180

F1 Score : 0.9560


The original paper reported a precision of 0.9804, a recall of 0.7874, and an F1 score of 0.8734.