# **Sexual Predator Identification Baseline**

### 0.1. Import libraries.

In [1]:
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
import html
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.base import clone

### 0.2. Load data: predator IDs and chats.

In [2]:
# Each line of the predators file is treated as one predator author ID.
with open("pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt") as predators_file:
    predator_lines = predators_file.read().splitlines()
predator_ids_train = set(predator_lines)

# Parse the training chat corpus; later cells iterate over the document root.
tree_train = ET.parse("pan12-sexual-predator-identification-training-corpus-2012-05-01.xml")
root_train = tree_train.getroot()

### 1. **Filtering stage.**

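Conversations with only one participant, or in which any user has fewer than six interventions, were discarded. Before these conversation-level checks, individual messages longer than 20 characters that contain unrecognized characters (a heuristic for long sequences of unrecognized characters, likely images) were dropped.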

In [3]:
# Bookkeeping for the filtering pass.
total_conversations = 0
discarded_messages = []                       # messages rejected by the junk check
conversations_clean = []                      # conversations that survive every filter
predators_in_filtered_conversations = set()   # predator IDs still present after filtering

# Matches one or more consecutive characters outside the accepted set
# (ASCII letters, digits, accented vowels, ñ/ü, whitespace and common punctuation).
junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")

for conversation in root_train.findall("conversation"):
    total_conversations += 1
    authors = defaultdict(list)   # author ID -> kept messages, in order of appearance
    all_texts = []                # every kept message of this conversation, in order

    for message in conversation.findall("message"):
        author_node = message.find("author")
        text_node = message.find("text")

        # Ignore messages that lack an author or a text payload.
        author_missing = author_node is None or author_node.text is None
        text_missing = text_node is None or text_node.text is None
        if author_missing or text_missing:
            continue

        author_id = author_node.text.strip()
        # Strip first, then turn HTML entities (&amp;, &lt;, &gt;, ...) into characters.
        text = html.unescape(text_node.text.strip())

        # Junk check for messages longer than 20 characters.
        # NOTE(review): the pattern matches on a single character outside the
        # accepted set, so ordinary long messages containing e.g. '&' or '='
        # are dropped too (see the discarded examples printed by the next
        # cell) -- confirm this is the intended behaviour.
        if len(text) > 20 and junk_pattern.search(text):
            discarded_messages.append(text)
        else:
            authors[author_id].append(text)
            all_texts.append(text)

    # Conversation-level filters: at least two participants, and every
    # participant must have at least six kept messages.
    if len(authors) <= 1 or min(map(len, authors.values())) < 6:
        continue

    conversations_clean.append({
        "conversation_id": conversation.get("id"),
        "authors": list(authors),
        "text": " ".join(all_texts),
        "messages_by_author": dict(authors)
    })

    # Record which known predators take part in a kept conversation.
    predators_in_filtered_conversations.update(
        author_id for author_id in authors if author_id in predator_ids_train
    )

In [4]:
# Compare the corpus before and after filtering (conversation and predator
# counts), then show up to 100 of the messages rejected by the junk check.
summary_lines = [
    f"Total conversations: {total_conversations}",
    f"Valid conversations after filtering: {len(conversations_clean)}",
    f"Predators in the original list: {len(predator_ids_train)}",
    f"Predators appearing in filtered conversations: {len(predators_in_filtered_conversations)}",
]
for line in summary_lines:
    print(line)

print("\nSome examples of discarded messages:")
for position, sample in enumerate(discarded_messages[:100], start=1):
    print(f"{position:>2}. {sample}")

Total conversations: 66927
Valid conversations after filtering: 8210
Predators in the original list: 142
Predators appearing in filtered conversations: 137

Some examples of discarded messages:
 1. Cam = not such good conversation.
 2. Well, even though I'm glad to chat w/ ya & you're quite...probably better I let you connect w/ someone a bit younger, eh?
 3. sets mode: +oo tantek ChanServ
 4. sets mode: +o ChanServ
 5. for t = 1 : T-1
    w(t) = (wi-wf)*(T-t)/T + wf ;
    for i = 1:N
        v_a0(i,t+1) = w(t)*v_a0(t) + p1*r1*(x(lbest)-x(i)) + p2*r2*(x(gbest)-x(i)) ;
        p_a0(i,t+1) = p_a0(i,t) + v_a0(i,t+1);
        v_a1(i,t+1) = w(t)*v_a1(t) + p1*r1*(x(lbest)-x(i)) + p2*r2*(x(gbest)-x(i)) ;
        p_a1(i,t+1) = p_a1(i,t) + v_a1(i,t+1);
        v_a2(i,t+1) = w(t)*v_a2(t) + p1*r1*(x(lbest)-x(i)) + p2*r2*(x(gbest)-x(i)) ;
        p_a2(i,t+1) = p_a2(i,t) + v_a2(i,t+1);
        v_a3(i,t+1) = w(t)*v_a3(t) + p1*r1*(x(lbest)-x(i)) + p2*r2*(x(gbest)-x(i)) ;
        p_a3(i,t+1) = p_a3(i,

### 2. **Labelling data.**

Label all chat conversations as suspicious if they involve at least one predator (SCI task). 

For these suspicious conversations, separate and label the interventions as predator or victim messages (VFP task).


In [5]:
conversations = []  # one record per kept conversation: full text + suspicious label
interventions = []  # one record per author of a suspicious conversation: text + predator label

for convo in conversations_clean:
    # A conversation is suspicious (label 1) when any participant is a known predator.
    has_predator = not predator_ids_train.isdisjoint(convo["authors"])
    label = 1 if has_predator else 0

    conversations.append({"text": convo["text"], "label": label})

    # Only suspicious conversations contribute interventions.
    if not has_predator:
        continue

    # Each author's messages are concatenated into a single intervention,
    # labelled 1 for predators and 0 for everyone else.
    interventions.extend(
        {
            "intervention": " ".join(msgs),
            "label": 1 if author in predator_ids_train else 0
        }
        for author, msgs in convo["messages_by_author"].items()
    )

df_conversations = pd.DataFrame(conversations)
df_interventions = pd.DataFrame(interventions)

# Class balance of the conversation-level dataset.
label_counts = df_conversations["label"].value_counts()
print(f"Conversations with predators (label=1): {label_counts.get(1, 0)}")
print(f"Conversations without predators (label=0): {label_counts.get(0, 0)}")

Conversations with predators (label=1): 856
Conversations without predators (label=0): 7354


### 3. **Suspicious Conversations Identification (SCI) stage.**

#### Linear SVM, Polynomial SVM, NN (one hidden layer of 10 units)

Every model was trained using a Bag-of-Words (BoW) representation, first with boolean weighting and then with a TF-IDF weighting scheme.

In [6]:
def _sci_pipeline(vectorizer, classifier):
    """Chain a text vectorizer and a classifier under the step names 'vectorizer' and 'classifier'."""
    return Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])


# Candidate models for the SCI task: three classifiers, each paired with a
# boolean bag-of-words and a TF-IDF representation.
models_SCI = {
    "LinearSVC BoW Boolean": _sci_pipeline(CountVectorizer(binary=True), LinearSVC(dual='auto')),
    "LinearSVC TF-IDF": _sci_pipeline(TfidfVectorizer(), LinearSVC(dual='auto')),
    "PolySVC BoW Boolean": _sci_pipeline(CountVectorizer(binary=True), SVC(kernel='poly', degree=2, C=1.0)),
    "PolySVC TF-IDF": _sci_pipeline(TfidfVectorizer(), SVC(kernel='poly', degree=2, C=1.0)),
    "MLP BoW Boolean": _sci_pipeline(
        CountVectorizer(binary=True),
        MLPClassifier(hidden_layer_sizes=(10,), max_iter=500, random_state=42),
    ),
    "MLP TF-IDF": _sci_pipeline(
        TfidfVectorizer(),
        MLPClassifier(hidden_layer_sizes=(10,), max_iter=500, random_state=42),
    ),
}

n_splits = 20
results = {}

print("SCI Task:\n")

sci_texts = df_conversations["text"]
sci_labels = df_conversations["label"]

for model_name, template in models_SCI.items():
    scores = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    # Repeated stratified 80/20 hold-out; the seed doubles as the split index,
    # so every model is evaluated on the same sequence of splits.
    for seed in range(n_splits):
        X_train, X_test, y_train, y_test = train_test_split(
            sci_texts, sci_labels,
            test_size=0.2,
            random_state=seed,
            stratify=sci_labels
        )

        # Fit a fresh copy so the template in models_SCI stays unfitted.
        y_pred = clone(template).fit(X_train, y_train).predict(X_test)

        macro_avg = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )['macro avg']

        scores["accuracy"].append(accuracy_score(y_test, y_pred))
        scores["precision"].append(macro_avg['precision'])
        scores["recall"].append(macro_avg['recall'])
        scores["f1"].append(macro_avg['f1-score'])

    # Mean and (population) standard deviation across the splits.
    results[model_name] = {
        "accuracy_mean": np.mean(scores["accuracy"]),
        "accuracy_std": np.std(scores["accuracy"]),
        "precision_mean": np.mean(scores["precision"]),
        "precision_std": np.std(scores["precision"]),
        "recall_mean": np.mean(scores["recall"]),
        "recall_std": np.std(scores["recall"]),
        "f1_mean": np.mean(scores["f1"]),
        "f1_std": np.std(scores["f1"])
    }

for model_name, metrics in results.items():
    print(
        f"\nModel: {model_name}\n"
        f"Accuracy: {metrics['accuracy_mean']:.4f} ± {metrics['accuracy_std']:.4f}\n"
        f"Precision (macro): {metrics['precision_mean']:.4f} ± {metrics['precision_std']:.4f}\n"
        f"Recall (macro): {metrics['recall_mean']:.4f} ± {metrics['recall_std']:.4f}\n"
        f"F1-score (macro): {metrics['f1_mean']:.4f} ± {metrics['f1_std']:.4f}"
    )


SCI Task:


Model: LinearSVC BoW Boolean
Accuracy: 0.9894 ± 0.0018
Precision (macro): 0.9784 ± 0.0063
Recall (macro): 0.9644 ± 0.0081
F1-score (macro): 0.9712 ± 0.0050

Model: LinearSVC TF-IDF
Accuracy: 0.9927 ± 0.0020
Precision (macro): 0.9870 ± 0.0053
Recall (macro): 0.9736 ± 0.0083
F1-score (macro): 0.9801 ± 0.0056

Model: PolySVC BoW Boolean
Accuracy: 0.9753 ± 0.0025
Precision (macro): 0.9727 ± 0.0079
Recall (macro): 0.8924 ± 0.0119
F1-score (macro): 0.9276 ± 0.0079

Model: PolySVC TF-IDF
Accuracy: 0.9833 ± 0.0024
Precision (macro): 0.9816 ± 0.0059
Recall (macro): 0.9277 ± 0.0106
F1-score (macro): 0.9525 ± 0.0072

Model: MLP BoW Boolean
Accuracy: 0.9892 ± 0.0062
Precision (macro): 0.9832 ± 0.0069
Recall (macro): 0.9580 ± 0.0294
F1-score (macro): 0.9697 ± 0.0185

Model: MLP TF-IDF
Accuracy: 0.9945 ± 0.0022
Precision (macro): 0.9895 ± 0.0046
Recall (macro): 0.9810 ± 0.0095
F1-score (macro): 0.9851 ± 0.0060


The higher the polynomial degree, the worse the performance.

Additionally, reducing the number of features (i.e., BoW dimensionality) negatively impacts performance. This was confirmed both through our own experiments—where limiting the number of features progressively led to lower accuracy—and in the original paper, where the same pattern was observed. Without dimensionality reduction, 46137 features were used.

In the original paper, the best model for this task was a TF-IDF Linear SVM without dimensionality reduction, achieving an accuracy of 0.9883. Their Binary NN model without dimensionality reduction reached 0.9874 accuracy. In our case, the **TF-IDF NN without dimensionality reduction**  was the best model achieving an accuracy of 0.9945.

### 4. **Victim From Predator disclosure (VFP) stage.**

#### Linear SVM, Polynomial SVM, NN (one hidden layer of 10 units)

Every model was trained using a Bag-of-Words (BoW) representation, first with boolean weighting and then with a TF-IDF weighting scheme.

In [7]:
# Candidate models for the VFP task. The TF-IDF variants cap the vocabulary at
# 5000 features; the boolean bag-of-words variants use the full vocabulary.
models_VPF = {
    "LinearSVC BoW Boolean": Pipeline(steps=[
        ("vectorizer", CountVectorizer(binary=True)),
        ("classifier", LinearSVC(dual="auto")),
    ]),
    "LinearSVC TF-IDF": Pipeline(steps=[
        ("vectorizer", TfidfVectorizer(max_features=5000)),
        ("classifier", LinearSVC(dual="auto")),
    ]),
    "PolySVC BoW Boolean": Pipeline(steps=[
        ("vectorizer", CountVectorizer(binary=True)),
        ("classifier", SVC(kernel="poly", degree=2, C=1.0)),
    ]),
    "PolySVC TF-IDF": Pipeline(steps=[
        ("vectorizer", TfidfVectorizer(max_features=5000)),
        ("classifier", SVC(kernel="poly", degree=2, C=1.0)),
    ]),
    "MLP BoW Boolean": Pipeline(steps=[
        ("vectorizer", CountVectorizer(binary=True)),
        ("classifier", MLPClassifier(hidden_layer_sizes=(10,), max_iter=500, random_state=42)),
    ]),
    "MLP TF-IDF": Pipeline(steps=[
        ("vectorizer", TfidfVectorizer(max_features=5000)),
        ("classifier", MLPClassifier(hidden_layer_sizes=(10,), max_iter=500, random_state=42)),
    ]),
}

n_splits = 20
results = {}

print("VFP Task:\n")

for model_name, candidate in models_VPF.items():
    # One (accuracy, precision, recall, f1) tuple per hold-out split.
    per_split = []

    for seed in range(n_splits):
        # Stratified 80/20 split; the seed makes each split reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            df_interventions["intervention"], df_interventions["label"],
            test_size=0.2,
            random_state=seed,
            stratify=df_interventions["label"]
        )

        # Work on an unfitted copy so the entry in models_VPF is left untouched.
        fitted = clone(candidate)
        fitted.fit(X_train, y_train)
        y_pred = fitted.predict(X_test)

        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        macro = report["macro avg"]
        per_split.append((
            accuracy_score(y_test, y_pred),
            macro["precision"],
            macro["recall"],
            macro["f1-score"],
        ))

    # Transpose the per-split tuples into one list per metric.
    accuracies, precisions, recalls, f1s = (list(column) for column in zip(*per_split))

    results[model_name] = {
        "accuracy_mean": np.mean(accuracies),
        "accuracy_std": np.std(accuracies),
        "precision_mean": np.mean(precisions),
        "precision_std": np.std(precisions),
        "recall_mean": np.mean(recalls),
        "recall_std": np.std(recalls),
        "f1_mean": np.mean(f1s),
        "f1_std": np.std(f1s)
    }

# (printed label, key prefix in the results dict) for each reported metric.
report_rows = (
    ("Accuracy", "accuracy"),
    ("Precision (macro)", "precision"),
    ("Recall (macro)", "recall"),
    ("F1-score (macro)", "f1"),
)
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")
    for title, prefix in report_rows:
        print(f"{title}: {metrics[prefix + '_mean']:.4f} ± {metrics[prefix + '_std']:.4f}")


VFP Task:


Model: LinearSVC BoW Boolean
Accuracy: 0.9064 ± 0.0133
Precision (macro): 0.9070 ± 0.0134
Recall (macro): 0.9064 ± 0.0133
F1-score (macro): 0.9064 ± 0.0133

Model: LinearSVC TF-IDF
Accuracy: 0.9394 ± 0.0114
Precision (macro): 0.9397 ± 0.0114
Recall (macro): 0.9394 ± 0.0114
F1-score (macro): 0.9393 ± 0.0114

Model: PolySVC BoW Boolean
Accuracy: 0.8491 ± 0.0166
Precision (macro): 0.8661 ± 0.0147
Recall (macro): 0.8492 ± 0.0166
F1-score (macro): 0.8473 ± 0.0172

Model: PolySVC TF-IDF
Accuracy: 0.9369 ± 0.0111
Precision (macro): 0.9372 ± 0.0110
Recall (macro): 0.9369 ± 0.0111
F1-score (macro): 0.9369 ± 0.0111

Model: MLP BoW Boolean
Accuracy: 0.9388 ± 0.0095
Precision (macro): 0.9392 ± 0.0096
Recall (macro): 0.9388 ± 0.0095
F1-score (macro): 0.9388 ± 0.0095

Model: MLP TF-IDF
Accuracy: 0.9446 ± 0.0089
Precision (macro): 0.9451 ± 0.0089
Recall (macro): 0.9446 ± 0.0089
F1-score (macro): 0.9446 ± 0.0089


For the **VFP task**, the **best-performing model** was a **NN** using a Bag-of-Words (BoW) representation **with TF-IDF weighting and dimensionality reduction** — from 9610 **features** down to **5000**. As previously observed, increasing the polynomial degree in SVM leads to poorer performance.

In the original paper, the best model for this task was a Boolean NN without dimensionality reduction, achieving an accuracy of 0.9407. In our case, the best model achieved an accuracy of 0.9446.

### 5. **Evaluate test data using the best-performing models:** SCI (NN-TF_IDF) & VFP (NN-TF_IDF-DimensionalityReduction).


In [8]:
# Each line of the ground-truth file is treated as one predator author ID.
with open("pan12-sexual-predator-identification-groundtruth-problem1.txt") as groundtruth_file:
    groundtruth_lines = groundtruth_file.read().splitlines()
predator_ids_test = set(groundtruth_lines)

# Parse the test chat corpus and keep its document root for the evaluation pipeline.
tree_test = ET.parse("pan12-sexual-predator-identification-test-corpus-2012-05-17.xml")
root_test = tree_test.getroot()

In [9]:
class PredatorDetectionPipeline:
    """Filter conversations, flag suspicious ones and name the likely predator.

    The pipeline chains two already-fitted classifiers:

    * ``conversation_model`` decides whether a whole conversation is suspicious.
    * ``author_model`` scores every participant of a suspicious conversation;
      the highest-scoring participant is reported as the predator.

    Args:
        conversation_model: Fitted estimator whose ``predict`` accepts a list
            of conversation texts.
        author_model: Fitted estimator whose ``predict_proba`` accepts a list
            of per-author texts.
        min_messages_per_author: A conversation is discarded when any of its
            authors has fewer kept messages than this. Defaults to 6.
        junk_length_threshold: Messages longer than this many characters are
            dropped when ``junk_pattern`` matches anywhere in them.
            Defaults to 20.
        description: Text printed after ``"Combined model: "`` in the report
            produced by ``run_pipeline``.
    """

    # Header used by default in the printed report.
    DEFAULT_DESCRIPTION = "SCI(NN-TF_IDF)& VFP(NN-TF_IDF-DimensionalityReduction)"

    def __init__(self, conversation_model, author_model,
                 min_messages_per_author=6, junk_length_threshold=20,
                 description=DEFAULT_DESCRIPTION):
        self.conversation_model = conversation_model
        self.author_model = author_model
        self.min_messages_per_author = min_messages_per_author
        self.junk_length_threshold = junk_length_threshold
        self.description = description
        # Matches one or more consecutive characters outside the accepted set.
        self.junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")

    def _filter(self, root):
        """Return the conversations under ``root`` that pass every filter.

        Messages without author or text are skipped, and long messages matched
        by ``junk_pattern`` are dropped. A conversation is kept only when it
        has more than one author and every author has at least
        ``min_messages_per_author`` kept messages.

        Returns:
            A list of dicts with the keys ``conversation_id``, ``authors``,
            ``text`` (all kept messages joined by spaces),
            ``messages_by_author`` and ``xml`` (the source element).
        """
        filtered_conversations = []
        for conversation in root.findall("conversation"):
            authors = defaultdict(list)
            all_texts = []

            for message in conversation.findall("message"):
                author_el = message.find("author")
                text_el = message.find("text")

                if author_el is None or author_el.text is None:
                    continue
                if text_el is None or text_el.text is None:
                    continue

                author_id = author_el.text.strip()
                text = html.unescape(text_el.text.strip())

                # NOTE(review): a single character outside the accepted set is
                # enough to drop a long message -- confirm this is intended.
                if len(text) > self.junk_length_threshold and self.junk_pattern.search(text):
                    continue

                authors[author_id].append(text)
                all_texts.append(text)

            if len(authors) <= 1:
                continue
            if any(len(msgs) < self.min_messages_per_author for msgs in authors.values()):
                continue

            filtered_conversations.append({
                "conversation_id": conversation.get("id"),
                "authors": list(authors.keys()),
                "text": " ".join(all_texts),
                "messages_by_author": dict(authors),
                "xml": conversation
            })

        return filtered_conversations

    def _true_labels(self, conversations, predator_ids):
        """Build the ground-truth record for each filtered conversation.

        A conversation is suspicious when at least one of its authors is in
        ``predator_ids``; the first such author (in conversation order) is
        stored under ``predator``, otherwise ``predator`` is ``None``.
        """
        true_labels = []
        for convo in conversations:
            predators_in_convo = [author for author in convo["authors"] if author in predator_ids]
            true_labels.append({
                "conversation_id": convo["conversation_id"],
                "suspicious": bool(predators_in_convo),
                "predator": predators_in_convo[0] if predators_in_convo else None
            })

        return true_labels

    def _positive_column(self):
        """Return the ``predict_proba`` column holding the probability of label 1.

        The column is looked up in the author model's ``classes_`` when that
        attribute is available and contains 1; otherwise column 1 is used.
        """
        classes = getattr(self.author_model, "classes_", None)
        if classes is not None:
            classes = list(classes)
            if 1 in classes:
                return classes.index(1)
        return 1

    def _predict(self, conversation_data):
        """Predict whether one filtered conversation is suspicious and who the predator is.

        Returns:
            A dict with ``conversation_id``, ``suspicious`` and ``predator``.
            ``predator`` is ``None`` for non-suspicious conversations and for
            conversations without any author.
        """
        conversation_id = conversation_data["conversation_id"]
        is_suspicious = self.conversation_model.predict([conversation_data["text"]])[0]
        if not is_suspicious:
            return {
                "conversation_id": conversation_id,
                "suspicious": False,
                "predator": None
            }

        column = self._positive_column()
        author_scores = []
        for author_id, msgs in conversation_data["messages_by_author"].items():
            score = self.author_model.predict_proba([" ".join(msgs)])[0][column]
            author_scores.append((author_id, score))

        # max() keeps the first author on ties; guard against an empty author map.
        predator_id = max(author_scores, key=lambda pair: pair[1])[0] if author_scores else None

        return {
            "conversation_id": conversation_id,
            "suspicious": True,
            "predator": predator_id
        }

    def run_pipeline(self, root, predator_ids):
        """Filter, label, predict and print conversation-level metrics.

        The metrics compare the ``suspicious`` flag of the ground truth and of
        the predictions, over the conversations that pass ``_filter`` only.

        Returns:
            A dict with ``conversation_metrics`` (accuracy, precision, recall,
            f1), ``predictions`` and ``true_labels``.
        """
        # 1. Filter conversations
        conversations = self._filter(root)

        # 2. Generate true labels
        true_labels = self._true_labels(conversations, predator_ids)

        # 3. Predict with the models
        predictions = [self._predict(convo) for convo in conversations]

        # 4. Compare true labels vs. predictions
        y_true = [entry["suspicious"] for entry in true_labels]
        y_pred = [entry["suspicious"] for entry in predictions]

        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print(f"Combined model: {self.description}")
        print(f"Accuracy : {acc:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall   : {recall:.4f}")
        print(f"F1 Score : {f1:.4f}")

        return {
            "conversation_metrics": {
                "accuracy": acc,
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "predictions": predictions,
            "true_labels": true_labels
        }

In [10]:
# Train the best models using all the training data.
# clone() returns an unfitted copy with the same parameters, so the templates
# stored in models_SCI / models_VPF are not mutated by fitting here (the
# cross-validation loops treat them the same way).
conversation_model = clone(models_SCI["MLP TF-IDF"]).fit(df_conversations["text"], df_conversations["label"])
author_model = clone(models_VPF["MLP TF-IDF"]).fit(df_interventions["intervention"], df_interventions["label"])

In [11]:
# Wire the two fitted models together and score them on the test corpus.
pipeline = PredatorDetectionPipeline(
    conversation_model=conversation_model,
    author_model=author_model,
)
results = pipeline.run_pipeline(root=root_test, predator_ids=predator_ids_test)

Combined model: SCI(NN-TF_IDF)& VFP(NN-TF_IDF-DimensionalityReduction)
Accuracy : 0.9930
Precision: 0.9972
Recall   : 0.9180
F1 Score : 0.9560


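For reference, the original paper reported a precision of 0.9804, a recall of 0.7874 and an F1 score of 0.8734. Note that the figures above are computed at the conversation level (suspicious vs. not suspicious) and only over the test conversations that pass the filtering stage, so they may not be directly comparable with the paper's results.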