# **Sexual Predator Identification CNN**

### 0.1. Import libraries.

In [1]:
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
import html
import pandas as pd
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

2025-08-07 11:29:49.107953: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 0.2. Load data: predator IDs and chats.

In [2]:
# Predator ground truth: one author ID per line.
# Author IDs read from the XML are whitespace-stripped during filtering, so the
# IDs are stripped here too (and blank lines skipped) to keep membership tests
# comparing like with like. The encoding is explicit so the read does not
# depend on the platform's locale default.
with open(
    "pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt",
    encoding="utf-8",
) as f:
    predator_ids_train = {line.strip() for line in f.read().splitlines() if line.strip()}

# Parse the training corpus; `root_train` is the root element of the XML document.
tree_train = ET.parse("pan12-sexual-predator-identification-training-corpus-2012-05-01.xml")
root_train = tree_train.getroot()

### 1. **Filtering stage.**

Messages longer than 20 characters that contain unrecognized characters (likely images) are dropped. Conversations that are then left with only one participant, or in which any participant has fewer than six interventions, are discarded.

In [3]:
# Filtering thresholds.
JUNK_MIN_LENGTH = 20          # only messages longer than this are checked against junk_pattern
MIN_MESSAGES_PER_AUTHOR = 6   # every participant must keep at least this many messages

# Matches any run of characters outside the whitelist of letters, digits,
# whitespace and common punctuation.
junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")


def filter_conversations(root, pattern=junk_pattern,
                         junk_min_length=JUNK_MIN_LENGTH,
                         min_messages_per_author=MIN_MESSAGES_PER_AUTHOR):
    """Return the conversations under `root` that pass the filtering stage.

    Message-level filter: a message is skipped when its author or text element
    is missing/empty, or when it is longer than `junk_min_length` characters
    and `pattern` matches anywhere in it.
    NOTE(review): a single character outside the whitelist is enough to drop a
    long message; confirm this is the intended reading of "long sequences of
    unrecognized characters".

    Conversation-level filter: a conversation is discarded when, after the
    message filter, it has at most one author or any author has fewer than
    `min_messages_per_author` messages.

    Args:
        root: XML element whose direct children include <conversation> elements.
        pattern: compiled regex identifying unrecognized characters.
        junk_min_length: length above which a message is checked for junk.
        min_messages_per_author: minimum kept messages required per author.

    Returns:
        A list of dicts with keys "conversation_id", "authors" (author IDs in
        order of first appearance), "text" (all kept messages joined by a
        space) and "messages_by_author" (author ID -> list of kept messages).
    """
    kept_conversations = []

    for conversation in root.findall("conversation"):
        authors = defaultdict(list)
        all_texts = []

        for message in conversation.findall("message"):
            author_el = message.find("author")
            text_el = message.find("text")

            if author_el is None or author_el.text is None:
                continue
            if text_el is None or text_el.text is None:
                continue

            author_id = author_el.text.strip()
            # Substitute HTML entities (&amp;, &lt;, &gt;, ...) with their characters.
            text = html.unescape(text_el.text.strip())

            if len(text) > junk_min_length and pattern.search(text):
                continue

            authors[author_id].append(text)
            all_texts.append(text)

        # Conversations with only one participant.
        if len(authors) <= 1:
            continue
        # Conversations in which some participant has too few interventions.
        if any(len(msgs) < min_messages_per_author for msgs in authors.values()):
            continue

        kept_conversations.append({
            "conversation_id": conversation.get("id"),
            "authors": list(authors.keys()),
            "text": " ".join(all_texts),
            "messages_by_author": dict(authors)
        })

    return kept_conversations


conversations_clean = filter_conversations(root_train)

### 2. **Labelling data.**

Label all chat conversations as suspicious if they involve at least one predator (SCI task). 

For these suspicious conversations, separate and label the interventions as predator or victim messages (VFP task).


In [4]:
conversations = []  # one {"text", "label"} record per filtered conversation (SCI task)
interventions = []  # one {"intervention", "label"} record per author of a suspicious conversation (VFP task)

for convo in conversations_clean:
    # A conversation is suspicious as soon as one of its authors is a known predator.
    is_suspicious = any(author in predator_ids_train for author in convo["authors"])
    label = int(is_suspicious)
    conversations.append({"text": convo["text"], "label": label})

    # Only suspicious conversations contribute author-level examples.
    if not is_suspicious:
        continue

    # Each author's messages are merged into a single intervention, labelled
    # 1 for a predator and 0 for the other participant(s).
    interventions.extend(
        {
            "intervention": " ".join(msgs),
            "label": int(author in predator_ids_train),
        }
        for author, msgs in convo["messages_by_author"].items()
    )

df_conversations = pd.DataFrame(conversations)
df_interventions = pd.DataFrame(interventions)

### 3. **Suspicious Conversations Identification (SCI) stage using CNN**

In [5]:
# Features are the full conversation texts; the target is the suspicious flag.
X = df_conversations["text"]
y = df_conversations["label"]

# Preprocessing parameters
SCI_vocab_size = 20000  # vocabulary cap handed to the tokenizer (num_words)
SCI_max_length = 500    # every conversation becomes a sequence of this length

# Tokenization: learn the vocabulary from the conversations, then map each
# conversation to a list of integer word ids.
SCI_tokenizer = Tokenizer(num_words=SCI_vocab_size, oov_token="<OOV>")
SCI_tokenizer.fit_on_texts(X)
X_seq = SCI_tokenizer.texts_to_sequences(X)

# Padding: shorter sequences are padded and longer ones truncated, both at the end.
X_pad = pad_sequences(
    X_seq,
    maxlen=SCI_max_length,
    padding='post',
    truncating='post',
)

# CNN model creation function
def create_cnn_model(vocab_size=SCI_vocab_size, max_length=SCI_max_length, embedding_dim=128, dropout_rate=0.5):
    """Build and compile the 1-D CNN used for binary text classification.

    Args:
        vocab_size: input dimension of the embedding layer.
        max_length: length of each padded input sequence.
        embedding_dim: output dimension of the embedding layer.
        dropout_rate: rate of the dropout layer placed before the output unit.

    Returns:
        A compiled Sequential model with a single sigmoid output, trained with
        binary cross-entropy and Adam, reporting accuracy.
    """
    model = Sequential()
    model.add(Input(shape=(max_length,)))
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model builder so that scikit-learn's GridSearchCV can fit and score it.
# validation_split reserves part of each training set so that EarlyStopping has
# a val_loss to monitor.
clf = KerasClassifier(
    model=create_cnn_model,
    vocab_size=SCI_vocab_size,
    max_length=SCI_max_length,
    verbose=0,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    validation_split=0.1
)

# Hyperparameter grid: `model__*` entries are arguments of create_cnn_model,
# `batch_size` and `epochs` are training settings (2 x 2 x 2 x 2 = 16 candidates).
param_grid = {
    'model__embedding_dim': [64, 128],
    'model__dropout_rate': [0.3, 0.5],
    'batch_size': [32, 64],
    'epochs': [5, 10]
}

# Grid search with 2-fold cross-validation. refit=True (the GridSearchCV
# default, spelled out here) retrains the winning configuration on the whole
# of (X_pad, y) once the search is finished.
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=2, n_jobs=1, refit=True, verbose=1)
grid.fit(X_pad, y)

# Print the best hyperparameters and accuracy score
print(f"\nBest parameters: {grid.best_params_}")
print(f"Best score (accuracy): {grid.best_score_:.4f}")

# The refit estimator is already trained on the full training data, so it is
# used directly; calling .fit() on it again would only repeat that training run.
best_model_SCI = grid.best_estimator_

Fitting 2 folds for each of 16 candidates, totalling 32 fits

Best parameters: {'batch_size': 32, 'epochs': 10, 'model__dropout_rate': 0.3, 'model__embedding_dim': 128}
Best score (accuracy): 0.9865


### 4. **Victim From Predator disclosure (VFP) stage using CNN**

In [6]:
# Features are the per-author interventions; the target is the predator flag.
X = df_interventions["intervention"]
y = df_interventions["label"]

# Preprocessing parameters
VFP_vocab_size = 20000  # vocabulary cap handed to the tokenizer (num_words)
VFP_max_length = 300    # every intervention becomes a sequence of this length

# Tokenization: learn the vocabulary from the interventions, then map each
# intervention to a list of integer word ids.
VFP_tokenizer = Tokenizer(num_words=VFP_vocab_size, oov_token="<OOV>")
VFP_tokenizer.fit_on_texts(X)
X_seq = VFP_tokenizer.texts_to_sequences(X)

# Padding: shorter sequences are padded and longer ones truncated, both at the end.
X_pad = pad_sequences(
    X_seq,
    maxlen=VFP_max_length,
    padding='post',
    truncating='post',
)

# CNN model creation function
def create_cnn_model(vocab_size=VFP_vocab_size, max_length=VFP_max_length, embedding_dim=128, dropout_rate=0.5):
    """Build and compile the 1-D CNN used for binary text classification.

    Note: this definition reuses the name of the builder defined for the SCI
    stage; the only difference is that the defaults come from the VFP
    preprocessing parameters.

    Args:
        vocab_size: input dimension of the embedding layer.
        max_length: length of each padded input sequence.
        embedding_dim: output dimension of the embedding layer.
        dropout_rate: rate of the dropout layer placed before the output unit.

    Returns:
        A compiled Sequential model with a single sigmoid output.
    """
    layers = [
        Input(shape=(max_length,)),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ]
    cnn = Sequential(layers)
    cnn.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return cnn

# Wrap the model builder so that scikit-learn's GridSearchCV can fit and score it.
# validation_split reserves part of each training set so that EarlyStopping has
# a val_loss to monitor.
clf = KerasClassifier(
    model=create_cnn_model,
    vocab_size=VFP_vocab_size,
    max_length=VFP_max_length,
    verbose=0,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    validation_split=0.1
)

# Hyperparameter grid: `model__*` entries are arguments of create_cnn_model,
# `batch_size` and `epochs` are training settings (2 x 2 x 2 x 2 = 16 candidates).
param_grid = {
    'model__embedding_dim': [64, 128],
    'model__dropout_rate': [0.3, 0.5],
    'batch_size': [32, 64],
    'epochs': [5, 10]
}

# Grid search with 2-fold cross-validation. refit=True (the GridSearchCV
# default, spelled out here) retrains the winning configuration on the whole
# of (X_pad, y) once the search is finished.
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=2, n_jobs=1, refit=True, verbose=1)

# Train the model with grid search
grid.fit(X_pad, y)

# Print the best hyperparameters and accuracy score
print(f"\nBest parameters: {grid.best_params_}")
print(f"Best score (accuracy): {grid.best_score_:.4f}")

# The refit estimator is already trained on the full training data, so it is
# used directly; calling .fit() on it again would only repeat that training run.
best_model_VFP = grid.best_estimator_

Fitting 2 folds for each of 16 candidates, totalling 32 fits

Best parameters: {'batch_size': 32, 'epochs': 10, 'model__dropout_rate': 0.3, 'model__embedding_dim': 128}
Best score (accuracy): 0.9100


### 5. **Evaluate test data using best estimators.**


In [7]:
# Test-set predator ground truth: one author ID per line.
# IDs are whitespace-stripped and blank lines skipped so they compare cleanly
# with the stripped author IDs extracted from the XML; the encoding is explicit
# so the read does not depend on the platform's locale default.
with open(
    "pan12-sexual-predator-identification-groundtruth-problem1.txt",
    encoding="utf-8",
) as f:
    predator_ids_test = {line.strip() for line in f.read().splitlines() if line.strip()}

# Parse the test corpus; `root_test` is the root element of the XML document.
tree_test = ET.parse("pan12-sexual-predator-identification-test-corpus-2012-05-17.xml")
root_test = tree_test.getroot()

In [20]:
class PredatorDetectionPipeline:
    """Two-stage predator detection over an XML conversation corpus.

    Stage 1 uses `conversation_model` to decide whether a filtered conversation
    is suspicious. Stage 2 uses `author_model` to score every author of a
    suspicious conversation; the highest-scoring author is reported as the
    predator.
    """

    # Filtering thresholds.
    JUNK_MIN_LENGTH = 20          # only messages longer than this are checked for junk
    MIN_MESSAGES_PER_AUTHOR = 6   # every participant must keep at least this many messages

    def __init__(self, conversation_model, conversation_tokenizer, conversation_max_length, author_model, author_tokenizer, author_max_length):
        """Store the fitted models, tokenizers and padding lengths of both stages.

        Args:
            conversation_model: fitted classifier exposing `predict`.
            conversation_tokenizer: tokenizer exposing `texts_to_sequences`,
                used for whole conversations.
            conversation_max_length: padded sequence length for conversations.
            author_model: fitted classifier exposing `predict_proba`.
            author_tokenizer: tokenizer exposing `texts_to_sequences`, used for
                per-author interventions.
            author_max_length: padded sequence length for interventions.
        """
        self.conversation_model = conversation_model
        self.conversation_tokenizer = conversation_tokenizer
        self.conversation_max_length = conversation_max_length
        self.author_model = author_model
        self.author_tokenizer = author_tokenizer
        self.author_max_length = author_max_length
        # Matches any run of characters outside the whitelist of letters,
        # digits, whitespace and common punctuation.
        self.junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")

    @staticmethod
    def _positive_class_score(proba):
        """Return the positive-class probability of a single-sample prediction.

        `proba` is the array returned by `predict_proba` for one sample. The
        wrapper may hand back a (1, 2) matrix or a squeezed (2,) vector (or a
        lone sigmoid value); after flattening, the last entry is the
        positive-class probability in each of those layouts.
        """
        return float(proba.reshape(-1)[-1])

    def _filter(self, root):
        """Return the conversations under `root` that pass the filtering stage.

        A message is skipped when its author or text is missing/empty, or when
        it is longer than JUNK_MIN_LENGTH characters and contains a character
        outside the whitelist. A conversation is discarded when it is left
        with at most one author or when any author has fewer than
        MIN_MESSAGES_PER_AUTHOR messages.

        Returns:
            A list of dicts with keys "conversation_id", "authors", "text",
            "messages_by_author" and "xml" (the original conversation element).
        """
        filtered_conversations = []
        for conversation in root.findall("conversation"):
            authors = defaultdict(list)
            all_texts = []

            for message in conversation.findall("message"):
                author_el = message.find("author")
                text_el = message.find("text")

                if author_el is None or author_el.text is None:
                    continue
                if text_el is None or text_el.text is None:
                    continue

                author_id = author_el.text.strip()
                # Replace HTML entities (&amp;, &lt;, ...) with their characters.
                text = html.unescape(text_el.text.strip())

                if len(text) > self.JUNK_MIN_LENGTH and self.junk_pattern.search(text):
                    continue

                authors[author_id].append(text)
                all_texts.append(text)

            if len(authors) <= 1:
                continue
            if any(len(msgs) < self.MIN_MESSAGES_PER_AUTHOR for msgs in authors.values()):
                continue

            filtered_conversations.append({
                "conversation_id": conversation.get("id"),
                "authors": list(authors.keys()),
                "text": " ".join(all_texts),
                "messages_by_author": dict(authors),
                "xml": conversation
            })

        return filtered_conversations

    def _true_labels(self, conversations, predator_ids):
        """Build the ground-truth record of every filtered conversation.

        Args:
            conversations: output of `_filter`.
            predator_ids: container of known predator author IDs.

        Returns:
            One dict per conversation with keys "conversation_id",
            "suspicious" (True when at least one author is a known predator)
            and "predator" (the first such author, or None).
        """
        true_labels = []
        for convo in conversations:
            predators_in_convo = [author for author in convo["authors"] if author in predator_ids]
            true_labels.append({
                "conversation_id": convo["conversation_id"],
                "suspicious": bool(predators_in_convo),
                "predator": predators_in_convo[0] if predators_in_convo else None
            })

        return true_labels

    def _predict(self, conversation_data):
        """Run both stages on one filtered conversation.

        Args:
            conversation_data: one element of the list returned by `_filter`.

        Returns:
            A dict with keys "conversation_id", "suspicious" and "predator"
            (the highest-scoring author when suspicious, otherwise None).
        """
        # Stage 1: is the conversation suspicious?
        seq = self.conversation_tokenizer.texts_to_sequences([conversation_data["text"]])
        pad = pad_sequences(seq, maxlen=self.conversation_max_length, padding='post', truncating='post')
        is_suspicious = self.conversation_model.predict(pad)[0] > 0.5

        if not is_suspicious:
            return {
                "conversation_id": conversation_data["conversation_id"],
                "suspicious": False,
                "predator": None
            }

        # Stage 2: score every author and keep the most predator-like one.
        author_probs = []
        for author_id, msgs in conversation_data["messages_by_author"].items():
            joined_msgs = " ".join(msgs)
            seq_author = self.author_tokenizer.texts_to_sequences([joined_msgs])
            pad_author = pad_sequences(seq_author, maxlen=self.author_max_length, padding='post', truncating='post')
            # Do not index the predict_proba result directly: its shape for a
            # single sample is not guaranteed (see _positive_class_score).
            proba = self.author_model.predict_proba(pad_author)
            author_probs.append((author_id, self._positive_class_score(proba)))

        predator_id = max(author_probs, key=lambda pair: pair[1])[0]

        return {
            "conversation_id": conversation_data["conversation_id"],
            "suspicious": True,
            "predator": predator_id
        }

    def run_pipeline(self, root, predator_ids):
        """Filter, predict and evaluate all conversations under `root`.

        The printed and returned metrics compare the predicted and true
        "suspicious" flags of the filtered conversations.

        Args:
            root: XML element whose children include <conversation> elements.
            predator_ids: container of known predator author IDs.

        Returns:
            A dict with keys "conversation_metrics" (accuracy, precision,
            recall, f1), "predictions" and "true_labels".
        """
        # 1. Filter conversations
        conversations = self._filter(root)

        # 2. Generate true labels
        true_labels = self._true_labels(conversations, predator_ids)

        # 3. Predict with the models
        predictions = [self._predict(convo) for convo in conversations]

        # 4. Compare true labels vs. predictions
        y_true = [record["suspicious"] for record in true_labels]
        y_pred = [record["suspicious"] for record in predictions]

        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print("Model evaluation:")
        print(f"Accuracy : {acc:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall   : {recall:.4f}")
        print(f"F1 Score : {f1:.4f}")

        return {
            "conversation_metrics": {
                "accuracy": acc,
                "precision": precision,
                "recall": recall,
                "f1": f1
            },
            "predictions": predictions,
            "true_labels": true_labels
        }

In [21]:
# Assemble the two-stage pipeline from the fitted SCI (conversation-level) and
# VFP (author-level) models, each with its own tokenizer and padding length.
pipeline = PredatorDetectionPipeline(
    conversation_model=best_model_SCI,
    conversation_tokenizer=SCI_tokenizer,
    conversation_max_length=SCI_max_length,
    author_model=best_model_VFP,
    author_tokenizer=VFP_tokenizer,
    author_max_length=VFP_max_length,
)

# Evaluate on the test corpus against the test-set predator ground truth.
results = pipeline.run_pipeline(root=root_test, predator_ids=predator_ids_test)

Model evaluation:
Accuracy : 0.9905
Precision: 0.9915
Recall   : 0.8926
F1 Score : 0.9394


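For comparison, the original paper reports a precision of 0.9804, a recall of 0.7874 and an F1 score of 0.8734. Note that the metrics above are computed at conversation level (suspicious vs. non-suspicious) on the test conversations that pass the filtering stage.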