# **Sexual Predator Identification Baseline**

### 0.1. Import libraries.

In [1]:
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
import html
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

### 0.2. Load data: predator IDs and chats.

In [2]:
# Predator author IDs for the training corpus, one ID per line.
# The encoding is stated explicitly so the read does not depend on the machine's
# locale, and every ID is stripped (blank lines skipped) so that it compares equal
# to the stripped <author> values extracted from the XML corpus.
with open(
    "pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt",
    encoding="utf-8",
) as f:
    predator_ids_train = {pid.strip() for pid in f.read().splitlines() if pid.strip()}

# Parsed training corpus; conversations are looked up as children of the root element.
tree_train = ET.parse("pan12-sexual-predator-identification-training-corpus-2012-05-01.xml")
root_train = tree_train.getroot()

### 1. **Filtering stage.**

Messages longer than 20 characters that contain unrecognized characters (likely images) are dropped first. Conversations that are then left with only one participant, or in which any participant has fewer than six interventions, are discarded.

In [3]:
# Any character outside this whitelist is treated as "unrecognized".
junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")


def filter_conversations(root, junk_re=junk_pattern, min_messages_per_author=6, junk_min_length=20):
    """Return the conversations under ``root`` that pass the filtering stage.

    Message-level filter: a message is skipped when it has no author or no
    text, or when it is longer than ``junk_min_length`` characters and
    contains at least one character matched by ``junk_re``.

    Conversation-level filters (applied to the surviving messages): a
    conversation is dropped when it has a single participant, or when any
    participant has fewer than ``min_messages_per_author`` messages.

    Parameters
    ----------
    root : element exposing ``findall("conversation")``.
    junk_re : compiled pattern matching unrecognized characters.
    min_messages_per_author : minimum number of kept messages per participant.
    junk_min_length : only messages longer than this are checked with ``junk_re``.

    Returns
    -------
    list of dict with keys ``conversation_id``, ``authors``, ``text`` (kept
    messages joined with single spaces, in document order) and
    ``messages_by_author``.
    """
    kept = []
    for conversation in root.findall("conversation"):
        authors = defaultdict(list)
        all_texts = []

        # Keep only the messages that pass the message-level filter.
        for message in conversation.findall("message"):
            author_el = message.find("author")
            text_el = message.find("text")

            if author_el is None or author_el.text is None:
                continue
            if text_el is None or text_el.text is None:
                continue

            author_id = author_el.text.strip()
            # Substitute HTML entities (&amp;, &lt;, &gt;, ...) with their characters.
            text = html.unescape(text_el.text.strip())

            # NOTE(review): search() fires on a single unrecognized character, so any
            # message above the length threshold containing e.g. '#' or '_' is dropped,
            # not only long runs of junk. Kept as is so results stay comparable.
            if len(text) > junk_min_length and junk_re.search(text):
                continue

            authors[author_id].append(text)
            all_texts.append(text)

        # Conversations with only one participant.
        if len(authors) <= 1:
            continue
        # Conversations where some participant has too few interventions.
        if any(len(msgs) < min_messages_per_author for msgs in authors.values()):
            continue

        kept.append({
            "conversation_id": conversation.get("id"),
            "authors": list(authors.keys()),
            "text": " ".join(all_texts),
            "messages_by_author": dict(authors)
        })

    return kept


conversations_clean = filter_conversations(root_train)

### 2. **Labelling data.**

Label all chat conversations as suspicious if they involve at least one predator (SCI task). 

For these suspicious conversations, separate and label the interventions as predator or victim messages (VFP task).


In [4]:
# SCI records: one {"text", "label"} entry per filtered conversation.
conversations = []
# VFP records: one {"intervention", "label"} entry per author of a suspicious conversation.
interventions = []

for record in conversations_clean:
    # A conversation is suspicious (label 1) as soon as one participant is a known predator.
    has_predator = any(participant in predator_ids_train for participant in record["authors"])
    conversations.append({"text": record["text"], "label": int(has_predator)})

    # Only suspicious conversations contribute author-level interventions.
    if not has_predator:
        continue

    interventions.extend(
        {
            # All messages of one author, joined into a single document.
            "intervention": " ".join(author_msgs),
            "label": int(author_id in predator_ids_train),
        }
        for author_id, author_msgs in record["messages_by_author"].items()
    )

df_conversations = pd.DataFrame(conversations)
df_interventions = pd.DataFrame(interventions)

### 3. **Suspicious Conversations Identification (SCI) stage using Logistic Regression**

In [5]:
X = df_conversations["text"]
y = df_conversations["label"]

# Pipeline model: TF-IDF features feeding an L2-regularised logistic regression.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(penalty='l2', solver='lbfgs'))
])

# Define the hyperparameter grid for optimization
param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__min_df': [1, 5],
    'tfidf__max_features': [None, 5000, 10000],
    'logreg__C': [0.01, 0.1, 1, 10],
    'logreg__max_iter': [100, 200, 500]
}

# Perform grid search with cross-validation.
# NOTE(review): model selection uses accuracy; if the suspicious class is rare,
# a score such as F1 may be a better criterion. Confirm the class balance.
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, refit=True)
grid.fit(X, y)

# Print the best hyperparameters and accuracy score
print(f"\nBest parameters: {grid.best_params_}")
print(f"Best score (accuracy): {grid.best_score_:.4f}")

# With refit=True, GridSearchCV has already refitted the best configuration on the
# full (X, y), so best_estimator_ is ready to use and no extra .fit() call is needed.
best_model_SCI = grid.best_estimator_

# Show the fitted estimator as the cell output.
best_model_SCI


Best parameters: {'logreg__C': 10, 'logreg__max_iter': 100, 'tfidf__max_df': 0.9, 'tfidf__max_features': 5000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1)}
Best score (accuracy): 0.9904


0,1,2
,steps,"[('tfidf', ...), ('logreg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


### 4. **Victim From Predator disclosure (VFP) stage using Logistic Regression**

In [6]:
X = df_interventions["intervention"]
y = df_interventions["label"]


# Pipeline model: TF-IDF features feeding an L2-regularised logistic regression.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(penalty='l2', solver='lbfgs'))
])

# Define the hyperparameter grid for optimization
param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__min_df': [1, 5],
    'tfidf__max_features': [None, 5000, 10000],
    'logreg__C': [0.01, 0.1, 1, 10],
    'logreg__max_iter': [100, 200, 500]
}

# Perform grid search with cross-validation
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, refit=True)
grid.fit(X, y)

# Print the best hyperparameters and accuracy score
print(f"\nBest parameters: {grid.best_params_}")
print(f"Best score (accuracy): {grid.best_score_:.4f}")

# With refit=True, GridSearchCV has already refitted the best configuration on the
# full (X, y), so best_estimator_ is ready to use and no extra .fit() call is needed.
best_model_VFP = grid.best_estimator_

# Show the fitted estimator as the cell output.
best_model_VFP


Best parameters: {'logreg__C': 10, 'logreg__max_iter': 100, 'tfidf__max_df': 0.9, 'tfidf__max_features': None, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2)}
Best score (accuracy): 0.9504


0,1,2
,steps,"[('tfidf', ...), ('logreg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


### 5. **Evaluate test data using best estimators.**

In [7]:
# Ground-truth predator author IDs for the test corpus, one ID per line.
# The encoding is stated explicitly so the read does not depend on the machine's
# locale, and every ID is stripped (blank lines skipped) so that it compares equal
# to the stripped <author> values extracted from the XML corpus.
with open(
    "pan12-sexual-predator-identification-groundtruth-problem1.txt",
    encoding="utf-8",
) as f:
    predator_ids_test = {pid.strip() for pid in f.read().splitlines() if pid.strip()}

# Parsed test corpus; conversations are looked up as children of the root element.
tree_test = ET.parse("pan12-sexual-predator-identification-test-corpus-2012-05-17.xml")
root_test = tree_test.getroot()

In [8]:
class PredatorDetectionPipeline:
    """Two-stage evaluator built from a conversation model and an author model.

    ``conversation_model`` is queried through ``predict`` with the full text of
    a conversation; ``author_model`` is queried through ``predict_proba`` with
    the joined messages of a single author.
    """

    def __init__(self, conversation_model, author_model):
        self.conversation_model = conversation_model
        self.author_model = author_model
        # Matches any run of characters outside the accepted whitelist.
        self.junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")

    def _filter(self, root):
        """Collect the conversations under ``root`` that pass every filter.

        Messages without author or text are skipped, as are messages longer
        than 20 characters that contain a match of ``self.junk_pattern``.
        A conversation is kept only if it has at least two authors and each
        author has at least six kept messages.
        """
        kept = []
        for conv_el in root.findall("conversation"):
            by_author = defaultdict(list)
            ordered_texts = []

            for msg_el in conv_el.findall("message"):
                author_node = msg_el.find("author")
                text_node = msg_el.find("text")

                # Both the author and the text must be present and non-empty nodes.
                if any(node is None or node.text is None for node in (author_node, text_node)):
                    continue

                author_id = author_node.text.strip()
                body = text_node.text.strip()
                body = html.unescape(body)

                # Drop long messages that contain unrecognized characters.
                is_long = len(body) > 20
                if is_long and self.junk_pattern.search(body) is not None:
                    continue

                by_author[author_id].append(body)
                ordered_texts.append(body)

            # At least two participants are required.
            if len(by_author) < 2:
                continue
            # Every participant needs six or more kept messages.
            if not all(len(author_msgs) >= 6 for author_msgs in by_author.values()):
                continue

            entry = {
                "conversation_id": conv_el.get("id"),
                "authors": list(by_author.keys()),
                "text": " ".join(ordered_texts),
                "messages_by_author": dict(by_author),
                "xml": conv_el
            }
            kept.append(entry)

        return kept

    def _true_labels(self, conversations, predator_ids):
        """Build the reference label of each conversation.

        A conversation is suspicious when one of its authors is in
        ``predator_ids``; the first such author (in ``authors`` order) is
        reported as the predator, otherwise the predator is ``None``.
        """
        labels = []
        for convo in conversations:
            first_predator = next(
                (author for author in convo["authors"] if author in predator_ids),
                None,
            )
            labels.append({
                "conversation_id": convo["conversation_id"],
                "suspicious": first_predator is not None,
                "predator": first_predator
            })
        return labels

    def _predict(self, conversation_data):
        """Classify one conversation and, if flagged, name the likeliest predator.

        The predator is the author whose joined messages get the highest
        positive-class score from ``author_model``; on ties the author that
        appears first in ``messages_by_author`` wins.
        """
        flagged = self.conversation_model.predict([conversation_data["text"]])[0]
        outcome = {
            "conversation_id": conversation_data["conversation_id"],
            "suspicious": False,
            "predator": None
        }
        if flagged:
            # Positive-class score of every participant.
            scores = {}
            for author_id, author_msgs in conversation_data["messages_by_author"].items():
                document = " ".join(author_msgs)
                scores[author_id] = self.author_model.predict_proba([document])[0][1]

            outcome["suspicious"] = True
            outcome["predator"] = max(scores, key=scores.get)

        return outcome

    def run_pipeline(self, root, predator_ids):
        """Filter, label and predict every conversation, then report metrics.

        Prints accuracy, precision, recall and F1 of the conversation-level
        ``suspicious`` flag and returns them together with the per-conversation
        predictions and reference labels.
        """
        # 1. Filter conversations
        conversations = self._filter(root)

        # 2. Reference labels
        true_labels = self._true_labels(conversations, predator_ids)

        # 3. Model predictions, one per filtered conversation
        predictions = [self._predict(convo) for convo in conversations]

        # 4. Conversation-level comparison
        y_true = [reference["suspicious"] for reference in true_labels]
        y_pred = [predicted["suspicious"] for predicted in predictions]

        metrics = {
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred, zero_division=0),
            "recall": recall_score(y_true, y_pred, zero_division=0),
            "f1": f1_score(y_true, y_pred, zero_division=0)
        }

        print("Model evaluation:")
        print(f"Accuracy : {metrics['accuracy']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall   : {metrics['recall']:.4f}")
        print(f"F1 Score : {metrics['f1']:.4f}")

        return {
            "conversation_metrics": metrics,
            "predictions": predictions,
            "true_labels": true_labels
        }

In [9]:
# Chain the tuned SCI and VFP models and evaluate them on the held-out test corpus.
pipeline = PredatorDetectionPipeline(
    conversation_model=best_model_SCI,
    author_model=best_model_VFP,
)
results = pipeline.run_pipeline(root=root_test, predator_ids=predator_ids_test)

Model evaluation:
Accuracy : 0.9912
Precision: 0.9986
Recall   : 0.8951
F1 Score : 0.9440


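The original paper obtained a precision of 0.9804, a recall of 0.7874 and an F1 score of 0.8734. Note that the scores reported above are conversation-level metrics computed only on the test conversations that pass the filtering stage.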