# **Sexual Predator Identification Ensemble**

### 0.1. Import libraries.

In [1]:
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
import html
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

2025-08-10 19:12:00.986159: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754853121.011664 3094594 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754853121.018771 3094594 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754853121.043728 3094594 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754853121.043796 3094594 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754853121.043798 3094594 computation_placer.cc:177] computation placer alr

In [2]:
import os
# Expose only GPUs 0, 1 and 2 to this process (GPU 3 is excluded).
# NOTE(review): this only takes effect if it runs before any library creates
# a CUDA context - confirm nothing imported earlier has already done so.
os.environ.update(CUDA_VISIBLE_DEVICES="0,1,2")

### 0.2. Load data: predator IDs and chats.

In [3]:
# Predator author IDs for the training corpus, one ID per line.
# - An explicit encoding keeps the read independent of the platform locale.
# - IDs are stripped and blank lines dropped so that they compare equal to the
#   stripped author IDs read from the XML, and so that an empty string never
#   ends up in the set.
with open(
    "pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt",
    encoding="utf-8",
) as f:
    predator_ids_train = {line.strip() for line in f if line.strip()}

# Parsed training corpus; the root element holds the <conversation> nodes.
tree_train = ET.parse("pan12-sexual-predator-identification-training-corpus-2012-05-01.xml")
root_train = tree_train.getroot()

### 1. **Filtering stage.**

Conversations with only one participant, fewer than six interventions per user or long sequences of unrecognized characters (likely images) were discarded.

In [4]:
# Matches any run of characters outside the accepted alphabet (ASCII letters,
# digits, Spanish accented letters, whitespace and common punctuation).
junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")

# Conversations that survive every filter, one dict per conversation.
conversations_clean = []

for conversation in root_train.findall("conversation"):
    messages_by_author = defaultdict(list)  # author id -> kept messages, in order
    ordered_texts = []                      # every kept message, in document order

    for message in conversation.findall("message"):
        author_el = message.find("author")
        text_el = message.find("text")

        # Skip messages that lack an author or a text payload.
        has_author = author_el is not None and author_el.text is not None
        has_text = text_el is not None and text_el.text is not None
        if not (has_author and has_text):
            continue

        author_id = author_el.text.strip()
        # Turn HTML entities (&amp;, &lt;, &gt;, ...) back into plain characters.
        text = html.unescape(text_el.text.strip())

        # Drop messages that look like non-text payloads.
        # NOTE(review): search() succeeds on a single non-whitelisted character,
        # so any message longer than 20 characters containing e.g. '#' or '_'
        # is dropped - confirm this matches the intended "long sequence" rule.
        looks_like_junk = len(text) > 20 and junk_pattern.search(text) is not None
        if looks_like_junk:
            continue

        messages_by_author[author_id].append(text)
        ordered_texts.append(text)

    # Discard conversations with fewer than two participants.
    if len(messages_by_author) < 2:
        continue
    # Discard conversations where some participant has fewer than six messages.
    if min(len(msgs) for msgs in messages_by_author.values()) < 6:
        continue

    conversations_clean.append({
        "conversation_id": conversation.get("id"),
        "authors": list(messages_by_author.keys()),
        "text": " ".join(ordered_texts),
        "messages_by_author": dict(messages_by_author),
    })

### 2. **Labelling data.**

Label all chat conversations as suspicious if they involve at least one predator (SCI task). 

For these suspicious conversations, separate and label the interventions as predator or victim messages (VFP task).


In [5]:
conversations = []  # one {"text", "label"} row per conversation (SCI task)
interventions = []  # one {"intervention", "label"} row per author (VFP task)

for convo in conversations_clean:
    # A conversation is suspicious (label 1) when any participant is a known predator.
    has_predator = not predator_ids_train.isdisjoint(convo["authors"])
    label = int(has_predator)

    conversations.append({"text": convo["text"], "label": label})

    # Only suspicious conversations contribute per-author interventions.
    if not has_predator:
        continue

    # Each author's messages are merged into a single intervention,
    # labelled 1 for a predator and 0 otherwise.
    interventions.extend(
        {
            "intervention": " ".join(msgs),
            "label": int(author in predator_ids_train),
        }
        for author, msgs in convo["messages_by_author"].items()
    )

df_conversations = pd.DataFrame(conversations)
df_interventions = pd.DataFrame(interventions)

### 3. **Suspicious Conversations Identification (SCI) stage.**

In [7]:
# === DATA ===
texts = df_conversations["text"]
labels = df_conversations["label"]

# Stratified 80/20 split with a fixed seed; each part gets a fresh 0..n-1 index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        texts, labels, test_size=0.2, stratify=labels, random_state=42
    )
)

# === MODEL 1: TF-IDF + SVM ===
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)  # vocabulary is learned on train only
X_test_tfidf = vectorizer.transform(X_test)

# probability=True is required for predict_proba on an SVC.
svm = SVC(kernel="linear", probability=True).fit(X_train_tfidf, y_train)
proba_svm = svm.predict_proba(X_test_tfidf)[:, 1]  # second probability column

# === MODEL 2: BGE BASE + MLP ===
bge = SentenceTransformer("./bge-base-en", trust_remote_code=True)
encode_kwargs = {"normalize_embeddings": True, "convert_to_numpy": True}
X_train_bge = bge.encode(X_train.tolist(), **encode_kwargs)
X_test_bge = bge.encode(X_test.tolist(), **encode_kwargs)

mlp = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=200).fit(X_train_bge, y_train)
proba_mlp = mlp.predict_proba(X_test_bge)[:, 1]  # second probability column

# === MODEL 3: FINE-TUNED BERT ===
class TextDataset(Dataset):
    """Dataset that tokenizes all texts up front and serves them as tensors.

    Args:
        texts: Texts handed to ``tokenizer`` in a single call.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=128)``;
            its result must expose ``.items()`` yielding indexable
            per-sample columns.
    """

    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels

    def __len__(self):
        """Return the number of samples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, plus a ``labels`` entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Load the model and tokenizer from local checkpoints.
bert_model = BertForSequenceClassification.from_pretrained("./bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("./bert-base-uncased")

train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = TextDataset(X_test.tolist(), y_test.tolist(), tokenizer)


training_args = TrainingArguments(
    output_dir="./bert_out",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer)
)

trainer.train()

# Inference
# Use Trainer.predict rather than a hand-written forward pass: it runs in
# batches of per_device_eval_batch_size on whatever device the model was
# trained on. Tokenizing the pandas Series directly and feeding CPU tensors
# to the model in one huge batch is not reliable after training.
prediction_output = trainer.predict(test_dataset)
test_logits = torch.as_tensor(prediction_output.predictions, dtype=torch.float32)
# Softmax over the class axis; column 1 is the score for label 1.
proba_bert = torch.nn.functional.softmax(test_logits, dim=1)[:, 1].numpy()

# === FINAL ENSEMBLE ===
# Weighted soft vote over the three models; adjust the weights if desired.
final_proba = 0.3 * proba_svm + 0.3 * proba_mlp + 0.4 * proba_bert
y_pred = (final_proba > 0.5).astype(int)

# === RESULTS ===
print(classification_report(y_test, y_pred))


RuntimeError: CUDA error: uncorrectable ECC error encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### 4. **Victim From Predator disclosure (VFP) stage.**

### 5. **Evaluate test data**


In [None]:
# Ground-truth predator author IDs for the test corpus, one ID per line.
# - An explicit encoding keeps the read independent of the platform locale.
# - IDs are stripped and blank lines dropped so that they compare equal to the
#   stripped author IDs read from the XML, and so that an empty string never
#   ends up in the set.
with open(
    "pan12-sexual-predator-identification-groundtruth-problem1.txt",
    encoding="utf-8",
) as f:
    predator_ids_test = {line.strip() for line in f if line.strip()}

# Parsed test corpus; the root element holds the <conversation> nodes.
tree_test = ET.parse("pan12-sexual-predator-identification-test-corpus-2012-05-17.xml")
root_test = tree_test.getroot()

In [None]:
class PredatorDetectionPipeline:
    """Filter conversations, flag suspicious ones and pick a likely predator.

    Args:
        conversation_model: Object whose ``predict([text])`` returns one
            truthy/falsy flag per input text.
        author_model: Object whose ``predict_proba([text])`` returns one row
            per input text; index 1 of the row is used as the predator score.
    """

    def __init__(self, conversation_model, author_model):
        self.conversation_model = conversation_model
        self.author_model = author_model
        # Matches any run of characters outside the accepted alphabet.
        self.junk_pattern = re.compile(r"[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑüÜ\s.,!?*\'\"@():;<>\/\-]+")

    def _filter(self, root):
        """Return the conversations under ``root`` that pass every filter.

        A message is dropped when it lacks author or text, or when it is longer
        than 20 characters and contains a character matched by
        ``junk_pattern``. A conversation is dropped when it has fewer than two
        authors or when any author has fewer than six kept messages.

        Returns:
            A list of dicts with keys ``conversation_id``, ``authors``,
            ``text``, ``messages_by_author`` and ``xml``.
        """
        kept = []
        for conversation in root.findall("conversation"):
            by_author = defaultdict(list)
            ordered_texts = []

            for message in conversation.findall("message"):
                author_el = message.find("author")
                text_el = message.find("text")

                if author_el is None or text_el is None:
                    continue
                if author_el.text is None or text_el.text is None:
                    continue

                author_id = author_el.text.strip()
                text = html.unescape(text_el.text.strip())

                is_junk = len(text) > 20 and self.junk_pattern.search(text) is not None
                if is_junk:
                    continue

                by_author[author_id].append(text)
                ordered_texts.append(text)

            if len(by_author) < 2:
                continue
            if min(len(msgs) for msgs in by_author.values()) < 6:
                continue

            kept.append({
                "conversation_id": conversation.get("id"),
                "authors": list(by_author.keys()),
                "text": " ".join(ordered_texts),
                "messages_by_author": dict(by_author),
                "xml": conversation,
            })

        return kept

    def _true_labels(self, conversations, predator_ids):
        """Build the ground-truth record for every conversation.

        A conversation is suspicious when at least one of its authors is in
        ``predator_ids``; the first such author (in ``authors`` order) is
        reported as the predator, otherwise ``None``.
        """
        records = []
        for convo in conversations:
            matches = (author for author in convo["authors"] if author in predator_ids)
            found = [next(matches)] if any(a in predator_ids for a in convo["authors"]) else []
            records.append({
                "conversation_id": convo["conversation_id"],
                "suspicious": bool(found),
                "predator": found[0] if found else None,
            })
        return records

    def _predict(self, conversation_data):
        """Predict whether a conversation is suspicious and who the predator is.

        The conversation model is queried first. If it does not flag the
        conversation, no author is scored. Otherwise each author's joined
        messages are scored by the author model, and the highest-scoring
        author is returned (the first one wins on ties).
        """
        flagged = self.conversation_model.predict([conversation_data["text"]])[0]
        conversation_id = conversation_data["conversation_id"]
        if not flagged:
            return {
                "conversation_id": conversation_id,
                "suspicious": False,
                "predator": None,
            }

        scores = {
            author_id: self.author_model.predict_proba([" ".join(msgs)])[0][1]
            for author_id, msgs in conversation_data["messages_by_author"].items()
        }
        predator_id = max(scores, key=scores.get)

        return {
            "conversation_id": conversation_id,
            "suspicious": True,
            "predator": predator_id,
        }

    def run_pipeline(self, root, predator_ids):
        """Run filtering, labelling and prediction, then print and return metrics.

        Only the conversation-level ``suspicious`` flag is scored; the
        predicted predator IDs are returned but not evaluated here.

        Returns:
            A dict with ``conversation_metrics`` (accuracy, precision, recall,
            f1), ``predictions`` and ``true_labels``.
        """
        # 1. Filter conversations
        conversations = self._filter(root)

        # 2. Generate true labels
        true_labels = self._true_labels(conversations, predator_ids)

        # 3. Predict with the models
        predictions = [self._predict(convo) for convo in conversations]

        # 4. Compare true labels vs. predictions
        y_true = [entry["suspicious"] for entry in true_labels]
        y_pred = [entry["suspicious"] for entry in predictions]

        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print("Model evaluation:")
        print(f"Accuracy : {acc:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall   : {recall:.4f}")
        print(f"F1 Score : {f1:.4f}")

        conversation_metrics = {
            "accuracy": acc,
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }
        return {
            "conversation_metrics": conversation_metrics,
            "predictions": predictions,
            "true_labels": true_labels,
        }