In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

import tensorflow as tf
from tensorflow.keras import layers, Model

from transformers import AutoTokenizer

from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, roc_auc_score
)

In [None]:
def _texts_and_labels(frame):
    """Split one data frame into the text list and label array used below.

    Returns a ``(texts, labels)`` pair: the ``clean_body`` column as a list of
    strings (missing values replaced by ``""``) and the ``rule_violation``
    column cast to an integer array.
    """
    texts = frame["clean_body"].fillna("").astype(str).tolist()
    labels = frame["rule_violation"].astype(int).values
    return texts, labels


# NOTE(review): `df_train` / `df_val` are not created in any visible cell;
# they have to be loaded earlier in the notebook for this cell to run on a
# fresh kernel.
X_train_text, y_train = _texts_and_labels(df_train)
X_val_text, y_val = _texts_and_labels(df_val)

NameError: name 'df_train' is not defined

In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
MODEL_NAME = "bert-base-uncased"  # tokenizer only

# Tokenizer instance shared by the cells that follow.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(texts, max_len=128):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to exactly ``max_len`` tokens, and
    the encoding is requested as NumPy arrays (``return_tensors="np"``).

    Args:
        texts: the texts to encode, passed to the tokenizer unchanged.
        max_len: fixed sequence length used for truncation and padding.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_len,
        "return_tensors": "np",
    }
    return tokenizer(texts, **encode_options)

# Encode both splits with the default sequence length of `tokenize`.
X_train_tok = tokenize(texts=X_train_text)
X_val_tok = tokenize(texts=X_val_text)

In [None]:
def attention_layer(inputs, mask=None):
    """Pool a sequence into a single vector with learned attention weights.

    Args:
        inputs: symbolic tensor of shape (batch, seq_len, features).
        mask: optional (batch, seq_len) tensor that is non-zero for real
            tokens and zero for padding. Masked positions receive
            (numerically) zero attention weight. When omitted, every
            position takes part in the softmax.

    Returns:
        Tensor of shape (batch, features): the attention-weighted sum of
        ``inputs`` over the time axis.
    """
    # One tanh-squashed relevance score per time step: (batch, seq_len, 1).
    score = layers.Dense(1, activation="tanh")(inputs)
    # Drop the trailing unit axis so the scores and the mask share the
    # shape (batch, seq_len).
    score = layers.Flatten()(score)

    # Keras layers are used instead of raw tf.* ops so the function works on
    # symbolic Keras tensors and can honour the padding mask.
    softmax = layers.Softmax(axis=-1)
    if mask is None:
        attention_weights = softmax(score)
    else:
        attention_weights = softmax(score, mask=mask)

    # Contract the time axis: (batch, seq_len) . (batch, seq_len, features)
    # -> (batch, features).
    context_vector = layers.Dot(axes=1)([attention_weights, inputs])
    return context_vector


max_len = 128  # must match the length used when tokenizing the texts
vocab_size = tokenizer.vocab_size

# Model architecture
# The inputs are named after the tokenizer's output keys so the data can be
# fed either positionally or as a dict keyed by name.
input_ids = layers.Input(shape=(max_len,), dtype="int32", name="input_ids")
mask = layers.Input(shape=(max_len,), dtype="int32", name="attention_mask")

# mask_zero=True makes the recurrent layer skip token id 0.
# NOTE(review): assumes the tokenizer's padding id is 0 -- confirm with
# `tokenizer.pad_token_id`.
x = layers.Embedding(
    input_dim=vocab_size,
    output_dim=128,
    mask_zero=True
)(input_ids)

x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)

# The attention mask is now wired into the graph: padding positions are
# excluded from the attention softmax instead of diluting the weights.
x = attention_layer(x, mask=mask)

x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.2)(x)

# Single sigmoid unit: probability of the positive class.
output = layers.Dense(1, activation="sigmoid")(x)

model = Model(inputs=[input_ids, mask], outputs=output)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [None]:

# Stop when validation loss has not improved for 3 consecutive epochs and
# roll the model back to the best weights seen so far.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=3,
        restore_best_weights=True
    )
]

# Feed the two arrays explicitly, in the order the model declares its inputs
# (token ids, attention mask). The raw tokenizer output is a mapping that may
# carry extra keys (e.g. token_type_ids), so passing it whole leaves the
# input matching ambiguous.
train_inputs = [X_train_tok["input_ids"], X_train_tok["attention_mask"]]
val_inputs = [X_val_tok["input_ids"], X_val_tok["attention_mask"]]

history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    epochs=12,
    batch_size=32,
    callbacks=callbacks
)

              precision    recall  f1-score   support

           0       0.81      0.67      0.73       200
           1       0.72      0.84      0.78       206

    accuracy                           0.76       406
   macro avg       0.77      0.76      0.76       406
weighted avg       0.77      0.76      0.76       406

F1: 0.7802690582959642


In [None]:
# Feed the validation arrays in the model's input order (token ids, mask)
# rather than the whole tokenizer output.
eval_inputs = [X_val_tok["input_ids"], X_val_tok["attention_mask"]]

# Predicted probability of the positive class, flattened to 1-D.
y_pred_prob = model.predict(eval_inputs).ravel()
# Hard labels at the 0.5 decision threshold.
y_pred = (y_pred_prob >= 0.5).astype(int)

acc = accuracy_score(y_val, y_pred)
f1  = f1_score(y_val, y_pred)
prec = precision_score(y_val, y_pred)
rec = recall_score(y_val, y_pred)
cm = confusion_matrix(y_val, y_pred).tolist()  # plain lists are JSON-serialisable

# AUC is best-effort: only the metric's own ValueError (raised for input it
# cannot score) is tolerated. A bare `except:` would also hide unrelated bugs
# and swallow KeyboardInterrupt.
try:
    auc = roc_auc_score(y_val, y_pred_prob)
except ValueError as err:
    print("AUC unavailable:", err)
    auc = None

print("\n=== METRICS ===")
print("Acc:", acc)
print("F1:", f1)
print("Precision:", prec)
print("Recall:", rec)
print("AUC:", auc)
print("Confusion matrix:", cm)

Saved sklearn pipeline to ..\models\linear_svc_pipeline.joblib
Metrics -- acc: 0.7586, f1: 0.7803, prec: 0.7250, rec: 0.8447, auc: 0.8179247572815534
Export failed: No module named 'utils'
Wrote metadata


In [None]:

# Output directory for every artefact of this model (created if missing).
models_dir = Path("../models_bilstm")
models_dir.mkdir(exist_ok=True, parents=True)

# Save model
# NOTE(review): the target has no file extension; some Keras versions require
# ".keras" or ".h5" here -- confirm against the installed version before
# changing it, since "path" in the metadata below must stay in sync.
model.save(models_dir / "bilstm_attention_model")

# Save tokenizer
tokenizer.save_pretrained(models_dir / "tokenizer")

# Keep a valid AUC of 0.0 (the previous truthiness check turned it into
# None). Map a missing or NaN value to None so the file stays strict JSON,
# because json.dump would otherwise emit the non-standard token NaN.
auc_value = None if auc is None or np.isnan(auc) else float(auc)

# Save metrics metadata
meta = {
    "name": "BiLSTM_Attention",
    "type": "tensorflow",
    "path": "bilstm_attention_model",
    "description": "Neural network with BiLSTM + Attention + EarlyStopping",
    "metrics": {
        "accuracy": float(acc),
        "f1": float(f1),
        "precision": float(prec),
        "recall": float(rec),
        "auc": auc_value,
        "confusion_matrix": cm,
        "n_val": len(y_val)
    }
}

with open(models_dir / "metadata.json", "w", encoding="utf8") as f:
    json.dump(meta, f, indent=2)

print("Model + tokenizer + metadata saved.")