In [1]:
import json
import logging
from pathlib import Path

import numpy
import numpy as np
import onnx
import onnxruntime as rt
import polars as pl
from loguru import logger
from scipy.sparse import csr_array
from skl2onnx import convert_sklearn, to_onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from slop_pre_processing import TfidfVectorizer, VectorizerParams, __version__

# Level for the stdlib root logger. At DEBUG the imported libraries' own debug
# records are shown in the cell outputs as well.
LOG_LEVEL = logging.DEBUG  # Or INFO, WARNING, etc.
logging.basicConfig(level=LOG_LEVEL)

# Record which version of the preprocessing package this run used.
print(__version__)

# Disabled smoke test of the vectorizer on a two-document toy corpus:
# vectorizer = TfidfVectorizer(ngram_range=(3, 5), min_df=1)
# print(vectorizer)
# sparse = vectorizer.fit_transform(
#     ["this is a sample", "this is another example sample"],
# )
# print(vectorizer)

0.1.0


In [2]:
# Lazily scan the raw CSV, keep one row per distinct `text` while preserving the
# original row order, then materialize the result as an eager DataFrame.
raw_csv_path = "../data/raw/train_v2_drcat_02.csv"
df = (
    pl.scan_csv(raw_csv_path)
    .unique(["text"], maintain_order=True)
    .collect()
)
df

text,label,prompt_name,source,RDizzl3_seven
str,i64,str,str,bool
"""Phones Modern humans today ar…",0,"""Phones and driving""","""persuade_corpus""",false
"""This essay will explain if dri…",0,"""Phones and driving""","""persuade_corpus""",false
"""Driving while the use of cellu…",0,"""Phones and driving""","""persuade_corpus""",false
"""Phones & Driving Drivers shou…",0,"""Phones and driving""","""persuade_corpus""",false
"""Cell Phone Operation While Dri…",0,"""Phones and driving""","""persuade_corpus""",false
…,…,…,…,…
"""Dear Senator, I am writing to…",1,"""Does the electoral college wor…","""kingki19_palm""",true
"""Dear Senator, I am writing to…",1,"""Does the electoral college wor…","""kingki19_palm""",true
"""Dear Senator, I am writing to…",1,"""Does the electoral college wor…","""kingki19_palm""",true
"""Dear Senator, I am writing to…",1,"""Does the electoral college wor…","""kingki19_palm""",true


In [3]:
logger.info(f"Loaded {len(df)} samples")

# Hold out 20% of the essays for validation. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed seed makes the split repeatable.
texts = df["text"].to_numpy()
labels = df["label"].to_numpy()
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
logger.info(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")


[32m2025-11-24 21:53:36.908[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoaded 44868 samples[0m
[32m2025-11-24 21:53:36.944[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTraining samples: 35894, Validation samples: 8974[0m


In [4]:
# Peek at the first validation essay to eyeball the raw text handed to the vectorizer.
X_val[0]

"Advice about receiving advice\n\nDoing something one way and then realizing there was a great amount of other options is a very annoying (and unfortunately, very common) case. It's even more annoying when you don't realize what the other solution was because then you can never grow as a person. People should ask more than one person for advice when seeking it because not everything will work for everyone and it's always better to hear from different sources.\n\nNot everything will work for everyone, as everybody has their own way of dealing with certain things. Say there's a student in algebra honors and they are studying for a test that is stressing them out very much. Now, this student has trouble paying attention in class, and as a result of that, they often don't have the best of notes. If the student's parent is giving them advice and the parent says to the student that they should check their notes, the student will most likely have to seek help elsewhere because that advice is 

In [5]:
# Use Rust preprocessing (via Python bindings).
# The vectorizer is fitted on the training texts only; the validation texts are
# transformed with the already-fitted vectorizer.
logger.info("Fitting Rust TF-IDF vectorizer...")
# NOTE(review): the first positional argument is presumably the n-gram range
# (3 to 5) — confirm against the TfidfVectorizer signature.
vectorizer = TfidfVectorizer((3, 5), min_df=10)
# Both calls return scipy.sparse.csr_matrix.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

[32m2025-11-24 21:53:36.955[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mFitting Rust TF-IDF vectorizer...[0m
DEBUG:slop_pre_processing.pre_processor.vectorizer.tfidf_vectorizer:Fitting TfidfVectorizer num_texts=35894
DEBUG:slop_pre_processing.pre_processor.vectorizer.count_vectorizer:Optimized fit_transform: tokenizing and computing n-grams once num_texts=35894
DEBUG:slop_pre_processing.pre_processor.vectorizer.tokenizer:Using parallel tokenization num_texts=35894
DEBUG:slop_pre_processing.pre_processor.vectorizer.count_vectorizer:Computing n-grams for all documents
DEBUG:slop_pre_processing.pre_processor.vectorizer.count_vectorizer:Fitting vectorizer from cached n-grams
DEBUG:slop_pre_processing.pre_processor.vectorizer.count_vectorizer:Building vocabulary from tokenized texts
DEBUG:slop_pre_processing.pre_processor.vectorizer.count_vectorizer:Using pre-computed n-grams for vocabulary building
DEBUG:slop_pre_processing.pre_processor.vectorizer.count

In [6]:
# Check the feature matrices instead of relying on a hard-coded expected width:
# one row per training text, and the validation matrix must share the same columns.
assert X_train_tfidf.shape[0] == len(X_train), "row count must match the number of training texts"
assert X_val_tfidf.shape == (len(X_val), X_train_tfidf.shape[1]), (
    "validation features must have one row per text and the training vocabulary width"
)
print(X_train_tfidf.shape)  # (num_samples, vocabulary_size)

(35894, 380032)


In [7]:
# Confirm which sparse container type the vectorizer returned for each split.
for features in (X_train_tfidf, X_val_tfidf):
    print(type(features))

<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>


In [8]:
# Re-wrap both feature matrices in scipy's sparse *array* container
# (the vectorizer hands back csr_matrix objects).
X_train_tfidf, X_val_tfidf = (
    csr_array(matrix) for matrix in (X_train_tfidf, X_val_tfidf)
)

In [9]:
logger.info(f"Feature matrix: {X_train_tfidf.shape}")

# Count cells with plain Python integers. rows * cols is on the order of 1e10,
# which would wrap silently in np.prod wherever NumPy's default integer is
# 32-bit; Python ints have arbitrary precision.
n_rows, n_cols = X_train_tfidf.shape
total_cells = n_rows * n_cols
# Sparsity = percentage of cells that are NOT explicitly stored.
sparsity_pct = 100 * (1 - X_train_tfidf.nnz / total_cells)
logger.info(f"Sparsity: {sparsity_pct:.2f}%")


[32m2025-11-24 21:54:01.170[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mFeature matrix: (35894, 380032)[0m
[32m2025-11-24 21:54:01.171[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSparsity: 99.90%[0m


In [10]:
# Train a soft-voting ensemble: multinomial naive Bayes blended with a linear
# model trained by SGD on the modified-Huber loss.
logger.info("Training ensemble...")

nb = MultinomialNB(alpha=0.02)
sgd = SGDClassifier(
    loss="modified_huber",
    max_iter=8000,
    tol=1e-4,
    random_state=42,
)

# Soft voting averages the members' class probabilities using these weights
# (0.4 for naive Bayes, 0.6 for SGD).
voting_settings = dict(
    estimators=[("nb", nb), ("sgd", sgd)],
    weights=[0.4, 0.6],
    voting="soft",
    n_jobs=-1,
    flatten_transform=False,
)
ensemble = VotingClassifier(**voting_settings)
ensemble.fit(X_train_tfidf, y_train)

# Store the weights as an ndarray once fitting is done.
# NOTE(review): presumably needed by the ONNX converter — confirm.
ensemble.weights = np.asarray(ensemble.weights)


[32m2025-11-24 21:54:01.177[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTraining ensemble...[0m


In [11]:
# Score the held-out split: column 1 of predict_proba is the probability of
# class 1, which is the ranking score ROC AUC is computed from.
val_proba = ensemble.predict_proba(X_val_tfidf)
val_preds = val_proba[:, 1]
val_auc = roc_auc_score(y_val, val_preds)
logger.info(f"Validation AUC: {val_auc:.4f}")


[32m2025-11-24 21:54:02.936[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mValidation AUC: 0.9997[0m


In [12]:
# Display the validation label array that the metrics are computed against.
y_val

array([0, 0, 0, ..., 0, 1, 1], shape=(8974,))

In [13]:
# Confusion matrix
# Turn class-1 probabilities into hard labels at a fixed cut-off. The labels are
# integers so they have the same dtype as y_val.
threshold = 0.5
y_pred_labels = (val_preds >= threshold).astype(int)

# labels=[0, 1] pins the matrix to 2x2 in (class 0, class 1) order, so the
# four-way unpacking is valid regardless of which classes actually occur.
tn, fp, fn, tp = confusion_matrix(y_val, y_pred_labels, labels=[0, 1]).ravel()

# Metrics (zero_division=0 reports 0 instead of warning when a denominator is empty)
accuracy = accuracy_score(y_val, y_pred_labels)
precision = precision_score(y_val, y_pred_labels, zero_division=0)
recall = recall_score(y_val, y_pred_labels, zero_division=0)
f1 = f1_score(y_val, y_pred_labels, zero_division=0)
print(f"\nMetrics (threshold={threshold:.2f}):")
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
print("\nConfusion Matrix:")
print("              Predicted")
print("              0      1")
print(f"Actual  0    {tn:5d}  {fp:5d}")
print(f"        1    {fn:5d}  {tp:5d}")


Metrics (threshold=0.50):
  Accuracy:  0.9936
  Precision: 0.9980
  Recall:    0.9857
  F1 Score:  0.9918
  TP: 3450, FP: 7, TN: 5467, FN: 50

Confusion Matrix:
              Predicted
              0      1
Actual  0     5467      7
        1       50   3450


In [14]:
# Show the repr of the training feature matrix (dtype, stored-element count, shape).
X_train_tfidf

<Compressed Sparse Row sparse array of dtype 'float64'
	with 13346537 stored elements and shape (35894, 380032)>

In [15]:
# Write every artifact needed for inference into one directory.
output_dir = Path("../model_artifacts")
output_dir.mkdir(exist_ok=True)

# The vectorizer is stored twice:
#   - JSON-wrapped format for Python (with metadata)
#   - raw bincode format for Rust (no JSON wrapper)
vectorizer.save(output_dir / "tfidf_vectorizer.json")
vectorizer.save_raw_bincode(output_dir / "tfidf_vectorizer.bin")

# Convert the ensemble to ONNX. A single dense float32 row is enough for the
# converter to infer the input shape.
shape_sample = X_train_tfidf[:1].astype(np.float32).toarray()
# ZipMap off: probabilities are emitted as a 2D tensor [batch_size, num_classes]
# rather than a list of dicts.
conversion_options = {type(ensemble): {"zipmap": False}}
onx = to_onnx(
    ensemble,
    shape_sample,
    target_opset=15,
    options=conversion_options,
)

model_name = output_dir / "slop-classifier.onnx"
model_name.write_bytes(onx.SerializeToString())

print(f"Model saved! Input shape: [batch_size, {X_train_tfidf.shape[1]}]")
print("Output: probabilities [batch_size, 2]")

DEBUG:skl2onnx:[Var] +Variable('X', 'X', type=FloatTensorType(shape=[None, 380032]))
DEBUG:skl2onnx:[Var] update is_root=True for Variable('X', 'X', type=FloatTensorType(shape=[None, 380032]))
DEBUG:skl2onnx:[parsing] found alias='SklearnVotingClassifier' for type=<class 'sklearn.ensemble._voting.VotingClassifier'>.
DEBUG:skl2onnx:[Op] +Operator(type='SklearnVotingClassifier', onnx_name='SklearnVotingClassifier', inputs='', outputs='', raw_operator=VotingClassifier(estimators=[('nb',MultinomialNB(alpha=0.02)),('sgd',SGDClassifier(loss='modified_huber',max_iter=8000,random_state=42,tol=0.0001))],flatten_transform=False,n_jobs=-1,voting='soft',weights=array([0.4,0.6])))
DEBUG:skl2onnx:[Op] add In Variable('X', 'X', type=FloatTensorType(shape=[None, 380032])) to Operator(type='SklearnVotingClassifier', onnx_name='SklearnVotingClassifier', inputs='X', outputs='', raw_operator=VotingClassifier(estimators=[('nb',MultinomialNB(alpha=0.02)),('sgd',SGDClassifier(loss='modified_huber',max_iter=8

Model saved! Input shape: [batch_size, 380032]
Output: probabilities [batch_size, 2]


In [16]:
# Reload the exported file from disk and run ONNX's model checker on it, so the
# artifact that was actually written is the one being validated.
onnx_model = onnx.load(model_name)
onnx.checker.check_model(onnx_model)

In [17]:

# Open an ONNX Runtime inference session on the exported model file.
sess = rt.InferenceSession(model_name)


2025-11-24 21:54:03.451 python[61784:8363778] 2025-11-24 21:54:03.449128 [W:onnxruntime:, graph.cc:4885 CleanUnusedInitializersAndNodeArgs] Removing initializer 'classes_ind'. It is not used by any node and should be removed from the model.


In [18]:
# The name of the graph's first declared input is the key to use when feeding data.
model_inputs = sess.get_inputs()
input_name = model_inputs[0].name
input_name

'X'

In [19]:
# Inspect a one-row slice of the training features (the same slice used as the ONNX conversion sample).
X_train_tfidf[:1]

<Compressed Sparse Row sparse array of dtype 'float64'
	with 342 stored elements and shape (1, 380032)>

In [20]:
# Take the first two training rows as a small fixture for comparing predictions.
test_input = X_train_tfidf[:2]  # kept sparse; the scikit-learn ensemble is called on it as-is
test_input

<Compressed Sparse Row sparse array of dtype 'float64'
	with 694 stored elements and shape (2, 380032)>

In [21]:
# Reference labels from the scikit-learn ensemble for the two fixture rows.
ensemble.predict(test_input)

array([1, 0])

In [24]:
input_name = sess.get_inputs()[0].name

# The exported graph takes a dense float32 tensor, so densify the first two
# training rows before feeding them to the session.
parity_rows = X_train_tfidf[:2]
pred_onx = sess.run(None, {input_name: parity_rows.astype(np.float32).toarray()})
print(pred_onx)

# pred_onx[0] holds the predicted labels and pred_onx[1] the class probabilities.
# Fail loudly if the exported model's labels differ from scikit-learn's on the
# same rows, rather than relying on a visual comparison of printed output.
np.testing.assert_array_equal(pred_onx[0], ensemble.predict(parity_rows))

[array([1, 0], dtype=int64), array([[9.1592313e-17, 1.0000000e+00],
       [9.7694218e-01, 2.3057826e-02]], dtype=float32)]


In [23]:
# Inference