In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.svm import LinearSVC

In [15]:
# Model inputs: the cleaned comment body of each split, with missing values
# replaced by an empty string and every entry coerced to str.
X_train_text, X_val_text = (
    split["clean_body"].fillna("").astype(str)
    for split in (df_train, df_val)
)

# Targets: the rule-violation label column of each split.
y_train, y_val = df_train["rule_violation"], df_val["rule_violation"]


In [16]:
# Materialise both text Series as plain Python lists; these lists are what the
# TF-IDF vectorizer is fitted on / applied to in the next cell.
docs_train, docs_val = (
    texts.tolist() for texts in (X_train_text, X_val_text)
)

In [17]:
# TF-IDF features over unigrams and bigrams.
#   min_df=2          -> drop terms seen in fewer than two training documents
#   max_features      -> cap the vocabulary at the 20,000 most frequent terms
#   sublinear_tf=True -> use 1 + log(tf) instead of the raw term frequency
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=20000,
    sublinear_tf=True,
)

# The vocabulary and IDF weights are learned from the training documents only;
# the validation documents are merely projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(docs_train)
X_val_tfidf = vectorizer.transform(docs_val)

In [18]:
# Linear SVM on the TF-IDF features.
#   C=0.7             -> slightly stronger regularisation than the default C=1.0
#   class_weight=None -> both classes weighted equally (the recorded validation
#                        support is 200 vs 206, i.e. roughly balanced)
#   max_iter=5000     -> raised iteration cap for the solver
#   random_state=42   -> pins the solver's data shuffling so that re-running the
#                        notebook reproduces the same model and metrics (only
#                        relevant when the dual solver is used; harmless
#                        otherwise). The seed value itself is arbitrary.
clf = LinearSVC(
    C=0.7,
    class_weight=None,
    max_iter=5000,
    random_state=42,
)

clf.fit(X_train_tfidf, y_train)

# Hard label predictions on the held-out validation split.
y_pred = clf.predict(X_val_tfidf)

print(classification_report(y_val, y_pred))
print("F1:", f1_score(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.67      0.73       200
           1       0.72      0.84      0.78       206

    accuracy                           0.76       406
   macro avg       0.77      0.76      0.76       406
weighted avg       0.77      0.76      0.76       406

F1: 0.7802690582959642


In [None]:
# === Save the TF-IDF + LinearSVC pipeline and export JSON bundles ===
from sklearn.pipeline import Pipeline
import joblib
from pathlib import Path
import json
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import numpy as np

models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Single source of truth for the artifact names, so the dumped file, the
# exported bundle and the metadata entry cannot drift apart.
pipeline_filename = 'linear_svc_pipeline.joblib'
pipeline_path = models_dir / pipeline_filename
metadata_path = models_dir / 'linear_svc_pipeline_metadata.json'

# Wrap the already-fitted vectorizer and classifier into one pipeline object
# (no refit happens here) and persist it.
pipe = Pipeline([('tfidf', vectorizer), ('clf', clf)])
joblib.dump(pipe, pipeline_path)
print('Saved sklearn pipeline to', pipeline_path)

# Compute validation metrics. Defaults are set up front so that the metadata
# below is always well-defined, even if metric computation fails.
acc = f1 = prec = rec = auc = None
cm = []
try:
    y_val_arr = np.array(y_val)
    y_pred_val = clf.predict(X_val_tfidf)
    acc = float(accuracy_score(y_val_arr, y_pred_val))
    f1 = float(f1_score(y_val_arr, y_pred_val))
    prec = float(precision_score(y_val_arr, y_pred_val))
    rec = float(recall_score(y_val_arr, y_pred_val))
    cm = confusion_matrix(y_val_arr, y_pred_val).tolist()
    # AUC is best-effort: it is computed from the classifier's decision scores
    # when they are available. A failure leaves auc as None, but is reported
    # instead of being swallowed silently.
    try:
        if hasattr(clf, 'decision_function'):
            scores = clf.decision_function(X_val_tfidf)
            auc = float(roc_auc_score(y_val_arr, scores))
    except Exception as e:
        auc = None
        print('Failed to compute AUC:', e)
    print(f"Metrics -- acc: {acc:.4f}, f1: {f1:.4f}, prec: {prec:.4f}, rec: {rec:.4f}, auc: {auc}")
except Exception as e:
    # Keep the all-or-nothing behaviour: a failure anywhere above discards
    # every metric rather than writing a partially filled set.
    print('Failed to compute metrics:', e)
    acc = f1 = prec = rec = auc = None
    cm = []

# Try to export a JSON bundle (frontend-friendly). This step is deliberately
# best-effort: a failure is reported and the metadata is still written.
# NOTE(review): the recorded output of this cell shows the import failing with
# "No module named 'utils'" -- presumably the package that contains `utils`
# has to be on sys.path for this notebook's kernel; confirm and fix the setup.
try:
    from utils.export_sklearn_to_json import export
    export(pipeline_path)
    print('Exported bundle JSON')
except Exception as e:
    print('Export failed:', e)

# Write the model metadata together with the validation metrics.
meta = {
    'name': 'LinearSVC_TFIDF',
    'type': 'sklearn',
    'path': pipeline_filename,
    'description': 'LinearSVC con TF-IDF',
    'metrics': {
        'f1': f1,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'auc': auc,
        'confusion_matrix': cm,
        'n_val': int(len(y_val)) if hasattr(y_val, '__len__') else None
    }
}
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print('Wrote metadata')


Saved sklearn pipeline to ..\models\linear_svc_pipeline.joblib
Metrics -- acc: 0.7586, f1: 0.7803, prec: 0.7250, rec: 0.8447, auc: 0.8179247572815534
Export failed: No module named 'utils'
Wrote metadata
