In [1]:
import json
import numpy as np
import pandas as pd
import joblib
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# ================================
# Caminhos
# ================================
json_path = r"C:\Users\Willgnner\Documents\TJ-GO\Perfil-Conciliação-Xgboost-GCP-V2\dados\cejusc_14052025\dados_processos_cejusc_14052025_V1_enriquecidos.json"
embedding_path = r"C:\Users\Willgnner\Documents\TJ-GO\Perfil-Conciliação-Xgboost-GCP-V2\dados\Embeddings\dados_processos_cejusc_14052025_V1_enriquecidos_Embeddings.npy"
modelo_saida = r"C:\Users\Willgnner\Documents\TJ-GO\Perfil-Conciliação-Xgboost-GCP-V2\Notebooks\CatBoost\CatBoost.pkl"

# ================================
# Carga dos dados
# ================================
with open(json_path, 'r', encoding='utf-8') as f:
    dados = json.load(f)

df = pd.DataFrame(dados)
embeddings = np.load(embedding_path)
df = df.iloc[:embeddings.shape[0]]

In [2]:
# ================================
# Engenharia de features
# ================================
prefixos = ("tem_", "n_", "tipo_", "cita_", "pede_", "houve_", "tentou_", "usa_", "problema_", "citou_")
colunas_struct = [c for c in df.columns if c.startswith(prefixos)]
cat_cols = [c for c in colunas_struct if df[c].dtype == 'object']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
colunas_final = [c for c in df.columns if c.startswith(prefixos)]
X_struct = df[colunas_final].fillna(0).astype(float).values

X = np.hstack((embeddings, X_struct))
y = df["classificacao"].map({'frutifero': 1, 'infrutifero': 0}).values

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [3]:
# ================================
# Treinar modelo
# ================================
modelo = CatBoostClassifier(
    iterations=600,
    learning_rate=0.03,
    depth=5,
    auto_class_weights='Balanced',
    random_seed=42,
    verbose=0
)
modelo.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1bf9a497b60>

In [4]:
# ================================
# Melhor threshold
# ================================
y_proba = modelo.predict_proba(X_test)[:, 1]
thresholds = np.linspace(0.1, 0.9, 30)
f1_scores = [f1_score(y_test, (y_proba > t).astype(int)) for t in thresholds]
melhor_threshold = thresholds[np.argmax(f1_scores)]
y_pred = (y_proba > melhor_threshold).astype(int)

In [5]:
# ================================
# Avaliação
# ================================
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
total_erros = (y_test != y_pred).sum()

print("\nAvaliação - CatBoost")
print(f"  - Acurácia:      {acc:.4f}")
print(f"  - Precisão:      {prec:.4f}")
print(f"  - Recall:        {rec:.4f}")
print(f"  - F1-score:      {f1:.4f}")
print(f"  - Erros:         {total_erros} / {len(y_test)}")
print(f"  - Threshold ótimo: {melhor_threshold:.2f}")

joblib.dump(modelo, modelo_saida)


Avaliação - CatBoost
  - Acurácia:      0.5002
  - Precisão:      0.4626
  - Recall:        0.9046
  - F1-score:      0.6121
  - Erros:         1105 / 2211
  - Threshold ótimo: 0.35


['C:\\Users\\Willgnner\\Documents\\TJ-GO\\Perfil-Conciliação-Xgboost-GCP-V2\\Notebooks\\CatBoost\\CatBoost.pkl']