In [1]:
"""
Script responsável por gerar os artefatos oficiais do modelo v1.
Este script deve ser executado apenas uma vez para congelamento da versão.
"""

'\nScript respons√°vel por gerar os artefatos oficiais do modelo v1.\nEste script deve ser executado apenas uma vez para congelamento da vers√£o.\n'

In [2]:
# Importa√ß√µes
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, roc_curve, f1_score, 
    precision_score, recall_score
)
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Work from the project's `src` directory so that the local packages imported
# in this notebook (`data`, `models`) and the relative paths below resolve.
# The guard keeps the cell re-runnable: once the kernel is already inside
# `src`, a second `os.chdir('src')` would fail or descend into `src/src`.
if os.path.basename(os.getcwd()) != 'src':
    os.chdir('src')

# Output directory for the model artifacts, relative to `src`.
ARTIFACT_PATH = "../artifacts/model_v1/"

from data.load_data import carregar_dados

# Load the raw transactions file through the project loader.
df = carregar_dados("../data/raw/v1/bs140513_032310.csv")

üîÑ Carregando dados...
   Shape original: (594643, 10)
‚úÖ Dados limpos: (594643, 10)
   Normal: 587,443 | Fraude: 7,200


In [4]:
# Distribution of the target variable: absolute count and percentage per class.
fraud_counts = df['fraud'].value_counts().sort_index()
fraud_pct = df['fraud'].value_counts(normalize=True).sort_index() * 100

# Emit one summary line per class so the computed figures are actually shown.
for value, count in fraud_counts.items():
    pct = fraud_pct[value]
    label = 'Normal' if value == 0 else 'Fraude'
    print(f"{label}: {count:,} ({pct:.2f}%)")

In [5]:
# Work on a copy of the loaded dataframe so feature engineering does not modify `df`.
df_features = df.copy()

In [6]:
# ============================================================================
# 1. CUSTOMER-BASED FEATURES
# ============================================================================
# NOTE(review): every aggregate in this cell is computed over the whole
# dataframe, i.e. before the train/test split performed later in the notebook.

# Number of transactions per customer within the same step
freq_step = (
    df_features.groupby(['step', 'customer'])
    .size()
    .reset_index(name='qtd_transacoes')
)
df_features = df_features.merge(freq_step, on=['step', 'customer'])
# Flag rows whose customer made more than 3 transactions in that step
df_features['alert_freq'] = (df_features['qtd_transacoes'] > 3).astype(int)

# Per-customer amount profile (mean and standard deviation)
stats_cliente = (
    df_features.groupby('customer')['amount']
    .agg(['mean', 'std'])
    .reset_index()
)
stats_cliente.columns = ['customer', 'amount_mean_cliente', 'amount_std_cliente']
df_features = df_features.merge(stats_cliente, on='customer')
# The sample std is NaN for a customer with a single transaction; use 0 there.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on
# a column selection operates on a temporary object and is not guaranteed to
# update the dataframe (it does nothing under pandas Copy-on-Write).
df_features['amount_std_cliente'] = df_features['amount_std_cliente'].fillna(0)
# Flag amounts above the customer's mean + 3 standard deviations
df_features['alert_valor'] = (
    df_features['amount'] > (df_features['amount_mean_cliente'] + 3 * df_features['amount_std_cliente'])
).astype(int)

# Amount relative to the customer's mean (epsilon avoids division by zero)
df_features['valor_relativo_cliente'] = df_features['amount'] / (df_features['amount_mean_cliente'] + 1e-6)

# Total number of transactions per customer
df_features['total_tx_cliente'] = (
    df_features.groupby('customer')['amount']
    .transform('count')
)

# Total amount spent by the customer
df_features['volume_total_cliente'] = (
    df_features.groupby('customer')['amount']
    .transform('sum')
)

# Number of distinct categories per customer
df_features['num_categorias_cliente'] = (
    df_features.groupby('customer')['category']
    .transform('nunique')
)

# Number of distinct merchants per customer
df_features['num_merchants_cliente'] = (
    df_features.groupby('customer')['merchant']
    .transform('nunique')
)

In [7]:
# ============================================================================
# 2. TEMPORAL FEATURES
# ============================================================================

# Order rows by customer, then by step, and renumber the index from zero.
df_features = df_features.sort_values(by=['customer', 'step'], ignore_index=True)

# Per-customer views used by the rolling / diff computations below.
steps_by_customer = df_features.groupby('customer')['step']
amounts_by_customer = df_features.groupby('customer')['amount']

# NOTE(review): both rolling windows span the customer's last 5 ROWS
# (transactions, current one included), not the last 5 distinct step values.
df_features = df_features.assign(
    # Count of rows inside the trailing 5-row window
    tx_ultimos_5_steps=steps_by_customer.transform(
        lambda s: s.rolling(window=5, min_periods=1).count()
    ),
    # Steps elapsed since the customer's previous row (0 for the first one)
    step_diff=steps_by_customer.diff().fillna(0),
    # Mean amount inside the trailing 5-row window
    amount_media_5steps=amounts_by_customer.transform(
        lambda a: a.rolling(window=5, min_periods=1).mean()
    ),
)

# Deviation of the current amount from that trailing mean
df_features['amount_desvio_5steps'] = df_features['amount'].sub(
    df_features['amount_media_5steps']
)

In [8]:
# ============================================================================
# 4. CUSTOMER-MERCHANT RELATIONSHIP FEATURES
# ============================================================================

# Number of rows for each (customer, merchant) pair, broadcast to every row
pair_counts = (
    df_features.groupby(['customer', 'merchant'])['amount'].transform('count')
)
df_features['tx_cliente_merchant'] = pair_counts

# 1 when the pair occurs exactly once in the dataframe, else 0.
# NOTE(review): this marks pairs with a single transaction overall, not the
# chronologically first transaction of a pair that occurs several times.
df_features['primeira_tx_merchant'] = pair_counts.eq(1).astype(int)

# Share of the customer's transactions that went to this merchant
df_features['prop_tx_merchant'] = pair_counts.div(df_features['total_tx_cliente'])

In [9]:
# ============================================================================
# 5. LOCATION FEATURES
# ============================================================================

# 1 when the origin zipcode equals the merchant zipcode, else 0
same_zip = df_features['zipcodeOri'].eq(df_features['zipMerchant'])
df_features['mesma_localizacao'] = same_zip.astype(int)

# Number of distinct origin zipcodes seen for each customer
zip_by_customer = df_features.groupby('customer')['zipcodeOri']
df_features['num_zipcodes_cliente'] = zip_by_customer.transform('nunique')

In [10]:
# Integer-encode the categorical columns; one encoder per column is kept so
# the fitted mapping remains available afterwards.
# Both encoders are fitted on the full dataframe.
le_gender = LabelEncoder()
df_features['gender_encoded'] = le_gender.fit_transform(df_features['gender'])

le_category = LabelEncoder()
df_features['category_encoded'] = le_category.fit_transform(df_features['category'])

In [11]:
# Candidate features for the model. The order of this list defines the column
# order of X, so it must not be changed casually.
features_to_use = [
    # Original columns
    'step', 'age', 'gender_encoded', 'category_encoded', 'amount',

    # Engineered - customer
    'qtd_transacoes', 'alert_freq', 'alert_valor', 'valor_relativo_cliente',
    'total_tx_cliente', 'volume_total_cliente', 'num_categorias_cliente',
    'num_merchants_cliente', 'amount_mean_cliente', 'amount_std_cliente',

    # Engineered - temporal
    'tx_ultimos_5_steps', 'step_diff', 'amount_media_5steps', 'amount_desvio_5steps',

    # Engineered - merchant
    # (disabled: 'tx_por_merchant_train', 'fraude_merchant_train', 'amount_mean_merchant')
    'amount_std_merchant',

    # Engineered - customer/merchant relationship
    'tx_cliente_merchant', 'primeira_tx_merchant', 'prop_tx_merchant',

    # Engineered - location
    'mesma_localizacao', 'num_zipcodes_cliente',

    # Engineered - category
    # (disabled: 'fraude_categoria')
    'amount_mean_categoria', 'amount_desvio_categoria',

    # Rule-based scores
    'qtd_alertas', 'score_regra'
]

# Keep only the features that actually exist in the dataframe, and say which
# ones were dropped so the final feature set is never reduced silently.
# print() is used because the notebook globally suppresses warnings.
missing_features = [f for f in features_to_use if f not in df_features.columns]
if missing_features:
    print(
        f"WARNING: {len(missing_features)} requested feature(s) not found in "
        f"df_features and dropped: {missing_features}"
    )
    features_to_use = [f for f in features_to_use if f in df_features.columns]

# Feature matrix: force numeric dtypes (non-numeric values become NaN),
# turn +/-inf into NaN, then fill every NaN with 0.
# NOTE(review): the coercion silently zeroes any value that is not parseable
# as a number — confirm that every selected column is numeric at this point.
X = (
    df_features[features_to_use]
    .apply(pd.to_numeric, errors='coerce')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
y = df_features['fraud'].copy()

In [12]:
# Train/test split through the project helper `split_data`.
# NOTE(review): the original comment says the split is stratified; that would
# happen inside `split_data`, which is not visible here — confirm.
from models.v1.base_model import split_data

X_train, X_test, y_train, y_test = split_data(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Feature scaling through the project helper `scaler_data`, which returns the
# scaled train and test sets plus the scaler object.
# NOTE(review): presumably the scaler is fitted on X_train only — confirm in
# models/v1/scaler.py.
from models.v1.scaler import scaler_data

X_train_scaled, X_test_scaled, scaler = scaler_data(X_train, X_test)

### Modelagem de Machine Learning

In [14]:
# Train and evaluate the Gradient Boosting model through the project pipeline.
# NOTE(review): the pipeline receives the unscaled X_train / X_test, not the
# *_scaled versions, while the scaler is what gets saved as an artifact.
# Unless `pipeline` scales internally, inference code must NOT apply the
# scaler before calling this model — confirm against models/v1/pipeline.py.
from models.v1.pipeline import pipeline

model_type = "gb"
gb_results = pipeline(X_train, X_test, y_train, y_test, model_type)


Treinando: Gradient Boosting

M√©tricas no conjunto de teste:
  Precision: 0.9180
  Recall:    0.8007
  F1-Score:  0.8553
  ROC-AUC:   0.9982
  Tempo (1000): 611.3882s

Matriz de Confus√£o:
[[117386    103]
 [   287   1153]]

Relat√≥rio de Classifica√ß√£o:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00    117489
      Fraude       0.92      0.80      0.86      1440

    accuracy                           1.00    118929
   macro avg       0.96      0.90      0.93    118929
weighted avg       1.00      1.00      1.00    118929

‚úÖ Fun√ß√£o de avalia√ß√£o criada!


In [15]:
# Save artifacts

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(ARTIFACT_PATH, exist_ok=True)

# Trained model
joblib.dump(gb_results['model'], os.path.join(ARTIFACT_PATH, "model.pkl"))

# Preprocessing state needed to rebuild the model inputs outside this kernel:
# the fitted label encoders and the exact feature order used for training.
joblib.dump(
    {'gender': le_gender, 'category': le_category},
    os.path.join(ARTIFACT_PATH, "label_encoders.pkl"),
)
joblib.dump(list(features_to_use), os.path.join(ARTIFACT_PATH, "feature_names.pkl"))

# Scaler (kept as the last expression so the cell output is unchanged)
joblib.dump(scaler, os.path.join(ARTIFACT_PATH, "scaler.pkl"))

['../artifacts/model_v1/scaler.pkl']