# Objetivos
- Reducir falsos positivos en perfiles con transacciones complejas.
- Evitar alertas innecesarias en clientes con comportamiento legítimo pero no convencional.

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# EDA

In [None]:
# Load the dataset CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv("feature_engineering_work/dataset_feature_engineering.csv")
# Bare expression: shown as the cell output when run in a notebook.
df

In [None]:
# Summary statistics of the DataFrame, shown as the cell output.
df.describe()

In [None]:
# Column labels of the DataFrame, shown as the cell output.
df.columns

In [None]:
# Print the DataFrame's column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Class balance: relative frequency of each `is_fraud` value.
fraud_ratio = df['is_fraud'].value_counts(normalize=True)
print("Distribución de fraudes:", fraud_ratio, sep="\n")

# Absolute number of rows per class as a bar chart.
sns.countplot(data=df, x='is_fraud')
plt.title('Distribución de Fraudes vs No Fraudes')
plt.show()


In [None]:
# Correlation of every float64/int64 column (target excluded) with `is_fraud`,
# ordered by absolute value so the strongest relationships come first.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).drop(columns=['is_fraud'])
corr = numeric_cols.corrwith(df['is_fraud'])
corr = corr.sort_values(key=abs, ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(y=corr.index, x=corr.values)
plt.title("Correlación de variables numéricas con `is_fraud`")
plt.show()


In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# One count plot per time feature, each split by the fraud label.
for axis, column, title in zip(
    ax,
    ('hour', 'trans_day'),
    ('Transacciones por Hora', 'Transacciones por Día del Mes'),
):
    sns.countplot(x=column, hue='is_fraud', data=df, ax=axis)
    axis.set_title(title)

plt.tight_layout()
plt.show()


In [None]:
# Restrict the merchant plot to the ten merchants with the most rows.
top_merchants = df['merchant'].value_counts().head(10).index
df_top = df.loc[df['merchant'].isin(top_merchants)]

# Same plot recipe twice: (data, column on the y axis, title).
for frame, column, title in (
    (df_top, 'merchant', "Fraudes en Comercios Más Frecuentes"),
    (df, 'category', "Fraudes por Categoría de Compra"),
):
    sns.countplot(y=column, hue='is_fraud', data=frame)
    plt.title(title)
    plt.show()


In [None]:
# 1. COMPLEX-PROFILE FLAG
# A row is flagged only when all three conditions below hold.
high_yearly_spend = df['amt_year'].gt(5000)  # more than Q5000 spent during the year
repeat_merchant = df['times_shopped_at_merchant_year'].gt(10)  # more than 10 purchases at the same merchant in the year
frequent_online = df['count_month_shopping_net'].gt(5)  # more than 5 online purchases in the month

df['is_complex_profile'] = high_yearly_spend & repeat_merchant & frequent_online

In [None]:
# Legitimate (is_fraud == 0) transactions that belong to complex profiles.
non_fraud_complex = df.loc[df['is_complex_profile'] & df['is_fraud'].eq(0)]
print(f"Transacciones complejas legítimas: {len(non_fraud_complex)}")


In [None]:
# Fraud rate within complex profiles: normalized counts of each label value.
print("\nTasa de fraude en perfiles complejos:")
complex_labels = df.loc[df['is_complex_profile'], 'is_fraud']
print(complex_labels.value_counts(normalize=True))

In [None]:
# Share of fraudulent rows per purchase category, highest first.
fraude_por_cat = df.groupby('category')['is_fraud'].mean()
fraude_por_cat = fraude_por_cat.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
fraude_por_cat.plot.barh(title='Proporción de fraude por categoría')
plt.tight_layout()
plt.show()

In [None]:
# Share of fraudulent rows for each value of `hour` (mean of the 0/1 label).
fraude_por_hora = df.groupby('hour')['is_fraud'].mean()

plt.figure(figsize=(10, 4))
fraude_por_hora.plot.bar(title='% de fraudes por hora')
plt.xlabel('Hora del día')
plt.ylabel('% fraude')
plt.tight_layout()
plt.show()

In [None]:
# PCA projection (exploration only)
numeric_cols = df.select_dtypes(include=['float64', 'int64']).drop(columns=['is_fraud'])

# Missing values are replaced by 0 before standardizing; the PCA is fitted on
# every row of `df`.
X_scaled = StandardScaler().fit_transform(numeric_cols.fillna(0))
# Fixed seed so the projection is reproducible if a randomized SVD solver is used.
pca = PCA(n_components=2, random_state=42).fit_transform(X_scaled)

# Keep the components in a separate plotting frame instead of writing them
# into `df`: columns added to `df` here would be picked up as model features
# further down, and would be fed back into the PCA if this cell is re-run.
pca_view = df[['is_fraud', 'is_complex_profile']].assign(
    pca1=pca[:, 0],
    pca2=pca[:, 1],
)

# Plot at most 5000 points; never ask for more rows than the frame has.
sample_size = min(5000, len(pca_view))

plt.figure(figsize=(10, 6))
sns.scatterplot(data=pca_view.sample(sample_size, random_state=42), x='pca1', y='pca2',
                hue='is_fraud', style='is_complex_profile', alpha=0.4)
plt.title("Distribución PCA: fraudes y perfiles complejos")
plt.tight_layout()
plt.show()

# FIN EDA

In [None]:
# 2. CONVERT CATEGORICAL COLUMNS
# Object-dtype columns are turned into pandas categoricals; per the original
# note, training fails if they are left as plain objects.
categorical_candidates = df.select_dtypes(include='object').columns
for column_name in categorical_candidates:
    df[column_name] = pd.Categorical(df[column_name])

In [None]:
# 3. TEMPORAL SPLIT
# Rows with trans_month below 12 form the training set; rows with
# trans_month equal to 12 form the test set.
# NOTE(review): only the month number is compared — confirm the data covers a
# single year, otherwise this is not a strictly chronological split.
train_df = df.loc[df['trans_month'].lt(12)].copy()
test_df = df.loc[df['trans_month'].eq(12)].copy()

# Every remaining column of `df` other than the label becomes a feature.
X_train, y_train = train_df.drop(columns=['is_fraud']), train_df['is_fraud']
X_test, y_test = test_df.drop(columns=['is_fraud']), test_df['is_fraud']

In [None]:
# 4. EVALUATION FUNCTIONS
def feval_penalty_fp_complex(preds, train_data):
    """Custom metric: true positives over complexity-penalized false positives.

    A false positive on a complex-profile row counts twice; any other false
    positive counts once. A row is treated as complex when its dataset weight
    equals 1, the same convention used by the sibling metrics in this file.
    Testing ``weight == 1`` instead of adding the raw weight keeps the metric
    correct both for 0/1 complexity flags and for the 1-vs-3 training weights,
    where the raw weight would penalize the non-complex rows more.

    Args:
        preds: Array of predicted scores; a row is predicted as fraud when
            its score is above 0.5.
        train_data: Object exposing ``get_label()`` and ``get_weight()``.
            When ``get_weight()`` returns None no row is treated as complex.

    Returns:
        Tuple ``('tp_over_penalized_fp', score, True)``; the final ``True``
        marks the metric as higher-is-better.
    """
    labels = train_data.get_label()
    predicted_fraud = preds > 0.5

    weights = train_data.get_weight()
    if weights is None:
        is_complex = np.zeros_like(labels, dtype=bool)
    else:
        is_complex = np.asarray(weights) == 1

    tp = ((labels == 1) & predicted_fraud).sum()
    fp = ((labels == 0) & predicted_fraud).astype(int)
    # Each false positive weighs 1, or 2 when it falls on a complex profile.
    fp_penalized = (fp * (1 + is_complex)).sum()

    # The small constant keeps the ratio defined when nothing is predicted as fraud.
    score = tp / (tp + fp_penalized + 1e-6)
    return 'tp_over_penalized_fp', score, True

# F1 score penalized by false positives on complex profiles
def feval_f1_fp_penalty(preds, train_data):
    """Custom metric: F1 minus 0.1 x (complex false positives / true positives).

    Args:
        preds: Array of predicted scores; a row is predicted as fraud when
            its score is above 0.5.
        train_data: Object exposing ``get_label()`` and ``get_weight()``.
            Rows whose weight equals 1 are treated as complex profiles; when
            ``get_weight()`` returns None no row is.

    Returns:
        Tuple ``('f1_penalized', value, True)``; the final ``True`` marks the
        metric as higher-is-better.
    """
    eps = 1e-6  # keeps every ratio defined when a count is zero

    labels = train_data.get_label()
    flagged = preds > 0.5
    complexity = train_data.get_weight()
    if complexity is None:
        complexity = np.zeros_like(labels)

    is_fraud = labels == 1
    false_alarm = (labels == 0) & flagged

    tp = (is_fraud & flagged).sum()
    fp = false_alarm.sum()
    fn = (is_fraud & ~flagged).sum()

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)

    # False positives that fall on complex profiles, relative to the hits.
    complex_false_alarms = (false_alarm & (complexity == 1)).sum()
    penalty_factor = complex_false_alarms / (tp + eps)

    return 'f1_penalized', f1 - 0.1 * penalty_factor, True

# Plain precision, penalized by false positives on complex profiles
def feval_precision_boosted(preds, train_data):
    """Custom metric: precision minus 0.05 x (complex false positives / true positives).

    Args:
        preds: Array of predicted scores; a row is predicted as fraud when
            its score is above 0.5.
        train_data: Object exposing ``get_label()`` and ``get_weight()``.
            Rows whose weight equals 1 are treated as complex profiles; when
            ``get_weight()`` returns None no row is.

    Returns:
        Tuple ``('precision_boosted', value, True)``; the final ``True`` marks
        the metric as higher-is-better.
    """
    labels = train_data.get_label()
    flagged = preds > 0.5
    complexity = train_data.get_weight()
    if complexity is None:
        complexity = np.zeros_like(labels)

    false_alarm = (labels == 0) & flagged
    tp = ((labels == 1) & flagged).sum()
    fp = false_alarm.sum()
    # The small constant keeps the ratios defined when a count is zero.
    precision = tp / (tp + fp + 1e-6)

    fp_complex = (false_alarm & (complexity == 1)).sum()
    penalty = 0.05 * fp_complex

    return 'precision_boosted', precision - penalty / (tp + 1e-6), True


def compute_scale_pos_weight(y, cap=300):
    """Compute the negative/positive ratio of ``y`` and cap it at ``cap``.

    The uncapped and the capped value are both printed.

    Args:
        y (array-like): Label vector; entries equal to 0 count as negatives
            and entries equal to 1 as positives.
        cap (float): Upper bound applied to the ratio.

    Returns:
        float: The ratio, or ``cap`` when the ratio exceeds it.

    Raises:
        ValueError: If ``y`` contains no entry equal to 1.
    """
    labels = np.asarray(y)

    count_pos = np.sum(labels == 1)
    if count_pos == 0:
        raise ValueError("No hay muestras positivas (fraudes) en los datos.")

    count_neg = np.sum(labels == 0)
    weight = count_neg / count_pos
    capped_weight = cap if cap < weight else weight
    print(f"scale_pos_weight calculado: {weight:.2f} → usado: {capped_weight:.2f}")
    return capped_weight


In [None]:
# 5. DATA PREPARATION FOR LIGHTGBM
# Build the per-row training weights BEFORE the flag column is dropped:
# complex-profile rows get weight 1, every other row gets weight 3.
weights = (~X_train['is_complex_profile']).astype(int) * 2 + 1

# The complex-profile flag itself is not used as a model feature.
X_train_model = X_train.drop(columns=['is_complex_profile'])
X_test_model = X_test.drop(columns=['is_complex_profile'])

train_set = lgb.Dataset(X_train_model, label=y_train, weight=weights)
# NOTE(review): no `weight` is passed here, so the custom metrics in this file,
# which read the complexity flag through `get_weight()`, fall back to "no
# complex rows" on this dataset — confirm that this is intended.
valid_set = lgb.Dataset(X_test_model, label=y_test)

# The helper is the single place where the class ratio is computed, reported
# and capped. It also raises a descriptive ValueError when there are no
# positive samples, so it runs before any other division by the positive count.
scaled_weight = compute_scale_pos_weight(y_train, cap=300)

# Uncapped negative/positive ratio, kept under its original name for any later
# cell that reads it.
fraud_weight = (y_train == 0).sum() / (y_train == 1).sum()

In [None]:
# 6. MODEL PARAMETERS
# Class imbalance is handled through `scale_pos_weight`; the alternative
# `is_unbalance` switch is intentionally left out.
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=31,
    scale_pos_weight=scaled_weight,  # compensates for the class imbalance
    seed=42,
)

In [None]:
# 7. TRAINING
# At most 500 boosting rounds; training stops early after 50 rounds without
# improvement on `valid_set`, and evaluation results are logged every 50 rounds.
# NOTE(review): `valid_set` is built from the test split, so early stopping is
# tuned on the same rows used for the final evaluation — confirm this is acceptable.
model = lgb.train(
    params=params,
    train_set=train_set,
    num_boost_round=500,
    valid_sets=[valid_set],
    feval=feval_penalty_fp_complex,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50),
    ],
)

In [None]:
# 8. EVALUACIÓN FINAL
# Profile-dependent decision threshold
def custom_threshold(preds_proba, is_complex, threshold_simple=0.5, threshold_complex=0.7):
    """Binarize scores with a different cut-off for complex and simple rows.

    Args:
        preds_proba: Array of predicted scores.
        is_complex: Boolean array-like aligned with ``preds_proba``; True
            selects ``threshold_complex`` for that row.
        threshold_simple: Cut-off applied where ``is_complex`` is False.
        threshold_complex: Cut-off applied where ``is_complex`` is True.

    Returns:
        Integer array with 1 where the score is strictly above the row's
        cut-off and 0 elsewhere.
    """
    above_complex_cutoff = preds_proba > threshold_complex
    above_simple_cutoff = preds_proba > threshold_simple
    decisions = np.where(is_complex, above_complex_cutoff, above_simple_cutoff)
    return decisions.astype(int)

# Score the test split and binarize with the profile-dependent thresholds.
preds_proba = model.predict(X_test_model)
preds_bin = custom_threshold(
    preds_proba=preds_proba,
    is_complex=X_test['is_complex_profile'],
)

# Uniform-threshold alternative: preds_bin = (preds_proba > 0.5).astype(int)

print("Matriz de Confusión:", confusion_matrix(y_test, preds_bin), sep="\n")

print("\nClassification Report:", classification_report(y_test, preds_bin, digits=4), sep="\n")

In [None]:
# 9. Custom report: false positives on complex profiles
def report_fp_complex(y_true, y_pred, is_complex):
    """Print false-positive counts overall and for complex profiles.

    Args:
        y_true: Array of true labels (0 or 1).
        y_pred: Array of predicted labels (0 or 1).
        is_complex: Boolean array aligned with the labels.

    Returns:
        None. Three lines are printed: total false positives, false positives
        on complex profiles, and the share of the latter among the former.
    """
    fp_mask = (y_true == 0) & (y_pred == 1)
    n_fp = fp_mask.sum()
    n_fp_complex = (fp_mask & is_complex).sum()
    # The small constant keeps the share defined when there are no false positives.
    share = n_fp_complex / (n_fp + 1e-6)

    print(f"Falsos Positivos totales: {n_fp}")
    print(f"Falsos Positivos en perfiles complejos: {n_fp_complex}")
    print(f"Proporción de FP complejos: {share:.4f}")
# Report false positives on the test split; `.values` hands the underlying arrays to the helper.
report_fp_complex(y_test.values, preds_bin, X_test['is_complex_profile'].values)

In [None]:
# 10. FEATURE IMPORTANCE
# Show the 15 highest-ranked features, using gain as the importance type.
lgb.plot_importance(model, importance_type='gain', max_num_features=15)
plt.title("Top 15 features por ganancia")
plt.tight_layout()
plt.show()

In [None]:
# 11. AUTOMATIC COMPARISON OF THE 3 EVALUATION FUNCTIONS
results = {}

# Evaluation functions to compare, keyed by the label used in the report.
candidate_fevals = {
    "penalty_fp_complex": feval_penalty_fp_complex,
    "f1_fp_penalty": feval_f1_fp_penalty,
    "precision_boosted": feval_precision_boosted,
}

for name, feval_fn in candidate_fevals.items():
    print(f"\nEntrenando con métrica: {name}")

    # Same parameters and datasets on every pass; only the evaluation function
    # changes. Fresh callback objects are created for each training run.
    model = lgb.train(
        params=params,
        train_set=train_set,
        num_boost_round=500,
        valid_sets=[valid_set],
        feval=feval_fn,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),
        ],
    )

    # A single 0.5 cut-off is used here for every row.
    preds_proba = model.predict(X_test_model)
    preds_bin = (preds_proba > 0.5).astype(int)

    print(f"\nResultados para {name}:")
    print(confusion_matrix(y_test, preds_bin))
    print(classification_report(y_test, preds_bin, digits=4))

    results[name] = dict(
        model=model,
        preds_bin=preds_bin,
        preds_proba=preds_proba,
    )

In [None]:
# False positives of one stored run (any other key of `results` works too).
y_pred_bin = results['penalty_fp_complex']['preds_bin']
is_complex = X_test['is_complex_profile']

false_positives = (y_test == 0) & (y_pred_bin == 1)
fp_on_complex = (false_positives & is_complex).sum()
fp_total = false_positives.sum()
print(f"FP complejos: {fp_on_complex} / {fp_total}")