### Premisa
En este notebook se analizan los pasos seguidos para la creación del modelo final utilizado en la herramienta

---

Carga de librerias

In [13]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pandas as pd
import pickle
import gdown
from google.colab import files

Carga de datasets y label encoders

In [2]:
# Descargar archivos desde Google Drive
hi_small_id = '1fDw0plQy898cw5aPZ8qBfP61djO6ssDH'
low_small_id = '1LIGRW_huhOGhS1Xl1n-v39R7KweRYMny'
gdown.download(f'https://drive.google.com/uc?export=download&id={hi_small_id}', 'HI-Small_Trans.csv', quiet=False)
gdown.download(f'https://drive.google.com/uc?export=download&id={low_small_id}', 'LI-Small_Trans.csv', quiet=False)

# Cargar los datasets
hi_small = pd.read_csv('HI-Small_Trans.csv')
low_small = pd.read_csv('LI-Small_Trans.csv')

# Descargar los label encoders utilizando gdown
encoders_file_id = '1fi9D8-UYu541eiHGDWugnat1Vavd9a3h'
gdown.download(f'https://drive.google.com/uc?export=download&id={encoders_file_id}', 'label_encoders1.pkl', quiet=False)


Downloading...
From (original): https://drive.google.com/uc?export=download&id=1fDw0plQy898cw5aPZ8qBfP61djO6ssDH
From (redirected): https://drive.google.com/uc?export=download&id=1fDw0plQy898cw5aPZ8qBfP61djO6ssDH&confirm=t&uuid=2f3b8f87-80a3-4501-b001-8fd741e6590f
To: /content/HI-Small_Trans.csv
100%|██████████| 476M/476M [00:05<00:00, 87.3MB/s]
Downloading...
From (original): https://drive.google.com/uc?export=download&id=1LIGRW_huhOGhS1Xl1n-v39R7KweRYMny
From (redirected): https://drive.google.com/uc?export=download&id=1LIGRW_huhOGhS1Xl1n-v39R7KweRYMny&confirm=t&uuid=d0d0dcbf-867e-4dc4-9020-52f98e25d801
To: /content/LI-Small_Trans.csv
100%|██████████| 650M/650M [00:09<00:00, 66.4MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1fi9D8-UYu541eiHGDWugnat1Vavd9a3h
To: /content/label_encoders1.pkl
100%|██████████| 71.7M/71.7M [00:01<00:00, 51.6MB/s]


'label_encoders1.pkl'

In [3]:
num_ones = low_small['Is Laundering'].value_counts().get(1, 0)
num_zeros = low_small['Is Laundering'].value_counts().get(0, 0)

print(f"Número de 1s: {num_ones}")
print(f"Número de 0s: {num_zeros}")

Número de 1s: 3565
Número de 0s: 6920484


In [4]:
low_small['Timestamp'] = pd.to_datetime(low_small['Timestamp'])

low_small['Year'] = low_small['Timestamp'].dt.year
low_small['Month'] = low_small['Timestamp'].dt.month
low_small['Day'] = low_small['Timestamp'].dt.day
low_small['Hour'] = low_small['Timestamp'].dt.hour
low_small['Minute'] = low_small['Timestamp'].dt.minute

low_small.drop(columns=['Timestamp'], inplace=True)
low_small.rename(columns={'Account': 'Account2', 'Account.1': 'Account4'}, inplace=True)

In [5]:
with open('label_encoders1.pkl', 'rb') as file:
    label_encoders = pickle.load(file)

for col, le in label_encoders.items():
    if col in low_small.columns:
        low_small[col] = le.transform(low_small[col])

Se está realizando la separación de datos en variables dependientes  e independientes

In [6]:
X = low_small.drop('Is Laundering', axis=1)
y = low_small['Is Laundering']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Creación del modelo XGBoost

In [7]:
model_xgb = xgb.XGBClassifier(random_state=42, scale_pos_weight=len(y_train) / sum(y_train))
model_xgb.fit(X_train, y_train)

Prueba del modelo con XGboost básico

In [8]:
y_pred = model_xgb.predict(X_test)
y_probs = model_xgb.predict_proba(X_test)[:, 1]

threshold = 0.5
y_pred_adjusted = (y_probs >= threshold).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred_adjusted))
print("Classification Report:\n", classification_report(y_test, y_pred_adjusted))

conf_matrix = confusion_matrix(y_test, y_pred_adjusted)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9311075165546177
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.93      0.96   1384099
           1       0.01      0.74      0.01       711

    accuracy                           0.93   1384810
   macro avg       0.50      0.83      0.49   1384810
weighted avg       1.00      0.93      0.96   1384810

Confusion Matrix:
 [[1288882   95217]
 [    186     525]]


Prueba del modelo ajustando hiperparámetros, umbrales y utilizando el GridSearchCV para encontrar el mejor modelo

In [9]:
param_grid = {
    'scale_pos_weight': [10, 20, 30, 40],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}

xgb_model = xgb.XGBClassifier(random_state=42)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='recall', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_probs = best_model.predict_proba(X_test)[:, 1]

threshold = 0.3
y_pred_adjusted = (y_probs >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred_adjusted)
class_report = classification_report(y_test, y_pred_adjusted)
conf_matrix = confusion_matrix(y_test, y_pred_adjusted)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(class_report)
print("Confusion Matrix:")
print(conf_matrix)

Fitting 3 folds for each of 108 candidates, totalling 324 fits




Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'scale_pos_weight': 40}
Accuracy: 0.9948902737559665
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1384099
           1       0.04      0.34      0.06       711

    accuracy                           0.99   1384810
   macro avg       0.52      0.67      0.53   1384810
weighted avg       1.00      0.99      1.00   1384810

Confusion Matrix:
[[1377490    6609]
 [    467     244]]


Los resultados han mejorado un poco tras los ajustes del umbral, el problema que estamos perdiendo mas datos que en el modelo donde no tocabamos los umbrales.

En este tenemos menos falsos positivos pero perdemos mas datos y tenemos menos positivos

---

Prueba de un nuevo ajuste del modelo con RandomizedSearchCV para hacer una búsqueda de valores que arrojen un equilibrio entre precisión y recall

In [10]:
param_dist = {
    'scale_pos_weight': [30, 40, 50, 60, 70],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'n_estimators': [200, 300, 400]
}

xgb_model = xgb.XGBClassifier(random_state=42)
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist,
                                   n_iter=20, scoring='recall', cv=3, verbose=2, n_jobs=-1, random_state=42)

random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

y_probs = best_model.predict_proba(X_test)[:, 1]
threshold = 0.25
y_pred_adjusted = (y_probs >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred_adjusted)
class_report = classification_report(y_test, y_pred_adjusted)
conf_matrix = confusion_matrix(y_test, y_pred_adjusted)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(class_report)
print("Confusion Matrix:")
print(conf_matrix)

Fitting 3 folds for each of 20 candidates, totalling 60 fits




Best Parameters: {'scale_pos_weight': 70, 'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.05}
Accuracy: 0.9826488832403001
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1384099
           1       0.02      0.54      0.03       711

    accuracy                           0.98   1384810
   macro avg       0.51      0.76      0.51   1384810
weighted avg       1.00      0.98      0.99   1384810

Confusion Matrix:
[[1360401   23698]
 [    330     381]]


Se crea un modelo con los resultados obtenidos como Best parameters:

Best Parameters:
 {'scale_pos_weight': 70, 'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.05}

In [11]:
model_xgb_optimized = xgb.XGBClassifier(
    scale_pos_weight=70,
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    random_state=42
)

model_xgb_optimized.fit(X_train, y_train)
y_pred = model_xgb_optimized.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9975122941053285
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1384099
           1       0.07      0.30      0.11       711

    accuracy                           1.00   1384810
   macro avg       0.53      0.65      0.55   1384810
weighted avg       1.00      1.00      1.00   1384810

Confusion Matrix:
 [[1381150    2949]
 [    496     215]]


Guardado del modelo en formato PKL

In [15]:
with open('model_xgb_optimized.pkl', 'wb') as file:
    pickle.dump(model_xgb_optimized, file)

files.download('model_xgb_optimized.pkl')

print("Modelo guardado y descargado como 'model_xgb_optimized.pkl'")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Modelo guardado y descargado como 'model_xgb_optimized.pkl'


---

### Prueba del modelo normal y el modificado sobre el dataset HI Small

Preparación de datos para aplicar los label encoders

In [16]:
hi_small['Timestamp'] = pd.to_datetime(hi_small['Timestamp'])

# Extraer Year, Month, Day, Hour, Minute
hi_small['Year'] = hi_small['Timestamp'].dt.year
hi_small['Month'] = hi_small['Timestamp'].dt.month
hi_small['Day'] = hi_small['Timestamp'].dt.day
hi_small['Hour'] = hi_small['Timestamp'].dt.hour
hi_small['Minute'] = hi_small['Timestamp'].dt.minute

hi_small.drop(columns=['Timestamp'], inplace=True)
hi_small.rename(columns={'Account': 'Account2', 'Account.1': 'Account4'}, inplace=True)

In [20]:
with open('label_encoders1.pkl', 'rb') as file:
    label_encoders = pickle.load(file)

# Transformar las columnas en el dataset hi_small
for col, le in label_encoders.items():
    if col in hi_small.columns:
        hi_small[col] = le.transform(hi_small[col])

In [21]:
X_hi = hi_small.drop('Is Laundering', axis=1)
y_hi = hi_small['Is Laundering']

Prueba del modelo XGboost sin modificar sobre el dataset HI_Small

In [22]:
y_hi_pred = model_xgb.predict(X_hi)

print("Accuracy on HI-Small:", accuracy_score(y_hi, y_hi_pred))
print("Classification Report on HI-Small:\n", classification_report(y_hi, y_hi_pred))

conf_matrix_hi = confusion_matrix(y_hi, y_hi_pred)
print("Matriz de Confusión:\n", conf_matrix_hi)

Accuracy on HI-Small: 0.9297192293946158
Classification Report on HI-Small:
               precision    recall  f1-score   support

           0       1.00      0.93      0.96   5073168
           1       0.01      0.71      0.02      5177

    accuracy                           0.93   5078345
   macro avg       0.50      0.82      0.49   5078345
weighted avg       1.00      0.93      0.96   5078345

Matriz de Confusión:
 [[4717754  355414]
 [   1496    3681]]


Prueba del modelo XGBoost modificado sobre el dataset HI_Small

In [23]:
y_hi_pred = model_xgb_optimized.predict(X_hi)

print("Accuracy on HI-Small:", accuracy_score(y_hi, y_hi_pred))
print("Classification Report on HI-Small:\n", classification_report(y_hi, y_hi_pred))

conf_matrix_hi = confusion_matrix(y_hi, y_hi_pred)
print("Matriz de Confusión:\n", conf_matrix_hi)

Accuracy on HI-Small: 0.9964433294705263
Classification Report on HI-Small:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   5073168
           1       0.10      0.33      0.16      5177

    accuracy                           1.00   5078345
   macro avg       0.55      0.66      0.58   5078345
weighted avg       1.00      1.00      1.00   5078345

Matriz de Confusión:
 [[5058592   14576]
 [   3486    1691]]
