# Proyecto Final - Security Data Science

## Oscar Fernando López Barrios
## Carné 20679

In [24]:
# Importar librerias necesarias
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE

In [25]:
# Montar Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Asignar el documento con las variables realizadas
df = pd.read_csv('/content/drive/My Drive/SDS/Proyecto-Final-SDS/fraud_detection_dataset.csv')

In [27]:
# Mostrar dataframe
df

Unnamed: 0,amt,is_fraud,amt_month_shopping_net_spend,count_month_shopping_net,first_time_at_merchant,hour,minute,second,day,month,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,7.27,0,0.00,0.0,True,12,47,15,1,1,...,0,0,0,0,1,0,0,0,0,0
1,52.94,0,0.00,0.0,True,8,44,57,2,1,...,0,0,0,0,0,0,0,0,0,0
2,82.08,0,0.00,0.0,True,8,47,36,2,1,...,0,0,0,0,0,0,0,0,0,0
3,34.79,0,0.00,0.0,True,12,38,14,2,1,...,0,0,0,1,0,0,0,0,0,0
4,27.18,0,0.00,0.0,True,13,10,46,2,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,66.11,0,160.56,11.0,False,2,10,10,31,12,...,0,0,0,0,0,0,0,0,0,0
1852390,4.58,0,160.56,11.0,False,5,13,35,31,12,...,0,0,0,0,1,0,0,0,0,0
1852391,95.96,0,160.56,11.0,False,11,15,44,31,12,...,0,0,0,0,0,0,0,0,0,0
1852392,149.48,0,160.56,11.0,False,11,17,25,31,12,...,1,0,0,0,0,0,0,0,0,0


In [28]:
# Mostrar los tipos
df.dtypes

amt                             float64
is_fraud                          int64
amt_month_shopping_net_spend    float64
count_month_shopping_net        float64
first_time_at_merchant             bool
                                 ...   
category_misc_pos                 int64
category_personal_care            int64
category_shopping_net             int64
category_shopping_pos             int64
category_travel                   int64
Length: 64, dtype: object

In [29]:
# Mostrar el conteo
fraud_counts = df['is_fraud'].value_counts()
print(fraud_counts)

is_fraud
0    1842743
1       9651
Name: count, dtype: int64


In [30]:
# Eliminar los valores nulos
df.fillna(0, inplace=True)

In [19]:
# Separar características y objetivo
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

## Entrenamiento Incremental

In [20]:
# Función para entrenar el modelo por un período de tiempo específico y devolver métricas de rendimiento
def train_lightgbm(X_train, y_train, X_test, y_test, model=None):
    # Escalar las características para mejorar el rendimiento del modelo
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Aplicar SMOTE
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

    # Crear el dataset de LightGBM
    train_data = lgb.Dataset(X_train_res, label=y_train_res)
    valid_data = lgb.Dataset(X_test_scaled, label=y_test, reference=train_data)

    # Parámetros del modelo
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'max_depth': -1,
        'min_data_in_leaf': 20,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1
    }

    # Callbacks para early stopping
    callbacks = [lgb.early_stopping(stopping_rounds=10, verbose=False)]

    # Entrenar el modelo
    if model is None:
        model = lgb.train(params, train_data, num_boost_round=100, valid_sets=[valid_data], callbacks=callbacks)
    else:
        model = lgb.train(params, train_data, num_boost_round=100, valid_sets=[valid_data], init_model=model, callbacks=callbacks)

    # Predecir en el conjunto de prueba
    y_pred_proba = model.predict(X_test_scaled, num_iteration=model.best_iteration)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calcular métricas de rendimiento
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    return model, accuracy, precision, recall, f1, roc_auc, conf_matrix

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Variables para almacenar las métricas acumuladas
total_accuracy = 0
total_precision = 0
total_recall = 0
total_f1 = 0
total_roc_auc = 0
confusion_matrices = []

# Contador de períodos
period_count = 0

# Inicializar el modelo
model = None

# Loop para entrenar de manera incremental por cada 3 meses
for year in range(df['year'].min(), df['year'].max() + 1):
    for month in range(1, 13, 3):  # Entrenar cada 3 meses
        # Filtrar los datos para el año y mes actual
        X_train_period = X_train_full[(X_train_full['year'] == year) & (X_train_full['month'].between(month, month + 2))]
        y_train_period = y_train_full[X_train_period.index]

        # Asegurarse de que no hay datos vacíos
        if len(X_train_period) == 0:
            continue

        # Entrenar el modelo LightGBM para este período de tiempo
        print(f"Training for year {year}, month {month}-{month + 2}")
        model, accuracy, precision, recall, f1, roc_auc, conf_matrix = train_lightgbm(X_train_period, y_train_period, X_test, y_test, model)

        # Acumular métricas
        total_accuracy += accuracy
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_roc_auc += roc_auc
        confusion_matrices.append(conf_matrix)

        # Incrementar el contador de períodos
        period_count += 1

        # Imprimir métricas para este período
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"ROC AUC Score: {roc_auc:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")

# Calcular las métricas finales promedio
final_accuracy = total_accuracy / period_count
final_precision = total_precision / period_count
final_recall = total_recall / period_count
final_f1 = total_f1 / period_count
final_roc_auc = total_roc_auc / period_count

# Sumar las matrices de confusión
final_confusion_matrix = np.sum(confusion_matrices, axis=0)

# Imprimir las métricas finales
print("\nFinal Metrics:")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"F1 Score: {final_f1:.4f}")
print(f"ROC AUC Score: {final_roc_auc:.4f}")
print(f"Confusion Matrix:\n{final_confusion_matrix}")

Training for year 2019, month 1-3
Accuracy: 0.9979
Precision: 0.7611
Recall: 0.8508
F1 Score: 0.8034
ROC AUC Score: 0.9968
Confusion Matrix:
[[552109    761]
 [   425   2424]]
Training for year 2019, month 4-6
Accuracy: 0.9984
Precision: 0.8288
Recall: 0.8736
F1 Score: 0.8506
ROC AUC Score: 0.9986
Confusion Matrix:
[[552356    514]
 [   360   2489]]
Training for year 2019, month 7-9
Accuracy: 0.9986
Precision: 0.8515
Recall: 0.8754
F1 Score: 0.8633
ROC AUC Score: 0.9976
Confusion Matrix:
[[552435    435]
 [   355   2494]]
Training for year 2019, month 10-12
Accuracy: 0.9924
Precision: 0.3884
Recall: 0.8347
F1 Score: 0.5301
ROC AUC Score: 0.9689
Confusion Matrix:
[[549125   3745]
 [   471   2378]]
Training for year 2020, month 1-3
Accuracy: 0.9929
Precision: 0.4064
Recall: 0.8326
F1 Score: 0.5462
ROC AUC Score: 0.9804
Confusion Matrix:
[[549406   3464]
 [   477   2372]]
Training for year 2020, month 4-6
Accuracy: 0.9712
Precision: 0.1369
Recall: 0.8691
F1 Score: 0.2365
ROC AUC Score: 0.

## Entrenamiento Completo

In [22]:
# División de datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Balanceo de datos en el conjunto de entrenamiento utilizando SMOTE
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Creación del dataset de LightGBM
train_data = lgb.Dataset(X_train_res, label=y_train_res)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Parámetros del modelo LightGBM
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Entrenamiento del modelo LightGBM
model = lgb.train(params, train_data, num_boost_round=100, valid_sets=[valid_data])

# Predicción en el conjunto de prueba
X_test_scaled = scaler.transform(X_test)
y_pred_proba = model.predict(X_test_scaled, num_iteration=model.best_iteration)
y_pred = (y_pred_proba > 0.5).astype(int)

# Evaluación del modelo
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

# Calcular curva precision-recall y área bajo la curva (AUC)
precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall_curve, precision_curve)

# Imprimir métricas de rendimiento
print("Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"PR AUC Score: {pr_auc:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")

Metrics:
Accuracy: 0.9951
Precision: 0.5137
Recall: 0.9800
F1 Score: 0.6741
ROC AUC Score: 0.9989
PR AUC Score: 0.9573
Confusion Matrix:
[[550227   2643]
 [    57   2792]]
