# Proyecto Final - Security Data Science

## Oscar Fernando López Barrios
## Carné 20679

In [11]:
# Importar librerias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

In [12]:
# Elegir la GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [13]:
# Montar Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Asignar el documento con las variables realizadas
df = pd.read_csv('/content/drive/My Drive/SDS/Proyecto-Final-SDS/fraud_detection_dataset.csv')

In [15]:
# Mostrar dataframe
df

Unnamed: 0,amt,is_fraud,amt_month_shopping_net_spend,count_month_shopping_net,first_time_at_merchant,hour,minute,second,day,month,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,7.27,0,0.00,0.0,True,12,47,15,1,1,...,0,0,0,0,1,0,0,0,0,0
1,52.94,0,0.00,0.0,True,8,44,57,2,1,...,0,0,0,0,0,0,0,0,0,0
2,82.08,0,0.00,0.0,True,8,47,36,2,1,...,0,0,0,0,0,0,0,0,0,0
3,34.79,0,0.00,0.0,True,12,38,14,2,1,...,0,0,0,1,0,0,0,0,0,0
4,27.18,0,0.00,0.0,True,13,10,46,2,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,66.11,0,160.56,11.0,False,2,10,10,31,12,...,0,0,0,0,0,0,0,0,0,0
1852390,4.58,0,160.56,11.0,False,5,13,35,31,12,...,0,0,0,0,1,0,0,0,0,0
1852391,95.96,0,160.56,11.0,False,11,15,44,31,12,...,0,0,0,0,0,0,0,0,0,0
1852392,149.48,0,160.56,11.0,False,11,17,25,31,12,...,1,0,0,0,0,0,0,0,0,0


In [16]:
# Mostrar los tipos
df.dtypes

amt                             float64
is_fraud                          int64
amt_month_shopping_net_spend    float64
count_month_shopping_net        float64
first_time_at_merchant             bool
                                 ...   
category_misc_pos                 int64
category_personal_care            int64
category_shopping_net             int64
category_shopping_pos             int64
category_travel                   int64
Length: 64, dtype: object

In [17]:
# Mostrar el conteo
fraud_counts = df['is_fraud'].value_counts()
print(fraud_counts)

is_fraud
0    1842743
1       9651
Name: count, dtype: int64


In [18]:
# Eliminar los valores nulos
df.fillna(0, inplace=True)

In [9]:
# Separar características y objetivo
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

In [10]:
def train_neural_network(X_train, y_train, X_test, y_test, model=None):
    # Escalar las características para mejorar el rendimiento del modelo
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Aplicar SMOTE
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

    # Definir el modelo de red neuronal
    if model is None:
        model = Sequential([
            Dense(256, activation='relu', input_shape=(X_train_res.shape[1],)),
            BatchNormalization(),
            Dropout(0.5),
            Dense(128, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
            Dense(64, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
            Dense(32, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
            Dense(1, activation='sigmoid')
        ])
        # Compilar el modelo
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

    # Definir el callback de detención temprana
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Entrenar el modelo
    model.fit(X_train_res, y_train_res, epochs=10, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

    # Evaluar el modelo en el conjunto de prueba
    y_pred_proba = model.predict(X_test_scaled)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calcular métricas de rendimiento
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    return model, accuracy, precision, recall, f1, roc_auc, conf_matrix

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Variables para almacenar las métricas acumuladas
total_accuracy = 0
total_precision = 0
total_recall = 0
total_f1 = 0
total_roc_auc = 0
confusion_matrices = []

# Contador de períodos
period_count = 0

# Inicializar el modelo
model = None

# Loop para entrenar de manera incremental por cada 3 meses
for year in range(df['year'].min(), df['year'].max() + 1):
    for month in range(1, 13, 3):  # Entrenar cada 3 meses
        # Filtrar los datos para el año y mes actual
        X_train_period = X_train_full[(X_train_full['year'] == year) & (X_train_full['month'].between(month, month + 2))]
        y_train_period = y_train_full[X_train_period.index]

        # Asegurarse de que no hay datos vacíos
        if len(X_train_period) == 0:
            continue

        # Entrenar la red neuronal para este período de tiempo
        print(f"Training for year {year}, month {month}-{month + 2}")
        model, accuracy, precision, recall, f1, roc_auc, conf_matrix = train_neural_network(X_train_period, y_train_period, X_test, y_test, model)

        # Acumular métricas
        total_accuracy += accuracy
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_roc_auc += roc_auc
        confusion_matrices.append(conf_matrix)

        # Incrementar el contador de períodos
        period_count += 1

        # Imprimir métricas para este período
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"ROC AUC Score: {roc_auc:.4f}")
        print(f"Confusion Matrix:\n{conf_matrix}")

# Calcular las métricas finales promedio
final_accuracy = total_accuracy / period_count
final_precision = total_precision / period_count
final_recall = total_recall / period_count
final_f1 = total_f1 / period_count
final_roc_auc = total_roc_auc / period_count

# Sumar las matrices de confusión
final_confusion_matrix = np.sum(confusion_matrices, axis=0)

# Imprimir las métricas finales
print("\nFinal Metrics:")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"F1 Score: {final_f1:.4f}")
print(f"ROC AUC Score: {final_roc_auc:.4f}")
print(f"Confusion Matrix:\n{final_confusion_matrix}")

Training for year 2019, month 1-3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.9963
Precision: 0.9102
Recall: 0.3096
F1 Score: 0.4620
ROC AUC Score: 0.7322
Confusion Matrix:
[[552783     87]
 [  1967    882]]
Training for year 2019, month 4-6
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Accuracy: 0.9974
Precision: 0.9076
Recall: 0.5553
F1 Score: 0.6890
ROC AUC Score: 0.8877
Confusion Matrix:
[[552709    161]
 [  1267   1582]]
Training for year 2019, month 7-9
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.9977
Precision: 0.9549
Recall: 0.5792
F1 Score: 0.7210
ROC AUC Score: 0.9263
Confusion Matrix:
[[552792     78]
 [  1199   1650]]
Training for year 2019, month 10-12
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Accuracy: 0.9977
Precision: 0.9294
Recall: 0.5911