## Extraer y explorar el dataset

In [78]:
# Importar librerías
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn import preprocessing
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path 

In [3]:
# Descargar el dataset de kaggle desde la URL de origen del dataset (https://www.kaggle.com/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022). Importante tener la api de kaggle para que te permite descargar.
destination_folder = '../data/raw'
#Asegurar que la carpeta destino existe
os.makedirs(destination_folder, exist_ok=True)

os.system('kaggle datasets download -d "sgpjesus/bank-account-fraud-dataset-neurips-2022" -p "../data/raw" && unzip ../data/raw/bank-account-fraud-dataset-neurips-2022.zip -d "../data/raw" && rm ../data/raw/bank-account-fraud-dataset-neurips-2022.zip')

Dataset URL: https://www.kaggle.com/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022
License(s): CC-BY-NC-SA-4.0
Downloading bank-account-fraud-dataset-neurips-2022.zip to ../data/raw


 50%|████▉     | 265M/532M [12:25<13:43, 340kB/s]  

In [61]:
# Usaremos el dataset Base.csv para el experimento
pd.options.display.max_columns = None
df = pd.read_csv('../data/raw/Base.csv')
df.head(5)

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,employment_status,credit_risk_score,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,13096.035018,7850.955007,6742.080561,5,5,CB,163,1,BC,0,1,9,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,9223.283431,5745.251481,5941.664859,3,18,CA,154,1,BC,1,1,2,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,4471.472149,5471.988958,5992.555113,15,11,CA,89,1,BC,0,1,30,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.4751,11,14,30,0.006991,-1.863101,AB,3483,14431.993621,6755.344479,5970.336831,11,13,CA,90,1,BC,0,1,1,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,7601.511579,5124.04693,5940.734212,1,6,CA,91,0,BC,1,1,26,0,200.0,0,INTERNET,3.743048,other,0,1,0,0


In [62]:
# verificamos la cant de filas y columnas
print("Cantidad de filas: {}".format(df.shape[0]))
print("Cantidad de Columnas: {}".format(df.shape[1]))

Cantidad de filas: 1000000
Cantidad de Columnas: 32


In [63]:
df.dtypes

fraud_bool                            int64
income                              float64
name_email_similarity               float64
prev_address_months_count             int64
current_address_months_count          int64
customer_age                          int64
days_since_request                  float64
intended_balcon_amount              float64
payment_type                         object
zip_count_4w                          int64
velocity_6h                         float64
velocity_24h                        float64
velocity_4w                         float64
bank_branch_count_8w                  int64
date_of_birth_distinct_emails_4w      int64
employment_status                    object
credit_risk_score                     int64
email_is_free                         int64
housing_status                       object
phone_home_valid                      int64
phone_mobile_valid                    int64
bank_months_count                     int64
has_other_cards                 

Los valores de las columnas son de distintos tipos, veamos en seguida en el preprocesamiento como podemos solucionar este problema

## Preprocesamiento

In [64]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   prev_address_months_count         1000000 non-null  int64  
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            1000000 non-null  float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      1000000 non-null  int64  
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

Se puede observar que no posee valores nulos a simple vista, pero tenemos que tener en cuenta que en la definición del conjunto de datos como los autores han definido los valores nulos son representados por "-1", en algunas columnas. Para más información consultar https://github.com/feedzai/bank-account-fraud/blob/main/documents/datasheet.pdf

In [65]:
# Obteniendo la cantidad de memoria usada en cada columna en megabyte
df.memory_usage(deep=True) * 1e-6

Index                                0.000132
fraud_bool                           8.000000
income                               8.000000
name_email_similarity                8.000000
prev_address_months_count            8.000000
current_address_months_count         8.000000
customer_age                         8.000000
days_since_request                   8.000000
intended_balcon_amount               8.000000
payment_type                        51.000000
zip_count_4w                         8.000000
velocity_6h                          8.000000
velocity_24h                         8.000000
velocity_4w                          8.000000
bank_branch_count_8w                 8.000000
date_of_birth_distinct_emails_4w     8.000000
employment_status                   51.000000
credit_risk_score                    8.000000
email_is_free                        8.000000
housing_status                      51.000000
phone_home_valid                     8.000000
phone_mobile_valid                

In [66]:
for col in df.columns:
    df[col] = df[col].replace(-1, np.nan)
    if col == "intended_balcon_amount":
        df[col] = df[col].apply(lambda x: np.nan if x < 0 else x)
    number_MV = df[col].isna().sum()
    if number_MV > 0:
        print('{}: {}'.format(col, number_MV))



prev_address_months_count: 712920
current_address_months_count: 4254
intended_balcon_amount: 742523
credit_risk_score: 488
bank_months_count: 253635
session_length_in_minutes: 2015
device_distinct_emails_8w: 359


Ya podemos ver los atributos que faltan valores, itentaremos rellenar esos valores aplicando técnicas de imputación por la mediana. En los casos en que la cantidad de valores en falta sea más del 60% ( "prev_address_months_count" y "intended_balcon_amount" ), mejor optamos por eliminar este atributo para que no influya en el sobreajuste de los datos a la hora de rellenarlo.

In [67]:
# Eliminar las variables "prev_address_months_count" y "intended_balcon_amount" del conjunto de datos
df.drop(columns=['prev_address_months_count','intended_balcon_amount'],inplace=True)
Y = df['fraud_bool']
df.drop(columns=['fraud_bool'],inplace=True)


Ahora vamos a rellenar los NAN con la mediana que corresponde a cada columna

In [68]:
# Identificar las columnas númericas
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# filtrar las columnas con valores nan
nan_columns = [col for col in numerical_cols if df[col].isna().any()]
for col in nan_columns:
    df[col] = df[col].fillna(df[col].median())

In [69]:
for col in df.columns:
    df[col] = df[col].replace(-1, np.nan)
    if col == "intended_balcon_amount":
        df[col] = df[col].apply(lambda x: np.nan if x < 0 else x)
    number_MV = df[col].isna().sum()
    if number_MV > 0:
        print('{}: {}'.format(col, number_MV))
    else:
        print("Todas las columnas tienen valores")
        break

Todas las columnas tienen valores


Vamos hacer la transformación de las variables categóricas con el método one_hot_encoding de sklearn

In [70]:
label_encoder = preprocessing.LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col]= label_encoder.fit_transform(df[col])
    print("Transformación de los valores {}".format({l: i for i, l in enumerate(label_encoder.classes_)}))
    


Transformación de los valores {'AA': 0, 'AB': 1, 'AC': 2, 'AD': 3, 'AE': 4}
Transformación de los valores {'CA': 0, 'CB': 1, 'CC': 2, 'CD': 3, 'CE': 4, 'CF': 5, 'CG': 6}
Transformación de los valores {'BA': 0, 'BB': 1, 'BC': 2, 'BD': 3, 'BE': 4, 'BF': 5, 'BG': 6}
Transformación de los valores {'INTERNET': 0, 'TELEAPP': 1}
Transformación de los valores {'linux': 0, 'macintosh': 1, 'other': 2, 'windows': 3, 'x11': 4}


In [71]:
df.head(5)

Unnamed: 0,income,name_email_similarity,current_address_months_count,customer_age,days_since_request,payment_type,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,employment_status,credit_risk_score,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0.3,0.986506,25.0,40,0.006735,0,1059,13096.035018,7850.955007,6742.080561,5,5,1,163.0,1,2,0,1,9.0,0,1500.0,0,0,16.224843,0,1,1.0,0,0
1,0.8,0.617426,89.0,20,0.010095,3,1658,9223.283431,5745.251481,5941.664859,3,18,0,154.0,1,2,1,1,2.0,0,1500.0,0,0,3.363854,2,1,1.0,0,0
2,0.8,0.996707,14.0,40,0.012316,1,1095,4471.472149,5471.988958,5992.555113,15,11,0,89.0,1,2,0,1,30.0,0,200.0,0,0,22.730559,3,0,1.0,0,0
3,0.6,0.4751,14.0,30,0.006991,1,3483,14431.993621,6755.344479,5970.336831,11,13,0,90.0,1,2,0,1,1.0,0,200.0,0,0,15.215816,0,1,1.0,0,0
4,0.9,0.842307,29.0,40,5.742626,0,2339,7601.511579,5124.04693,5940.734212,1,6,0,91.0,0,2,1,1,26.0,0,200.0,0,0,3.743048,2,0,1.0,0,0


Ahora que tenemos las columnas con los valores númericos vamos a normalizar esos valores

In [72]:
scaler = RobustScaler()
X = scaler.fit_transform(df)


In [34]:
X.dtype

dtype('float64')

In [74]:
# pasar los datos de vuelta al formato csv.
preprocess_df = pd.DataFrame(X,columns=df.columns)

In [75]:
Y

0         0
1         0
2         0
3         0
4         0
         ..
999995    0
999996    0
999997    0
999998    0
999999    0
Name: fraud_bool, Length: 1000000, dtype: int64

In [76]:
preprocess_df.insert(29, "fraud_class", Y, True)

In [77]:
preprocess_df.head(5)

Unnamed: 0,income,name_email_similarity,current_address_months_count,customer_age,days_since_request,payment_type,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,employment_status,credit_risk_score,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month,fraud_class
0,-0.6,0.932126,-0.254545,0.5,-0.441039,-0.5,-0.194286,1.832144,1.436066,1.499239,-0.166667,-0.571429,1.0,0.431579,0.0,0.0,0.0,0.0,-0.3,0.0,4.333333,0.0,0.0,1.932912,-0.666667,0.0,0.0,0.0,-0.75,0
1,0.4,0.236209,0.327273,-0.5,-0.265482,1.0,0.37619,0.919696,0.46093,0.843007,-0.25,1.285714,0.0,0.336842,0.0,0.0,1.0,0.0,-0.65,0.0,4.333333,0.0,0.0,-0.306246,0.0,0.0,0.0,0.0,-0.75,0
2,0.4,0.95136,-0.354545,0.5,-0.149413,0.0,-0.16,-0.199865,0.334384,0.88473,0.25,0.285714,0.0,-0.347368,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.0,3.065588,0.333333,-1.0,0.0,0.0,-0.75,0
3,0.0,-0.032153,-0.354545,0.0,-0.427691,0.0,2.114286,2.146906,0.928697,0.866514,0.083333,0.571429,0.0,-0.336842,0.0,0.0,0.0,0.0,-0.7,0.0,0.0,0.0,0.0,1.757236,-0.666667,0.0,0.0,0.0,-0.75,0
4,0.6,0.660231,-0.218182,0.5,299.279827,-0.5,1.024762,0.537595,0.173255,0.842244,-0.333333,-0.428571,0.0,-0.326316,-1.0,0.0,1.0,0.0,0.55,0.0,0.0,0.0,0.0,-0.240227,0.0,-1.0,0.0,0.0,-0.75,0


In [79]:
filepath = Path('../data/Preprocessed/preprocessed_data.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
preprocess_df.to_csv(filepath)