In [18]:
import pandas as pd
from joblib import Parallel, delayed

In [19]:
# Load the data

DATABASE_PATH = 'data/model_fraud.csv'
df = pd.read_csv(DATABASE_PATH)

In [20]:
# Copy of the original dataframe to compare the results

df_original = df.copy()


In [21]:
#ENRICHMENT
# Create a new column with the difference between the initial balance and the amount of the transaction

df['deltaOrigen'] = (df['saldoInicialOrigen'] -df['monto']).abs()

In [22]:
# ALGORITHM FOR RETIROS
# first rule: if the transaction is a retiro, check if there is a transfer with the same amount in the last 3 time units

def check_for_retiro(row, df:pd.DataFrame  ):
    tiempo = row['unidadTiempo'] 
    tipo_transaccion=row['tipoTransaccion']
    monto_transaccion = row['monto']
    saldo_inicial_origen = row['saldoInicialOrigen']
    
    if tipo_transaccion == 'Retiro':    
        df = df[df['tipoTransaccion'] == 'Transferencia']
        df = df[(df['unidadTiempo'] == tiempo) | (df['unidadTiempo'] == tiempo-1) | (df['unidadTiempo'] == tiempo-2)]
        df = df[df['monto'] == monto_transaccion]
        
        if len(df) > 0:
            return 1
        
    return 0

        

In [24]:
# ALGORITHM FOR TRANSFERENCIAS
# first rule: if the transaction is a transferencia, check if the deltaOrigen is 0 

def check_for_transferencia(row):
    tipo_transaccion=row['tipoTransaccion']
    delta_origen = row['deltaOrigen']
    saldo_inicial_destinatario = row['saldoInicialDestinatario']
    saldo_final_destinatario = row['saldoFinalDestinatario']
    saldo_final_origen = row['saldoFinalOrigen']
    
    
    if tipo_transaccion == 'Transferencia':
        if delta_origen == 0:
            #if saldo_inicial_destinatario == 0 and saldo_final_destinatario == 0 and saldo_final_origen == 0:
                return 1
    
    return 0

In [25]:
# NEW COLUMN WITH THE RESULTS OF THE ALGORITHMS

df['marca_fraude_proyectada'] = 0

In [36]:
import dask.dataframe as dd
from dask.distributed import Client

# Crear un cliente Dask para distribuir el procesamiento
client = Client()

# Convertir el DataFrame de Pandas a un DataFrame de Dask
ddf = dd.from_pandas(df, npartitions=100)  # Puedes ajustar el número de particiones según el tamaño de tu DataFrame

# Aplicar la función check_for_transferencia a cada fila en paralelo
ddf['marca_fraude_proyectada'] = ddf.apply(lambda row: check_for_transferencia(row), axis=1, meta=('marca_fraude_proyectada', 'float'))

# Calcular el número total de fraudes proyectados para Transferencias
total_frauds = ddf['marca_fraude_proyectada'].sum().compute()

# Filtrar las discrepancias para Transferencias
df_discrepancias_transferencia = ddf[(ddf['marca_fraude'] != ddf['marca_fraude_proyectada']) & (ddf['tipoTransaccion'] == 'Transferencia')]

# Imprimir resultados
print('Número de fraudes proyectados para Transferencias:', total_frauds)
print('Número de discrepancias para Transferencias:', len(df_discrepancias_transferencia))

# Cerrar el cliente

Perhaps you already have a cluster running?
Hosting the HTTP server on port 57928 instead


KeyboardInterrupt: 

In [17]:
# Run the algorithms for 'Transferencia' for each row in the dataframe

i = 0
for index, row in df.iterrows():
    marca: float = check_for_transferencia(row)
    df.at[index, 'marca_fraude_proyectada'] = marca
    i += marca

df_discrepancias_transferencia = df[(df['marca_fraude'] != df['marca_fraude_proyectada']) & (df['tipoTransaccion'] == 'Transferencia')]    

print('number of frauds projected for Transferencias:', i)   
print('number of discrepancies for Transferencias:', len(df_discrepancias_transferencia))

KeyboardInterrupt: 

In [9]:
# Run the algorithms for 'Retiro' for each row in the dataframe

i = 0
for index, row in df.iterrows():
    marca = check_for_retiro(row, df )
    df.at[index, 'marca_fraude_proyectada'] = marca
    i += marca
     
df_discrepancias_retiro = df[(df['marca_fraude'] != df['marca_fraude_proyectada']) & (df['tipoTransaccion'] == 'Retiro')] 

print('number of frauds projected for Retiros:', i)
print('number of discrepancies for Retiros:', len(df_discrepancias_retiro))

number of frauds projected for Retiros: 33
number of discrepancies for Retiros: 2
